### 1. Create pandas data frame by hard coding

In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
# Hard-coded sample data, written as one list of values per column.
# Every value (ids and dates included) is still plain text at this point.
df = pd.DataFrame({
    'customer_id': ['001', '001', '002', '002', '002'],
    'start_date': ['2019-01-01', '2019-01-01', '2019-05-01', '2019-05-01', '2019-05-01'],
    'end_date': ['2019-12-01', '2019-12-01', '2020-07-31', '2020-07-31', '2020-07-31'],
    'product_id': ['prod1', 'prod2', 'prod1', 'prod3', 'prod4'],
    'product_name': ['prod1_name', 'prod2_name', 'prod1_name', 'prod3_name', 'prod4_name'],
})
df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name
3,2,2019-05-01,2020-07-31,prod3,prod3_name
4,2,2019-05-01,2020-07-31,prod4,prod4_name


### 2. Get schema from pandas data frame

In [2]:
# Print the schema of the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
customer_id     5 non-null object
start_date      5 non-null object
end_date        5 non-null object
product_id      5 non-null object
product_name    5 non-null object
dtypes: object(5)
memory usage: 328.0+ bytes


### 3. Filter functions in pandas

#### filter rows with the `loc` indexer (loc stands for location; it selects by index label)

In [3]:
# Select rows by index label, from the first row through label 2.
# Unlike ordinary Python slicing, a .loc slice includes its end label, so three rows come back.
df.loc[:2]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name


#### filter columns with square brackets

In [4]:
# Pass a list of column names inside the brackets to keep only those columns
df[['start_date','end_date']]

Unnamed: 0,start_date,end_date
0,2019-01-01,2019-12-01
1,2019-01-01,2019-12-01
2,2019-05-01,2020-07-31
3,2019-05-01,2020-07-31
4,2019-05-01,2020-07-31


#### filter datetime columns

In [5]:
# Convert the date columns from strings (object dtype) to datetime64 so they can be
# compared and filtered as real dates.
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
# df.info() prints the schema itself and returns None, so it is called directly:
# wrapping it in print() would add a stray "None" line after the summary.
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
customer_id     5 non-null object
start_date      5 non-null datetime64[ns]
end_date        5 non-null datetime64[ns]
product_id      5 non-null object
product_name    5 non-null object
dtypes: datetime64[ns](2), object(3)
memory usage: 328.0+ bytes
None


Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name


In [6]:
# Filter rows with a boolean mask on start_date (column accessed as an attribute);
# the date string is compared against the datetime column
df[df.start_date>= '2019-02-03']

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name
3,2,2019-05-01,2020-07-31,prod3,prod3_name
4,2,2019-05-01,2020-07-31,prod4,prod4_name


In [7]:
# Alternative: access the column by its quoted name inside brackets; the result is the
# same as with attribute access
df[df['start_date']>= '2019-02-03']

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name
3,2,2019-05-01,2020-07-31,prod3,prod3_name
4,2,2019-05-01,2020-07-31,prod4,prod4_name


#### text/string filter

In [8]:
# Filter rows whose column value contains the given substring.
# case = False makes the match case-insensitive (default = True, i.e. case-sensitive).
# na = False treats missing values as "no match" (by default they stay missing in the mask,
#      and a mask containing missing values cannot be used for boolean indexing).
# regex = False treats the pattern as a literal string (default = True, i.e. the pattern is
#      interpreted as a regular expression).
df[df.product_id.str.contains('1',case=True,na=False,regex=False)]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name


In [9]:
# Filter with a regular expression pattern.
# More detail about regular expressions in Python is at the URL printed below.
print('https://docs.python.org/3/library/re.html')
print('Explanation for re pattern: 3 lowercase letters followed by 1 number from 1 to 2')
# The pattern may match anywhere inside the value, so 'prod1' qualifies through 'rod1'
df[df.product_id.str.contains('[a-z]{3}[1-2]',regex=True)]

https://docs.python.org/3/library/re.html
Explanation for re pattern: 3 lowercase letters followed by 1 number from 1 to 2


Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name


In [10]:
# Keep rows whose product_id starts with the prefix 'pr' (every row in this sample)
df[df.product_id.str.startswith('pr')]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name
3,2,2019-05-01,2020-07-31,prod3,prod3_name
4,2,2019-05-01,2020-07-31,prod4,prod4_name


In [11]:
# Keep rows whose product_id ends with the suffix '2'
df[df.product_id.str.endswith('2')]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name


### 4. Create more columns

In [12]:
# Conditional column, similar to a SQL CASE ... WHEN statement: each .loc assignment
# writes its label into customer_category for the rows selected by the mask.
# The first assignment creates the column; the second fills in the remaining rows.
df.loc[df.start_date<'2019-02-03','customer_category'] = 'old customer'
df.loc[df.start_date>='2019-02-03','customer_category'] = 'new customer'
df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old customer
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old customer
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new customer
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new customer
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new customer


In [13]:
# Hard-code a new column: transaction1 starts at 0 for every row, then the rows of
# selected products are overwritten
df['transaction1']=0
df.loc[df.product_id =='prod1','transaction1'] = 10
df.loc[df.product_id =='prod2','transaction1'] = 20
# transaction2 does not exist yet and is given no default value, so every row that is
# not matched by this mask is left as NaN
df.loc[df.product_id =='prod3','transaction2'] = 30 
df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,transaction1,transaction2
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old customer,10,
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old customer,20,
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new customer,10,
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new customer,0,30.0
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new customer,0,


In [14]:
# Keep only the rows whose transaction2 value is missing (NaN)
df[df['transaction2'].isna()]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,transaction1,transaction2
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old customer,10,
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old customer,20,
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new customer,10,
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new customer,0,


In [15]:
# Replace NaN with fillna. The dictionary maps column name -> fill value, so several
# columns could be filled in one call; only transaction2 is filled here.
df = df.fillna(value={'transaction2':0})
df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,transaction1,transaction2
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old customer,10,0.0
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old customer,20,0.0
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new customer,10,0.0
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new customer,0,30.0
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new customer,0,0.0


In [19]:
# Create dummy (one-hot) columns, one per distinct customer_category value.
# NOTE: the prefix 'group_' already ends with an underscore and get_dummies adds its own
# separator, so the resulting names contain a double underscore ('group__new customer').
df1 = pd.get_dummies(df['customer_category'],prefix = 'group_')
df1

Unnamed: 0,group__new customer,group__old customer
0,0,1
1,0,1
2,1,0
3,1,0
4,1,0


### 5. Concat function

In [45]:
# Two ways to embed an image in a Jupyter markdown cell: markdown image syntax, or a raw
# <img> tag when the size has to be controlled. Note: images embedded this way are only
# shown inside Jupyter, not in the git repository view.
# CSS declarations are written property:value, hence width:200px;height:100px
# (with "=" the declarations are invalid and the size is ignored).
print('![](https://i.stack.imgur.com/dcoE3.jpg)')
print('<img src= "https://i.stack.imgur.com/dcoE3.jpg" style= "width:200px;height:100px"/>')

![](https://i.stack.imgur.com/dcoE3.jpg)
<img src= "https://i.stack.imgur.com/dcoE3.jpg" style= "width=200px;height=100px"/>


__We can concatenate dataframes along the vertical axis (stacking rows, `axis=0`) or the horizontal axis (adding columns side by side, `axis=1`)__

In [48]:
from IPython.display import display, HTML
# Render the image as the HTML output of a code cell.
# CSS declarations are written property:value, so the size is given as
# width:200px;height:100px (with "=" the declarations are invalid and the browser
# ignores them, leaving the image at its natural size).
display(HTML('<img src= "https://i.stack.imgur.com/dcoE3.jpg" style= "width:200px;height:100px"/>'))

In [20]:
# Horizontal (column-wise) concatenation: axis=1 places the frames side by side,
# matching rows by index label. The transaction columns are dropped from df first.
df2 = pd.concat([df.drop(columns = ['transaction1','transaction2']),df1],axis =1)
df2

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,group__new customer,group__old customer
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old customer,0,1
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old customer,0,1
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new customer,1,0
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new customer,1,0
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new customer,1,0


In [30]:
# Build a second frame (customer 003) with the same columns as df so that the two
# frames can be concatenated
df3 = pd.DataFrame({
    'customer_id': ['003', '003'],
    'start_date': ['2019-02-05', '2019-02-05'],
    'end_date': ['2020-01-08', '2020-01-08'],
    'product_id': ['prod2', 'prod4'],
    'product_name': ['prod2_name', 'prod4_name'],
})
# Parse both date columns from text into datetime values
for date_column in ('start_date', 'end_date'):
    df3[date_column] = pd.to_datetime(df3[date_column])
# Label every row by comparing its start_date with the 2019-02-03 cut-off
started_before_cutoff = df3['start_date'] < '2019-02-03'
started_on_or_after_cutoff = df3['start_date'] >= '2019-02-03'
df3.loc[started_before_cutoff, 'customer_category'] = 'old customer'
df3.loc[started_on_or_after_cutoff, 'customer_category'] = 'new customer'
df3

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category
0,3,2019-02-05,2020-01-08,prod2,prod2_name,new customer
1,3,2019-02-05,2020-01-08,prod4,prod4_name,new customer


In [31]:
# Vertical (row-wise) concatenation: axis=0 stacks the rows of df3 below the rows of df.
# Each frame keeps its own index labels, so labels 0 and 1 appear twice in the result.
combined_df = pd.concat([df[df3.columns],df3],axis =0) # keep only the columns of df that df3 also has
combined_df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old customer
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old customer
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new customer
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new customer
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new customer
0,3,2019-02-05,2020-01-08,prod2,prod2_name,new customer
1,3,2019-02-05,2020-01-08,prod4,prod4_name,new customer


### 5. Group functions in pandas

In [42]:
# Group the combined data: one output row per (customer_id, start_date, end_date).
# - The grouping keys are passed as a *list*: pandas treats a tuple as one single key
#   and raises KeyError, because no column is named by that tuple.
# - list collects the product ids / names of each group into one list per group.
# - len counts the rows of each group; that column is then renamed to product_count
#   using a dictionary {old name: new name}.
# - reset_index() turns the grouping keys back into ordinary columns and returns a new
#   frame, so re-running the cell always gives the same result.
df_group = (
    combined_df
    .groupby(['customer_id', 'start_date', 'end_date'])
    .agg({'product_id': list,
          'product_name': list,
          'customer_category': len})
    .rename(columns={'customer_category': 'product_count'})
    .reset_index()
)
df_group

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,product_count
0,1,2019-01-01,2019-12-01,"[prod1, prod2]","[prod1_name, prod2_name]",2
1,2,2019-05-01,2020-07-31,"[prod1, prod3, prod4]","[prod1_name, prod3_name, prod4_name]",3
2,3,2019-02-05,2020-01-08,"[prod2, prod4]","[prod2_name, prod4_name]",2


In [43]:
# Rename every column at once by assigning a list with one new name per existing
# column, in column order
df_group.columns =['customerId','startDate','endDate','productId','productName','productCount']
df_group

Unnamed: 0,customerId,startDate,endDate,productId,productName,productCount
0,1,2019-01-01,2019-12-01,"[prod1, prod2]","[prod1_name, prod2_name]",2
1,2,2019-05-01,2020-07-31,"[prod1, prod3, prod4]","[prod1_name, prod3_name, prod4_name]",3
2,3,2019-02-05,2020-01-08,"[prod2, prod4]","[prod2_name, prod4_name]",2


### 6. Transpose function