### 1. Create pandas data frame by hard coding

In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

Use `zip` and `columns=` to create dataframe

In [2]:
df = pd.DataFrame(zip(['001','001','002','002','002'],
                      ['2019-01-01','2019-01-01','2019-05-01','2019-05-01','2019-05-01'],
                      ['2019-12-01','2019-12-01','2020-07-31','2020-07-31','2020-07-31'],
                      ['prod1','prod2','prod1','prod3','prod4'],
                      ['prod1_name','prod2_name','prod1_name','prod3_name','prod4_name']),
                  columns = ['customer_id', 'start_date','end_date','product_id','product_name'])
df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name
3,2,2019-05-01,2020-07-31,prod3,prod3_name
4,2,2019-05-01,2020-07-31,prod4,prod4_name


Use `dictionary` to create dataframe

In [3]:
data = {'customer_id':['001','001','002','002','002'],
        'start_date':['2019-01-01','2019-01-01','2019-05-01','2019-05-01','2019-05-01'],
        'end_date':['2019-12-01','2019-12-01','2020-07-31','2020-07-31','2020-07-31'],
        'product_id':['prod1','prod2','prod1','prod3','prod4'],
        'product_name':['prod1_name','prod2_name','prod1_name','prod3_name','prod4_name']}
df = pd.DataFrame(data)
df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name
3,2,2019-05-01,2020-07-31,prod3,prod3_name
4,2,2019-05-01,2020-07-31,prod4,prod4_name


#Read file from local csv

df = pd.read_csv('url_path/filename.csv',delimiter=',')

### 2. Get schema from pandas data frame

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
customer_id     5 non-null object
start_date      5 non-null object
end_date        5 non-null object
product_id      5 non-null object
product_name    5 non-null object
dtypes: object(5)
memory usage: 328.0+ bytes


### 3. Filter functions in pandas

#### filter rows with loc function (loc for location)

In [5]:
#get index row upto 2
df.loc[:2]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name


#### filter columns with square brackets

In [6]:
df[['start_date','end_date']]

Unnamed: 0,start_date,end_date
0,2019-01-01,2019-12-01
1,2019-01-01,2019-12-01
2,2019-05-01,2020-07-31
3,2019-05-01,2020-07-31
4,2019-05-01,2020-07-31


#### filter datetime columns

In [7]:
#Change date column from string type (text column) to datetime type
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
print(df.info())
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
customer_id     5 non-null object
start_date      5 non-null datetime64[ns]
end_date        5 non-null datetime64[ns]
product_id      5 non-null object
product_name    5 non-null object
dtypes: datetime64[ns](2), object(3)
memory usage: 328.0+ bytes
None


Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name


In [8]:
#filter using start_date as criterion
df[df.start_date>= '2019-02-03']

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name
3,2,2019-05-01,2020-07-31,prod3,prod3_name
4,2,2019-05-01,2020-07-31,prod4,prod4_name


In [9]:
#alternative method: using column name with quote inside bracket
df[df['start_date']>= '2019-02-03']

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name
3,2,2019-05-01,2020-07-31,prod3,prod3_name
4,2,2019-05-01,2020-07-31,prod4,prod4_name


#### text/string filter

- Filter if column contain any part of string with `str.contains()`.
- `case = True/False` for keep or ignore case-sensitive (default = True). 
- `na = True/False` for keep/skip na entry (default = True)
- `regex = True/False` for including or excluding regular expression pattern for filter text (default = False)

In [10]:
df[df.product_id.str.contains('1',case=True,na=False,regex=False)]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name


In [11]:
#Test with regular expression string
#More detail about regular expression in python: 
print('https://docs.python.org/3/library/re.html')
print('Explanation for re pattern: 3 lowercase letters followed by 1 number from 1 to 2')
df[df.product_id.str.contains('[a-z]{3}[1-2]',regex=True)]

https://docs.python.org/3/library/re.html
Explanation for re pattern: 3 lowercase letters followed by 1 number from 1 to 2


Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name


In [12]:
#starts with
df[df.product_id.str.startswith('pr')]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name
2,2,2019-05-01,2020-07-31,prod1,prod1_name
3,2,2019-05-01,2020-07-31,prod3,prod3_name
4,2,2019-05-01,2020-07-31,prod4,prod4_name


In [13]:
#ends with
df[df.product_id.str.endswith('2')]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name
1,1,2019-01-01,2019-12-01,prod2,prod2_name


### 4. Create more columns

In [14]:
#similar to sql case... when statement
df.loc[df.start_date<'2019-02-03','customer_category'] = 'old_customer'
df.loc[df.start_date>='2019-02-03','customer_category'] = 'new_customer'
df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new_customer
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer


In [15]:
#hard code new column (get transaction value default to 0)
df['transaction1']=0
df.loc[df.product_id =='prod1','transaction1'] = 10
df.loc[df.product_id =='prod2','transaction1'] = 20
#non-default value for new column will result to na value
df.loc[df.product_id =='prod3','transaction2'] = 30 
df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,transaction1,transaction2
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer,10,
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer,20,
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer,10,
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new_customer,0,30.0
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer,0,


In [16]:
#filter for na values
df[pd.isna(df['transaction2'])]

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,transaction1,transaction2
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer,10,
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer,20,
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer,10,
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer,0,


In [17]:
#fix na value with fillna and pass in a dictionary with multiple columns
df = df.fillna(value={'transaction2':0})
df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,transaction1,transaction2
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer,10,0.0
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer,20,0.0
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer,10,0.0
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new_customer,0,30.0
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer,0,0.0


In [18]:
#Create dummy columns
df1 = pd.get_dummies(df['customer_category'],prefix = 'group')
df1

Unnamed: 0,group_new_customer,group_old_customer
0,0,1
1,0,1
2,1,0
3,1,0
4,1,0


### 5. Concat functions

In [19]:
#Two ways to write an images to Jupyter in markdown. But this way only show image inside Jupyter, not in git
print('![](https://i.stack.imgur.com/dcoE3.jpg)')
print('<img src= "https://i.stack.imgur.com/dcoE3.jpg" style= "width=200px;height=100px"/>')

![](https://i.stack.imgur.com/dcoE3.jpg)
<img src= "https://i.stack.imgur.com/dcoE3.jpg" style= "width=200px;height=100px"/>


__We can concatenate dataframes by vertical or horizontal axis__

In [20]:
from IPython.display import display, HTML
display(HTML('<img src= "https://i.stack.imgur.com/dcoE3.jpg" style= "width=200px;height=100px"/>'))

In [21]:
#Vertical concatenate (axis =1)
df2 = pd.concat([df.drop(columns = ['transaction1','transaction2']),df1],axis =1)
df2

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,group_new_customer,group_old_customer
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer,0,1
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer,0,1
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer,1,0
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new_customer,1,0
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer,1,0


In [22]:
#Create a new dataframe to concat with original df
df3 = pd.DataFrame(zip(['003','003'],
                      ['2019-02-05','2019-02-05'],
                      ['2020-01-08','2020-01-08'],
                      ['prod2','prod4'],
                      ['prod2_name','prod4_name']),
                  columns = ['customer_id', 'start_date','end_date','product_id','product_name'])
df3['start_date'] = pd.to_datetime(df3['start_date'])
df3['end_date'] = pd.to_datetime(df3['end_date'])
df3.loc[df3.start_date<'2019-02-03','customer_category'] = 'old_customer'
df3.loc[df3.start_date>='2019-02-03','customer_category'] = 'new_customer'
df3

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category
0,3,2019-02-05,2020-01-08,prod2,prod2_name,new_customer
1,3,2019-02-05,2020-01-08,prod4,prod4_name,new_customer


In [23]:
#Horizontal concatenate (axis =0)
combined_df = pd.concat([df[df3.columns],df3],axis =0) #select only common columns from df
combined_df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new_customer
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer
0,3,2019-02-05,2020-01-08,prod2,prod2_name,new_customer
1,3,2019-02-05,2020-01-08,prod4,prod4_name,new_customer


### 6. Group functions in pandas

`groupby` syntax in pandas work well with multiple aggregate functions: 
- `list`: collect a list of all items in a columns and keep original order of items
- `set`: collect an unique list of all items in a columns without keeping original order of items
- `len`: count the volume of a specific category in groupby
- `sum`: mathematical sum of numeric items or concatenate of character items

In order to avoid multiple levels of column titles, we may need to use `inplace = True` in `reset_index` after executing groupby syntax

In [24]:
#Group Data
df_group = combined_df.groupby(('customer_id', 'start_date','end_date'))\
                      .agg({'product_id': lambda x :list(x),
                            'product_name': lambda x :list(x),
                            'customer_category':len})\
                      .rename(columns ={'customer_category':'product_count'})

df_group.reset_index(inplace = True)
df_group

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,product_count
0,1,2019-01-01,2019-12-01,"[prod1, prod2]","[prod1_name, prod2_name]",2
1,2,2019-05-01,2020-07-31,"[prod1, prod3, prod4]","[prod1_name, prod3_name, prod4_name]",3
2,3,2019-02-05,2020-01-08,"[prod2, prod4]","[prod2_name, prod4_name]",2


In [25]:
#Rename all columns
df_group.columns =['customerId','startDate','endDate','productId','productName','productCount']
df_group

Unnamed: 0,customerId,startDate,endDate,productId,productName,productCount
0,1,2019-01-01,2019-12-01,"[prod1, prod2]","[prod1_name, prod2_name]",2
1,2,2019-05-01,2020-07-31,"[prod1, prod3, prod4]","[prod1_name, prod3_name, prod4_name]",3
2,3,2019-02-05,2020-01-08,"[prod2, prod4]","[prod2_name, prod4_name]",2


Beside __rename columns__ after executing groupby, there is another way to do it __right inside the `agg` syntax__ by creating __nested__ dictionary. Then we can drop unnecesary column title level with `droplevel(n)` all in one cell. For demonstration purpose, these 2 steps are separated into 2 cells

In [26]:
#Group Data with nested dictionary in agg function --> We will have 2 level column title like below.
df_group = combined_df.groupby(('customer_id', 'start_date','end_date'))\
                      .agg({'product_id': {'product_id':lambda x :list(x)},
                            'product_name': {'product_name': lambda x :list(x)},
                            'customer_category':{'product_count':len}})
df_group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,product_id,product_name,customer_category
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,product_id,product_name,product_count
customer_id,start_date,end_date,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,2019-01-01,2019-12-01,"[prod1, prod2]","[prod1_name, prod2_name]",2
2,2019-05-01,2020-07-31,"[prod1, prod3, prod4]","[prod1_name, prod3_name, prod4_name]",3
3,2019-02-05,2020-01-08,"[prod2, prod4]","[prod2_name, prod4_name]",2


In [27]:
# We may need to drop the first level of column title which is all about the source column name
df_group.columns =df_group.columns.droplevel(0)
df_group.reset_index(inplace = True)
df_group

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,product_count
0,1,2019-01-01,2019-12-01,"[prod1, prod2]","[prod1_name, prod2_name]",2
1,2,2019-05-01,2020-07-31,"[prod1, prod3, prod4]","[prod1_name, prod3_name, prod4_name]",3
2,3,2019-02-05,2020-01-08,"[prod2, prod4]","[prod2_name, prod4_name]",2


### 7.Transpose functions

In [28]:
#data frame before transpose
combined_df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new_customer
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer
0,3,2019-02-05,2020-01-08,prod2,prod2_name,new_customer
1,3,2019-02-05,2020-01-08,prod4,prod4_name,new_customer


__pivot_table function:__
- `index`: pass list of column to keep, not to transpose
- `columns`: column from vertical view to horizontal view
- `values`: pair value of variable in columns to apply aggregate function
- `aggfunc`: aggregate function to apply on values column. 
- `aggfunc`: It can be `sum`, `count`, `mean` or `user defined` function like `lambda x: ','.join(str(v) for v in x)`
- `fill_value`: fill value incase aggregate function return blank entry. 
- `fill_value`: Note: DO NOT use empty string ''. Use space ' ' instead because we can save the file to csv easily without creating na value

__REMEMBER__ to set `columns.name = None` to get rid of the title for index

In [29]:
pivot_df = pd.pivot_table(combined_df,index = ['customer_id','start_date','end_date','customer_category'],
                              columns ='product_id',values = 'product_name',aggfunc = 'count',fill_value = 0)\
                  .reset_index()
#to set blank index name instead of having index with title 'product_id' from columns variable
pivot_df.columns.name = None
pivot_df

Unnamed: 0,customer_id,start_date,end_date,customer_category,prod1,prod2,prod3,prod4
0,1,2019-01-01,2019-12-01,old_customer,1,1,0,0
1,2,2019-05-01,2020-07-31,new_customer,1,0,1,1
2,3,2019-02-05,2020-01-08,new_customer,0,1,0,1


We may use `pivot_table` as dot syntax after a pandas dataframe (instead of the default pd) without passing the dataframe later

In [30]:
pivot_df = combined_df.pivot_table(index = ['customer_id','start_date','end_date','customer_category'],
                              columns ='product_id',values = 'product_name',aggfunc = 'count',fill_value = 0)\
                  .reset_index()
pivot_df.columns.name = None
pivot_df

Unnamed: 0,customer_id,start_date,end_date,customer_category,prod1,prod2,prod3,prod4
0,1,2019-01-01,2019-12-01,old_customer,1,1,0,0
1,2,2019-05-01,2020-07-31,new_customer,1,0,1,1
2,3,2019-02-05,2020-01-08,new_customer,0,1,0,1


__melt function:__

`metl` function is the __opposite__ of `pivot_table` to convert detail from horizontal view to vertical view
- `id_vars` : pass list of column to keep as identifier variables, not to transpose
- `value_vars` : list of column(s) to unpivot from horizontal view to vertical view. If not specified, uses all columns that are not set as id_vars.
- `var_name` : scalar, name to use for the column in value_vars. If None it uses frame.columns.name or ‘variable’.
- `value_name` : scalar, default ‘value’. Name to use for the ‘value’ column.
- `col_level` : int or string, optional. If columns are a MultiIndex then use this level to melt.

In [31]:
melt_df = pd.melt(combined_df, id_vars=['customer_id','start_date','end_date','customer_category'],
                   value_vars=['product_id','product_name'],var_name= 'product_property',value_name='property_detail')\
            .sort_values(by=['customer_id','product_property'])
melt_df

Unnamed: 0,customer_id,start_date,end_date,customer_category,product_property,property_detail
0,1,2019-01-01,2019-12-01,old_customer,product_id,prod1
1,1,2019-01-01,2019-12-01,old_customer,product_id,prod2
7,1,2019-01-01,2019-12-01,old_customer,product_name,prod1_name
8,1,2019-01-01,2019-12-01,old_customer,product_name,prod2_name
2,2,2019-05-01,2020-07-31,new_customer,product_id,prod1
3,2,2019-05-01,2020-07-31,new_customer,product_id,prod3
4,2,2019-05-01,2020-07-31,new_customer,product_id,prod4
9,2,2019-05-01,2020-07-31,new_customer,product_name,prod1_name
10,2,2019-05-01,2020-07-31,new_customer,product_name,prod3_name
11,2,2019-05-01,2020-07-31,new_customer,product_name,prod4_name


Similar to pivot_table, we can use `melt` function as dot syntax after a pandas dataframe (instead of the default pd) without passing the dataframe later

In [32]:
melt_df = combined_df.melt(id_vars=['customer_id','start_date','end_date','customer_category'],
                   value_vars=['product_id','product_name'],var_name= 'product_property',value_name='property_detail')\
            .sort_values(by=['customer_id','product_property'])
melt_df.head()

Unnamed: 0,customer_id,start_date,end_date,customer_category,product_property,property_detail
0,1,2019-01-01,2019-12-01,old_customer,product_id,prod1
1,1,2019-01-01,2019-12-01,old_customer,product_id,prod2
7,1,2019-01-01,2019-12-01,old_customer,product_name,prod1_name
8,1,2019-01-01,2019-12-01,old_customer,product_name,prod2_name
2,2,2019-05-01,2020-07-31,new_customer,product_id,prod1


### 8. Join dataframe

In [33]:
print('https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html')

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html


In [34]:
#Create customer reference table to join with customer product table
cust_data = {'customer_id':['001','002','003','004'],
             'customer_name':['First1 Last1','First2 Last2','First3 Last3','First4 Last4']}
customer_df = pd.DataFrame(cust_data)
customer_df

Unnamed: 0,customer_id,customer_name
0,1,First1 Last1
1,2,First2 Last2
2,3,First3 Last3
3,4,First4 Last4


Two ways to do merge or join with pandas:
- df_new = `pd`.merge(`df1`,`df2`,on=...)
- df_new = `df1`.merge(`df2`,on=...)

Join using:
- `on` for common column name and 
- `how` for type of join (inner, left, right,outer). `how` method has default value of inner

In [35]:
merged_df = pd.merge(combined_df,customer_df, on = 'customer_id',how='right')
merged_df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,customer_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer,First1 Last1
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer,First1 Last1
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer,First2 Last2
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new_customer,First2 Last2
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer,First2 Last2
5,3,2019-02-05,2020-01-08,prod2,prod2_name,new_customer,First3 Last3
6,3,2019-02-05,2020-01-08,prod4,prod4_name,new_customer,First3 Last3
7,4,NaT,NaT,,,,First4 Last4


Another way to do merge `df1.merge(df2)`

In [36]:
merged_df = combined_df.merge(customer_df, on = 'customer_id',how='right')
merged_df

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,customer_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer,First1 Last1
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer,First1 Last1
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer,First2 Last2
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new_customer,First2 Last2
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer,First2 Last2
5,3,2019-02-05,2020-01-08,prod2,prod2_name,new_customer,First3 Last3
6,3,2019-02-05,2020-01-08,prod4,prod4_name,new_customer,First3 Last3
7,4,NaT,NaT,,,,First4 Last4


If the key columns in 2 dataframes don't have the same name, we can use `left_on` and `right_on` instead of `on`, then pass a list of key columns to each of the on

In [37]:
merged_df = combined_df.merge(customer_df, left_on = 'customer_id',right_on = 'customer_id',how='right')
merged_df.head()

Unnamed: 0,customer_id,start_date,end_date,product_id,product_name,customer_category,customer_name
0,1,2019-01-01,2019-12-01,prod1,prod1_name,old_customer,First1 Last1
1,1,2019-01-01,2019-12-01,prod2,prod2_name,old_customer,First1 Last1
2,2,2019-05-01,2020-07-31,prod1,prod1_name,new_customer,First2 Last2
3,2,2019-05-01,2020-07-31,prod3,prod3_name,new_customer,First2 Last2
4,2,2019-05-01,2020-07-31,prod4,prod4_name,new_customer,First2 Last2


Save file to csv

`merged_df.to_csv('url_path/filename.csv',sep=',',encoding ='utf-8',index=False)`