# 4.6 Merging and Exporting Data

## This script contains the following points

#### 1. Create data to experiment on
#### 2. Concatenate dataframes
#### 3. Append data (DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is used instead)
#### 4. Merge data

In [33]:
# Import libraries

import pandas as pd
import numpy as np
import os

#### 1. Create data to experiment on

In [34]:
# January 2020 purchases, stored column-wise: position i in every list
# describes the same customer
data1 = {
    'customer_id': ['6732', '767', '890', '635'],
    'month': ['Jan-20'] * 4,
    'purchased_meat': [0, 13, 3, 4],
    'purchased_alcohol': [1, 2, 10, 0],
    'purchased_snacks': [10, 5, 1, 7],
}

In [35]:
# February 2020 purchases, stored column-wise: position i in every list
# describes the same customer
data2 = {
    'customer_id': ['6732', '767', '890', '635'],
    'month': ['Feb-20'] * 4,
    'purchased_meat': [0, 10, 5, 3],
    'purchased_alcohol': [2, 4, 14, 0],
    'purchased_snacks': [15, 3, 2, 6],
}

In [36]:
# Convert each month's dictionary into a dataframe with explicit row labels 0-3
df = pd.DataFrame(data=data1, index=[0, 1, 2, 3])
df_1 = pd.DataFrame(data=data2, index=[0, 1, 2, 3])

In [37]:
# Check the January 2020 dataframe
df

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7


In [38]:
# Check the February 2020 dataframe
df_1

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


#### 2. Concatenate dataframes

In [39]:
# Collect the January (df) and February (df_1) dataframes in a list
# so they can be passed to pd.concat together

frames = [df, df_1]

In [40]:
# Check the output: the list should hold both dataframes, in order

frames

[  customer_id   month  purchased_meat  purchased_alcohol  purchased_snacks
 0        6732  Jan-20               0                  1                10
 1         767  Jan-20              13                  2                 5
 2         890  Jan-20               3                 10                 1
 3         635  Jan-20               4                  0                 7,
   customer_id   month  purchased_meat  purchased_alcohol  purchased_snacks
 0        6732  Feb-20               0                  2                15
 1         767  Feb-20              10                  4                 3
 2         890  Feb-20               5                 14                 2
 3         635  Feb-20               3                  0                 6]

In [41]:
# Check the data type to confirm that frames is a list

type(frames)

list

In [42]:
# Concatenate the dataframes with pd.concat's default options:
# rows are stacked (axis=0) and each frame keeps its own index labels

df_concat = pd.concat(objs=frames)

In [43]:
# Check the output: all eight rows are present, and the index labels 0-3
# repeat because each dataframe kept its original index

df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


#### 3. Append data

In [44]:
# DataFrame.append is deprecated (removed in pandas 2.0), so the rows of
# df_1 are added below df with pd.concat instead
df_appended = pd.concat(objs=[df, df_1])

In [45]:
# Check the output
df_appended

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


#### 4. Merge data

In [46]:
# Second January 2020 table: same customer_id and month values as df,
# but a different value column (days_purchased_on)

data3 = {
    'customer_id': ['6732', '767', '890', '635'],
    'month': ['Jan-20'] * 4,
    'days_purchased_on': [0, 13, 3, 4],
}

In [47]:
# Convert the dictionary to a dataframe with explicit row labels 0-3

df_2 = pd.DataFrame(data=data3, index=[0, 1, 2, 3])

In [48]:
# Check the new dataframe
df_2

Unnamed: 0,customer_id,month,days_purchased_on
0,6732,Jan-20,0
1,767,Jan-20,13
2,890,Jan-20,3
3,635,Jan-20,4


In [49]:
# Merge df and df_2 using customer_id as the only key (default inner join)

df_merged = df.merge(right=df_2, how='inner', on='customer_id')

In [50]:
# Check the output: 'month' exists in both dataframes but was not a merge key,
# so it appears twice, as month_x (from df) and month_y (from df_2)
df_merged

Unnamed: 0,customer_id,month_x,purchased_meat,purchased_alcohol,purchased_snacks,month_y,days_purchased_on
0,6732,Jan-20,0,1,10,Jan-20,0
1,767,Jan-20,13,2,5,Jan-20,13
2,890,Jan-20,3,10,1,Jan-20,3
3,635,Jan-20,4,0,7,Jan-20,4


In [51]:
# Merge df and df_2 using both customer_id and month as keys

df_merged = df.merge(right=df_2, how='inner', on=['customer_id', 'month'])

In [52]:
# Check the output: with 'month' used as a key there is a single month column
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on
0,6732,Jan-20,0,1,10,0
1,767,Jan-20,13,2,5,13
2,890,Jan-20,3,10,1,3
3,635,Jan-20,4,0,7,4


In [53]:
# Merge df and df_2 on customer_id and month; indicator=True adds a _merge
# column recording whether each row came from the left frame, the right
# frame, or both

df_merged = df.merge(
    right=df_2,
    how='inner',
    on=['customer_id', 'month'],
    indicator=True,
)

In [54]:
# Check the output: the new _merge column holds the merge flag for each row
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,13,both
2,890,Jan-20,3,10,1,3,both
3,635,Jan-20,4,0,7,4,both


In [55]:
# Count the rows per merge flag to see how many keys matched in both dataframes
df_merged['_merge'].value_counts()

both          4
left_only     0
right_only    0
Name: _merge, dtype: int64

In [56]:
# Test the same merge through the top-level pd.merge function; the result is
# only displayed, not assigned, so df_merged is not overwritten

pd.merge(
    left=df,
    right=df_2,
    how='inner',
    on=['customer_id', 'month'],
    indicator=True,
)

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,13,both
2,890,Jan-20,3,10,1,3,both
3,635,Jan-20,4,0,7,4,both
