# Merge and Export Data

## 1. Importing Data
## 2. Checking Data for merging
## 3. Merging df_ords and df_ords_prior to create df_merged_large
## 4. Export new df_merged_large as csv and pkl

In [76]:
# Import Libraries
import pandas as pd
import numpy as np
import os

# Importing Data

In [77]:
# Root folder of the Instacart project; every data path below is built from it
path = r'C:\Users\Drew\Instacart Basket Analysis'

In [78]:
# Load the raw orders table from the Original Data folder
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

In [79]:
# Load the raw products table from the Original Data folder
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [80]:
# Load the raw departments table from the Original Data folder
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

In [81]:
# Load the raw orders_products_prior table from the Original Data folder
df_ords_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv'),
    index_col=False,
)

# Comparing and Checking Dataframes for Merging

## Comparing df_ords and df_prods columns to see if any are shared

In [82]:
# Show the column names of df_ords as a plain list
print(df_ords.columns.tolist())

['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']


In [83]:
# Show the column names of df_prods as a plain list
print(df_prods.columns.tolist())

['product_id', 'product_name', 'aisle_id', 'department_id', 'prices']


## No shared columns found between df_ords and df_prods.
## Will first combine df_ords with the orders_products_prior dataframe (df_ords_prior), which shares the "order_id" column with df_ords.
## orders_products_prior also contains a "product_id" column, so merging it into df_ords gives the combined data a column in common with df_prods.

In [84]:
# Check the dimensions (rows, columns) of the df_ords_prior dataframe loaded above
df_ords_prior.shape

(32434489, 4)

In [85]:
# Preview the first rows of df_ords_prior to see its columns and sample values
df_ords_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [97]:
# Preview the first rows of df_ords.
# NOTE(review): this cell's execution count (97) is later than the rename/drop cells (88, 89),
# so the output shown already has 'orders_day_of_week' and no 'eval_set' column.
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [98]:
# Check the dimensions (rows, columns) of df_ords.
# NOTE(review): executed after the 'eval_set' drop (execution count 98 vs 89), hence 6 columns rather than 7.
df_ords.shape

(3421083, 6)

In [88]:
# Rename 'order_dow' to 'orders_day_of_week' to match the column header used in the lesson example.
# The result is reassigned instead of using inplace=True, matching the way the
# 'eval_set' drop on df_ords is written and avoiding in-place mutation.
df_ords = df_ords.rename(columns={'order_dow': 'orders_day_of_week'})

In [89]:
# Remove the 'eval_set' column from df_ords
df_ords = df_ords.drop(columns='eval_set')

## Merging dataframes df_ords and df_ords_prior

In [90]:
# Combine df_ords with df_ords_prior into a new dataframe, df_merged_large, keyed on 'order_id'.
# how='inner' is the default join for merge, spelled out here: only order_ids found in
# both dataframes are kept, so the merge flag can only show entries with a value of 'both'.
# indicator=True adds the '_merge' flag column so the match can be checked afterwards.
df_merged_large = df_ords.merge(
    df_ords_prior,
    on='order_id',
    how='inner',
    indicator=True,
)

In [91]:
# Preview the first rows of the new df_merged_large to confirm that columns from
# both df_ords and df_ords_prior (plus the '_merge' flag) are present
df_merged_large.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [102]:
# Check the dimensions (rows, columns) of df_merged_large
df_merged_large.shape

(32434489, 10)

In [92]:
# Count the values in the '_merge' flag column of the merged dataframe.
# value_counts() tallies each distinct flag ('both', 'left_only', 'right_only'),
# showing at a glance whether the merge is a full match.
# NOTE: this dataframe comes from an inner join, which keeps matched rows only,
# so every row is flagged 'both' here.
df_merged_large['_merge'].value_counts()

_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64

In [93]:
# Repeat the merge as an outer join, so order_ids found in only one of the two
# dataframes are kept and flagged in '_merge' as 'left_only' / 'right_only'
df_merged_large_2 = df_ords.merge(
    df_ords_prior,
    on='order_id',
    how='outer',
    indicator=True,
)

In [94]:
# Count the '_merge' flags of the outer join to double-check the merge.
# Unlike the inner join, this shows there is not a full match: the 'left_only'
# count in the output is the number of df_ords rows with no match in df_ords_prior.
df_merged_large_2['_merge'].value_counts()

_merge
both          32434489
left_only       206209
right_only           0
Name: count, dtype: int64

# Exporting Data in CSV and Pickle Formats

In [99]:
# Export df_merged_large to csv
# NOTE(review): to_csv writes the row index as an extra unnamed first column by default,
# and the '_merge' flag column is exported as well - confirm both are wanted downstream.
df_merged_large.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.csv'),
)

In [100]:
# Export df_merged_large to pkl
# Reading the file back follows a syntax similar to the csv import; the only differences
# are the function (pd.read_pickle()) and the lack of an index_col argument, since
# pickle-format files already include the index information.
df_merged_large.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl'),
)