# Merge and Export Data

## Importing Dataframes
## Checking Data for merging
## Merging Dataframes df_prods_clean and df_ords_prods_combined into df_ords_prods_merge
## Export new df_ords_prods_merge as pkl

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

# Importing Data

In [2]:
# Project root folder; the read/export cells join sub-folder names onto this string.
path = r'C:\Users\Drew\Instacart Basket Analysis'

In [3]:
# Load the original orders table from '02 Data/Original Data/orders.csv'.
# NOTE(review): df_ords is not referenced by any later cell visible in this
# notebook - confirm whether this load is still needed.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

In [4]:
# Load the original products table from '02 Data/Original Data/products.csv'.
# NOTE(review): df_prods is not referenced by any later cell visible in this
# notebook (the merge uses df_prods_clean) - confirm whether this load is still needed.
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [5]:
# Load the checked products table from '02 Data/Prepared Data/products_checked.csv'.
# This is the product frame used in the merge further down.
df_prods_clean = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index_col=False,
)

In [6]:
# Load the original departments table from '02 Data/Original Data/departments.csv'.
# NOTE(review): df_dep is not referenced by any later cell visible in this
# notebook - confirm whether this load is still needed.
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

In [7]:
# Load the original order/product lines from
# '02 Data/Original Data/orders_products_prior.csv'.
# NOTE(review): df_ords_prior is not referenced by any later cell visible in this
# notebook (the merge uses the pickled df_ords_prods_combined instead) - confirm
# whether this load is still needed.
df_ords_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv'),
    index_col=False,
)

## Importing orders_products_combined.pkl dataframe

In [8]:
# Load the previously prepared orders/products dataframe from its pickle file
# in '02 Data/Prepared Data'.
df_ords_prods_combined = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)

In [9]:
# Confirm the unpickled df_ords_prods_combined has the expected (rows, columns) shape.
df_ords_prods_combined.shape

(32434489, 10)

In [10]:
# Preview the first rows to confirm the column headers match those the dataframe
# had when it was created.
df_ords_prods_combined.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


# Comparing and Checking Dataframe for Merging

## Determining a suitable way to combine the orders_products_combined dataframe with the products data set.

In [11]:
# List the columns of df_prods_clean so they can be compared with those of
# df_ords_prods_combined when choosing a merge key.
print(df_prods_clean.columns.tolist())

['Unnamed: 0', 'product_id', 'product_name', 'aisle_id', 'department_id', 'prices']


In [12]:
# Remove the 'Unnamed: 0' column from df_prods_clean; it is not needed for the merge.
df_prods_clean = df_prods_clean.drop(labels=['Unnamed: 0'], axis='columns')

In [13]:
# Check the (rows, columns) shape of df_prods_clean after dropping 'Unnamed: 0'.
df_prods_clean.shape

(49672, 5)

In [14]:
# List the columns of df_ords_prods_combined for comparison with df_prods_clean.
print(df_ords_prods_combined.columns.tolist())

['order_id', 'user_id', 'order_number', 'orders_day_of_week', 'order_hour_of_day', 'days_since_prior_order', 'product_id', 'add_to_cart_order', 'reordered', '_merge']


In [15]:
# Remove the '_merge' column left over from the earlier merge. With it present, the
# merge below (indicator=True) failed with:
# "Cannot use name of an existing column for indicator column"
df_ords_prods_combined = df_ords_prods_combined.drop(labels=['_merge'], axis='columns')

In [16]:
# Check the (rows, columns) shape of df_ords_prods_combined after dropping '_merge'.
df_ords_prods_combined.shape

(32434489, 9)

## 'product_id' is the only column shared by df_prods_clean and df_ords_prods_combined, so it is a suitable key for combining them with an inner join

# Merging Dataframes

## Merging df_prods_clean and df_ords_prods_combined into df_ords_prods_merge

In [17]:
# Merge df_prods_clean (left) with df_ords_prods_combined (right) on 'product_id'.
# The join type is spelled out; 'inner' is also the pandas default.
# NOTE(review): an inner join keeps only matched rows, so the '_merge' indicator can
# only ever read 'both'. In the recorded run the result has 32,404,859 rows versus
# 32,434,489 in df_ords_prods_combined, i.e. rows without a matching product are
# dropped without showing up in the indicator. Run with how='outer' temporarily if
# those rows need to be inspected.
df_ords_prods_merge = pd.merge(
    df_prods_clean,
    df_ords_prods_combined,
    on='product_id',
    how='inner',
    indicator=True,
)

In [18]:
# Preview the first rows of the merged dataframe.
df_ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,33664,2 % Reduced Fat Milk,84,16,9.9,183964,873,3,0,10,7.0,11,0,both
1,33664,2 % Reduced Fat Milk,84,16,9.9,1851256,873,4,6,12,13.0,8,1,both
2,33664,2 % Reduced Fat Milk,84,16,9.9,1915696,1893,1,5,17,,10,0,both
3,33664,2 % Reduced Fat Milk,84,16,9.9,2763293,1893,2,4,16,13.0,6,1,both
4,33664,2 % Reduced Fat Milk,84,16,9.9,2564805,1893,4,1,17,30.0,3,1,both


# Confirming Results of Merge

In [19]:
# Display the merged dataframe (the notebook shows its first and last rows).
df_ords_prods_merge

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,33664,2 % Reduced Fat Milk,84,16,9.9,183964,873,3,0,10,7.0,11,0,both
1,33664,2 % Reduced Fat Milk,84,16,9.9,1851256,873,4,6,12,13.0,8,1,both
2,33664,2 % Reduced Fat Milk,84,16,9.9,1915696,1893,1,5,17,,10,0,both
3,33664,2 % Reduced Fat Milk,84,16,9.9,2763293,1893,2,4,16,13.0,6,1,both
4,33664,2 % Reduced Fat Milk,84,16,9.9,2564805,1893,4,1,17,30.0,3,1,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,48172,Culture Club Kombucha Goji Ginger,31,7,1.0,647110,121251,12,6,23,14.0,8,0,both
32404855,48172,Culture Club Kombucha Goji Ginger,31,7,1.0,2299169,136644,6,2,2,30.0,4,0,both
32404856,48172,Culture Club Kombucha Goji Ginger,31,7,1.0,154941,171023,10,4,9,27.0,5,0,both
32404857,48172,Culture Club Kombucha Goji Ginger,31,7,1.0,2457228,177923,3,0,19,28.0,4,0,both


In [20]:
# Check the (rows, columns) shape of the newly merged dataframe.
df_ords_prods_merge.shape

(32404859, 14)

In [21]:
# Count rows per '_merge' category to review the merge result.
# NOTE(review): the merge above uses the default inner join, so every row is
# flagged 'both'; this count cannot reveal rows that failed to match.
df_ords_prods_merge['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [22]:
# List the column names of the newly merged dataframe.
print(df_ords_prods_merge.columns.tolist())

['product_id', 'product_name', 'aisle_id', 'department_id', 'prices', 'order_id', 'user_id', 'order_number', 'orders_day_of_week', 'order_hour_of_day', 'days_since_prior_order', 'add_to_cart_order', 'reordered', '_merge']


In [23]:
# Summary statistics for the numeric columns of the merged dataframe.
df_ords_prods_merge.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered
count,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,30328760.0,32404860.0,32404860.0
mean,25598.66,71.19612,9.919792,7.79018,1710745.0,102937.2,17.1423,2.738867,13.42515,11.10408,8.352547,0.5895873
std,14084.0,38.21139,6.281485,4.242122,987298.8,59466.1,17.53532,2.090077,4.24638,8.779064,7.127071,0.4919087
min,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
25%,13544.0,31.0,4.0,4.2,855947.0,51422.0,5.0,1.0,10.0,5.0,3.0,0.0
50%,25302.0,83.0,9.0,7.4,1711049.0,102616.0,11.0,3.0,13.0,8.0,6.0,1.0
75%,37947.0,107.0,16.0,11.3,2565499.0,154389.0,24.0,5.0,16.0,15.0,11.0,1.0
max,49688.0,134.0,21.0,25.0,3421083.0,206209.0,99.0,6.0,23.0,30.0,145.0,1.0


# Export df_ords_prods_merge

In [24]:
# Export df_ords_prods_merge to a pickle file in '02 Data/Prepared Data'.
# NOTE(review): the '_merge' indicator column is still part of the dataframe at this
# point, so it is written to the file; a later merge with indicator=True on this
# data would hit the same "Cannot use name of an existing column for indicator
# column" error unless the column is dropped first - confirm this is intended.
df_ords_prods_merge.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merge.pkl')
)

In [29]:
# Spot-check: show every merged row for the product named
# 'Lowfat 2% Milkfat Cottage Cheese'.
df_ords_prods_merge.loc[
    df_ords_prods_merge['product_name'] == 'Lowfat 2% Milkfat Cottage Cheese'
]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
698,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,1.5,912404,17,12,2,14,5.0,5,0,both
699,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,1.5,603376,17,22,6,16,4.0,3,1,both
700,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,1.5,3264360,135,2,2,21,13.0,6,0,both
701,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,1.5,892534,135,3,0,8,12.0,3,1,both
702,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,1.5,229704,342,8,1,19,30.0,9,0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5122,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,1.5,3172853,205650,18,1,9,7.0,17,1,both
5123,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,1.5,2504315,205818,3,5,15,3.0,13,0,both
5124,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,1.5,1108388,205818,5,4,5,1.0,5,1,both
5125,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,1.5,1916142,206049,1,2,17,,2,0,both
