# 4.6.2 Combining & Exporting Data

## This script contains the following points:

### 1. Import the data sets

### 2. Check the dimensions of the imported dataframes

### 3. Determine a suitable way to combine the dataframes

### 4. Confirm the results of the merge using the merge flag

### 5. Export the newly created dataframe in a suitable format

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

### 1. Import the data sets

In [3]:
# Project root folder; the data file paths in this script are built from it with os.path.join
# (raw string so the backslashes in the Windows path are not treated as escape sequences)
path = r'C:\Users\julia\anaconda3\Instacart Basket Analysis'

In [6]:
# Load the combined orders/products dataframe from its pickle file in '02 Data/Prepared Data'
df_ords_prods_combined = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl'))

In [7]:
# Load the checked products data; index_col = False keeps pandas from using the first column as the index
# NOTE(review): the file still carries two leftover index columns ('Unnamed: 0.1' and 'Unnamed: 0',
# visible in the df_prods.head() output) - consider dropping them before merging
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index_col = False)

### 2. Check the dimensions of the imported dataframes

In [8]:
# Preview the first five rows of the combined orders/products dataframe
df_ords_prods_combined.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,8382,23,prior,2,0,10,9.0,3873,1,0,both
1,8382,23,prior,2,0,10,9.0,28199,2,0,both
2,8382,23,prior,2,0,10,9.0,42372,3,0,both
3,8382,23,prior,2,0,10,9.0,23106,4,0,both
4,8382,23,prior,2,0,10,9.0,33819,5,0,both


In [9]:
# Number of rows and columns of the combined orders/products dataframe
df_ords_prods_combined.shape

(3809, 11)

In [10]:
# Preview the first five rows of the products dataframe
df_prods.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3


In [11]:
# Number of rows and columns of the products dataframe
df_prods.shape

(49693, 6)

### 3. Determine a suitable way to combine the dataframes

In [12]:
# First merge attempt: raises a ValueError because df_ords_prods_combined already contains a '_merge' column,
# which clashes with the flag column that indicator = True tries to create
df_ords_prods_merged = df_prods.merge(df_ords_prods_combined, on = 'product_id', indicator = True)

ValueError: Cannot use name of an existing column for indicator column

In [None]:
# Because a '_merge' column already exists in df_ords_prods_combined, it has to be dropped before merging with indicator = True

In [13]:
# Remove the existing '_merge' flag column so that the next merge can create its own indicator column
df_ords_prods_combined = df_ords_prods_combined.drop(columns = ['_merge'])

In [14]:
# Check that the '_merge' column is no longer part of the dataframe
df_ords_prods_combined.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,8382,23,prior,2,0,10,9.0,3873,1,0
1,8382,23,prior,2,0,10,9.0,28199,2,0
2,8382,23,prior,2,0,10,9.0,42372,3,0
3,8382,23,prior,2,0,10,9.0,23106,4,0
4,8382,23,prior,2,0,10,9.0,33819,5,0


In [15]:
# Combine product details (left) with the order lines (right) on their shared key 'product_id'.
# how = 'inner' is the pandas default, spelled out here: only product_ids found in both dataframes are kept.
# indicator = True adds the '_merge' flag column used to check the result.
df_ords_prods_merged = pd.merge(df_prods, df_ords_prods_combined, how = 'inner', on = 'product_id', indicator = True)

In [16]:
# Display the merged dataframe
df_ords_prods_merged

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9,2510,4297,prior,10,4,12,6.0,18,1,both
1,33,34,,121,14,12.2,10142,4604,prior,5,1,12,8.0,7,1,both
2,44,45,European Cucumber,83,4,14.3,1425,2593,prior,4,1,14,7.0,7,1,both
3,44,45,European Cucumber,83,4,14.3,1915,4631,prior,29,6,20,7.0,5,1,both
4,44,45,European Cucumber,83,4,14.3,1375,5801,prior,16,1,17,6.0,10,0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3801,49687,49683,Cucumber Kirby,83,4,13.2,2803,4282,prior,12,3,17,7.0,4,1,both
3802,49687,49683,Cucumber Kirby,83,4,13.2,9414,4588,prior,5,6,20,1.0,6,1,both
3803,49687,49683,Cucumber Kirby,83,4,13.2,4604,4789,prior,24,0,19,6.0,1,0,both
3804,49687,49683,Cucumber Kirby,83,4,13.2,6088,5321,prior,12,0,15,12.0,1,1,both


### 4. Confirm the results of the merge using the merge flag

In [18]:
# Count the rows per merge flag value
# NOTE(review): the merge uses an inner join (the pandas default), so only 'both' can occur here;
# an outer join would be needed to see rows that exist in just one of the dataframes
df_ords_prods_merged['_merge'].value_counts()

_merge
both          3806
left_only        0
right_only       0
Name: count, dtype: int64

In [None]:
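# The merged dataframe has 3806 rows, all flagged 'both'. That is 3 fewer than the 3809 rows of df_ords_prods_combined,
# presumably because their product_id has no match in df_prods and the inner join dropped them.
# The overall row count is this low because of the heavy row deletion caused by the aforementioned problem.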

### 5. Export the newly created dataframe in a suitable format

In [19]:
# Export the merged dataframe as a pickle file to '02 Data/Prepared Data'
df_ords_prods_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_merged.pkl'))