## Merging products_checked.csv and orders_products_combined.pkl

### This script includes the following points:

#### 1. Importing data

#### 2. Checking output and dimensions of data

#### 3. Optimizing for memory function

#### 4. Merging data

#### 5. Confirm results with merge flag

#### 6. Exporting data as pickle

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Set path
path = r'/Users/mainframe/Documents/Instacart Basket Analysis'

### 1. Importing data

In [3]:
# Import data
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index_col = False)

In [4]:
df_ords_combined = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl'))

### 2. Checking output and dimensions of data

In [5]:
df_prods.describe()

Unnamed: 0.1,Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0,49672.0
mean,24852.005053,24850.349775,67.762442,11.728942,9.993282
std,14342.265579,14340.705287,38.315784,5.850779,453.615536
min,0.0,1.0,1.0,1.0,1.0
25%,12432.75,12432.75,35.0,7.0,4.1
50%,24851.5,24850.5,69.0,13.0,7.1
75%,37272.25,37268.25,100.0,17.0,11.1
max,49692.0,49688.0,134.0,21.0,99999.0


In [6]:
# Check the output
df_prods.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3


In [7]:
df_ords_combined.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_ordered,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [8]:
df_ords_combined.describe()

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)


Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_ordered,days_since_prior_order,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0,32434490.0,30356421.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,102937.2,17.14205,2.738818,13.42498,,25576.34,8.351076,0.5896975
std,987300.7,59466.48,17.53504,2.090049,4.246365,0.0,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,855943.0,51421.0,5.0,1.0,10.0,5.0,13530.0,3.0,0.0
50%,1711048.0,102611.0,11.0,3.0,13.0,8.0,25256.0,6.0,1.0
75%,2565514.0,154391.0,24.0,5.0,16.0,15.0,37935.0,11.0,1.0
max,3421083.0,206209.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0


In [9]:
# Check shape
df_prods.shape

(49672, 6)

In [10]:
df_ords_combined.shape

(32434489, 10)

In [11]:
df_prods.tail()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
49667,49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49668,49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49669,49690,49686,Artisan Baguette,112,3,7.8
49670,49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7
49671,49692,49688,Fresh Foaming Cleanser,73,11,13.5


In [12]:
# Check data types in df_prods
print(df_prods.dtypes)

Unnamed: 0         int64
product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object


### 3. Optimizing for memory function

In [13]:
# Changing data types in df_prods
df_prods['product_id'] = df_prods['product_id'].astype('uint16')
df_prods['aisle_id'] = df_prods['aisle_id'].astype('uint8')
df_prods['department_id'] = df_prods['department_id'].astype('uint8')

In [14]:
# Drop column in df_prods
df_prods = df_prods.drop(columns = ['Unnamed: 0'])

In [15]:
# Drop columns in df_ords_combined
df_ords_combined = df_ords_combined.drop(columns = ['_merge'])

### 4. Merging data

In [16]:
# Merge df_prods and df_ords_combined
df_merged = df_ords_combined.merge(df_prods, on = 'product_id', indicator = True)

In [17]:
# Check output
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_ordered,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,23,19,4.4,both
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,both
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both


In [18]:
# Check shape
df_merged.shape

(32404859, 14)

### 5. Confirm results with merge flag

In [19]:
df_merged['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

### 6. Exporting data as pickle

In [20]:
# Export to pkl
df_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))