# Exercise 4.6 Combining and Exporting Data (Part 2)

## This script contains the following:
1. Importing Libraries and Data Files
2. Converting Data Types for Optimal Performance
3. Merging Dataframes
4. Exporting Dataframes

# 1. Importing Libraries and Data Files

In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Store the absolute path of the main project folder as a raw string; the
# data files below are located relative to this folder.
# NOTE: this is a machine-specific Windows path, so update it before running
# the notebook on another computer.
path = r'C:\Users\Ryan\Documents\07-17-2023 Instacart Basket Analysis'

In [3]:
# Load the "orders_products_combined.pkl" pickle file from the
# '02 Data/Prepared Data' folder into ords_prods_merge. The file path is
# assembled with os.path.join from the project folder stored in `path`.
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl'))

In [4]:
# Preview the first rows of ords_prods_merge to confirm the import worked
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,196,1,0
1,2539329,1,1,2,8,,14084,2,0
2,2539329,1,1,2,8,,12427,3,0
3,2539329,1,1,2,8,,26088,4,0
4,2539329,1,1,2,8,,26405,5,0


#### Check the shape of the imported dataframe (it should be (32434489, 9)).

In [5]:
# Check the dimensions (rows, columns) of the ords_prods_merge dataframe
ords_prods_merge.shape

(32434489, 9)

The dimensions are the same as before, (32434489, 9), and there is no extra 'Unnamed: 0' column. Hurray!

In [6]:
# List the data type of every column in ords_prods_merge
ords_prods_merge.dtypes

order_id                   uint32
user_id                    uint32
order_number                uint8
orders_day_of_week          uint8
order_hour_of_day           uint8
days_since_prior_order    float32
product_id                 uint16
add_to_cart_order           uint8
reordered                   uint8
dtype: object

In [7]:
# Load the "products_checked.csv" file from the '02 Data/Prepared Data'
# folder into df_prods, building the file path with os.path.join.
# index_col = False tells pandas not to use any CSV column as the index, so
# an unnamed first column in the file (presumably an index written by an
# earlier export) is loaded as a regular 'Unnamed: 0' column.
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index_col = False)

In [8]:
# Preview the first rows of df_prods to confirm the import worked
df_prods.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3


In [9]:
# Remove the leftover 'Unnamed: 0' column from df_prods (axis = 1 selects
# columns) and keep the resulting dataframe under the same name
df_prods = df_prods.drop('Unnamed: 0', axis = 1)

In [10]:
# Check the dimensions (rows, columns) of df_prods after dropping the column
df_prods.shape

(49672, 5)

# 2. Converting Data Types for Optimal Performance

In [11]:
# List the data type of every column in df_prods before any conversion
df_prods.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

In [12]:
# Obtain summary statistics of the numeric columns in df_prods; the min and
# max of each column indicate whether a smaller data type can hold its values
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [13]:
# Downcast 'product_id' to a 16-bit unsigned integer (holds 0 to 65,535)
# to reduce the memory used by df_prods
df_prods['product_id'] = df_prods['product_id'].astype(np.uint16)

In [14]:
# Downcast 'aisle_id' to an 8-bit unsigned integer (holds 0 to 255)
# to reduce the memory used by df_prods
df_prods['aisle_id'] = df_prods['aisle_id'].astype(np.uint8)

In [15]:
# Downcast 'department_id' to an 8-bit unsigned integer (holds 0 to 255)
# to reduce the memory used by df_prods
df_prods['department_id'] = df_prods['department_id'].astype(np.uint8)

In [16]:
# Store 'prices' as a 32-bit float instead of the 64-bit float that
# read_csv produced, halving the memory used by this column
df_prods['prices'] = df_prods['prices'].astype(np.float32)

# 3. Merging Dataframes

#### Determine a suitable way to combine the orders_products_combined dataframe with your products data set. Make sure you’re using your wrangled, cleaned, and deduped products data set stored in your “Prepared Data” folder from the previous Exercise’s task.

Both the ords_prods_merge dataframe and the df_prods dataframe have a 'product_id' column, which we can use as the key to merge them. An 'inner' join (the default for merge) is suitable for this task: it keeps only the rows whose 'product_id' appears in both dataframes.

#### Confirm the results of the merge using the merge flag.

In [17]:
# Combine ords_prods_merge with df_prods on their shared 'product_id' key.
# how = 'inner' keeps only the rows whose product_id exists in both
# dataframes; indicator = True adds a '_merge' column recording the source
# of each row.
ords_prods_merge = pd.merge(
    ords_prods_merge,
    df_prods,
    on = 'product_id',
    how = 'inner',
    indicator = True,
)

In [18]:
# Preview the first rows of the merged dataframe to check that the product
# columns and the '_merge' flag were added
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


In [19]:
# Check the frequency of each value in the '_merge' column.
# NOTE: the merge above used how = 'inner', which keeps only matching keys,
# so 'both' is the only value that can appear here. To see whether any rows
# were dropped, compare the row count with the shape checked before the merge.
ords_prods_merge['_merge'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

In [20]:
# The '_merge' flag has been inspected and is no longer needed, so remove
# that column (axis = 1 selects columns) from ords_prods_merge
ords_prods_merge = ords_prods_merge.drop('_merge', axis = 1)

In [21]:
# Check the dimensions (rows, columns) of ords_prods_merge after the merge
# and after dropping the '_merge' column
ords_prods_merge.shape

(32404859, 13)

In [22]:
# List the data type of every column in ords_prods_merge to confirm the
# merged columns kept the reduced data types set on df_prods
ords_prods_merge.dtypes

order_id                   uint32
user_id                    uint32
order_number                uint8
orders_day_of_week          uint8
order_hour_of_day           uint8
days_since_prior_order    float32
product_id                 uint16
add_to_cart_order           uint8
reordered                   uint8
product_name               object
aisle_id                    uint8
department_id               uint8
prices                    float32
dtype: object

# 4. Exporting Dataframes

#### Export the newly created dataframe as orders_products_merged in a suitable format (taking into consideration the size).

In [23]:
# Export the ords_prods_merge dataframe as "orders_products_merged.pkl" in
# the '02 Data/Prepared Data' folder of the project
pd.to_pickle(
    ords_prods_merge,
    os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged.pkl'),
)