# Clean & Reduce Memory Usage

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

## Importing Data

In [3]:
# project root folder (absolute path on the author's machine)
path = r'C:\Users\steve\Documents\11.24 Instacart Basket Analysis'

# Load the prepared ords_prods_cust dataframe from its pickle file.
# NOTE: unpickling can execute arbitrary code - only load pickles from a trusted source.
ords_prods_cust_file = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_cust.pkl')
ords_prods_cust = pd.read_pickle(ords_prods_cust_file)

## Analysis

#### Cleaning

01. Clean Dataset

In [7]:
# Show up to 40 columns when a dataframe is displayed, so that all columns of
# the dataframe can be seen. The option name is written out in full
# ('display.max_columns'): abbreviated names are only resolved by pattern
# matching and can break if a similarly named option is added to pandas.
pd.set_option('display.max_columns', 40)

In [9]:
# preview the first five rows of the loaded dataframe
ords_prods_cust.head(n = 5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,price_range_loc,busiet_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_spent,spending_flag,med_sinceprior,med_since_prior,frequency_label,first name,last name,gender,state,age,date_joined,dependents,fam_status,income
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both,Mid-range product,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,23,19,4.4,both,Low-range product,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,both,Low-range product,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both,Low-range product,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


Renaming Columns

In [18]:
# Rename columns in one pass (old name -> new name):
#   orders_day_of_week    -> order_day_of_week           (fixes the "orders" typo)
#   busiet_day            -> day_busyness                (clearer name)
#   busiest_period_of_day -> hour_of_day_busyness        (clearer name)
#   med_since_prior       -> med_days_since_prior_order  (clearer name)
column_renames = {
    'orders_day_of_week': 'order_day_of_week',
    'busiet_day': 'day_busyness',
    'busiest_period_of_day': 'hour_of_day_busyness',
    'med_since_prior': 'med_days_since_prior_order',
}
# inplace = True modifies ords_prods_cust itself rather than returning a copy
ords_prods_cust.rename(columns = column_renames, inplace = True)

Deleting Columns

In [21]:
# Columns that are unnecessary or that duplicate another column
unneeded_columns = ['_merge', 'price_range', 'med_sinceprior']
# drop() returns a new dataframe; ords_prods_cust itself is left unchanged
# NOTE(review): the recorded preview of opc_clean still lists 'price_range'
# (not 'price_range_loc') and 'busiest_days' - confirm the intended column set.
opc_clean = ords_prods_cust.drop(unneeded_columns, axis = 1)

In [22]:
# preview the first row to check the renamed / remaining columns
opc_clean.head(n = 1)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,day_busyness,busiest_days,hour_of_day_busyness,max_order,loyalty_flag,avg_spent,spending_flag,med_days_since_prior_order,frequency_label,first name,last name,gender,state,age,date_joined,dependents,fam_status,income
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


02. Remove PII

In [24]:
# [first name] and [last name] are personally identifiable information (PII),
# so both columns are removed; the result is stored in a new dataframe
pii_columns = ['first name', 'last name']
opc_nopii = opc_clean.drop(pii_columns, axis = 1)

In [25]:
# preview the first row to confirm the PII columns are gone
opc_nopii.head(n = 1)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,day_busyness,busiest_days,hour_of_day_busyness,max_order,loyalty_flag,avg_spent,spending_flag,med_days_since_prior_order,frequency_label,gender,state,age,date_joined,dependents,fam_status,income
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423


03. Reduce Memory Usage

In [33]:
# Downcast the order-level columns to smaller dtypes to reduce memory usage.
# astype() does not check the value range when narrowing integers, so an
# out-of-range value would be silently corrupted. Each integer column is
# therefore range-checked against its target dtype before it is converted.
int_downcasts = {
    'order_id': 'int32',
    'user_id': 'int32',
    'order_number': 'int8',
    'order_day_of_week': 'int8',
    'order_hour_of_day': 'int8',
}
for column, dtype in int_downcasts.items():
    limits = np.iinfo(dtype)
    col_min, col_max = opc_nopii[column].min(), opc_nopii[column].max()
    if col_min < limits.min or col_max > limits.max:
        raise ValueError(
            f'{column} has values in [{col_min}, {col_max}], '
            f'which do not fit in {dtype}'
        )
    # convert one column at a time so only a single column is copied at once
    opc_nopii[column] = opc_nopii[column].astype(dtype)

# days_since_prior_order has missing values (NaN), so it must stay a float dtype.
# NOTE(review): float16 support in pandas is limited - consider float32 if
# downstream operations on this column fail.
opc_nopii['days_since_prior_order'] = opc_nopii['days_since_prior_order'].astype('float16')

## Export Data

In [37]:
# save the cleaned, PII-free, downcast dataframe alongside the other prepared data
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'opc_checked.pkl')
opc_nopii.to_pickle(export_file)