# 4.9 Task 1.ipynb — TOC
	1.	Setup (imports + paths)
	2.	Task overview
	3.	Data wrangling
	4.	Data consistency checks
	5.	Combine customer data with prepared data
	6.	Pre-export validation
	7.	Export new dataframe

# Imports

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Import datasets
# The project root defaults to the original local folder but can be overridden
# with the INSTACART_PROJECT_DIR environment variable, so the notebook can run
# on a machine where the data lives somewhere else.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'/Users/spencer/Documents/Career Foundry/Data Immersion/4 Python Fundamentals for Data Analysts/Instacart Basket Analysis',
)
cust = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by this project's own notebooks.
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_aggregated.pkl'))

# Task


In [3]:
# View data sample: preview the first rows to see the raw column names and value formats
cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


## Data Wrangling

In [4]:
# Rename columns for clarity and consistency
# (lower-case snake_case names; the misspelled 'Surnam' header becomes 'surname').
# The result is assigned back instead of using inplace=True, which keeps the
# cell safe to re-run and the data lineage explicit.
cust = cust.rename(columns={
    'First Name': 'first_name',
    'Surnam': 'surname',
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
    'n_dependants': 'number_of_dependents',
    'fam_status': 'family_status',
})

In [5]:
# Confirm the renamed column headers
cust.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,number_of_dependents,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
# Record the (rows, columns) shape as a baseline for the checks that follow
cust.shape

(206209, 10)

## Data Consistency Checks

In [7]:
# Check for mixed data types
# A column is mixed when its values are not all of the same Python type
# (for example str values alongside float NaN placeholders).
for col in cust.columns:
    # Counting the distinct value types avoids building a filtered copy of the
    # whole frame just to measure its length, and also works on an empty frame.
    if cust[col].map(type).nunique() > 1:
        print(col)

first_name


In [8]:
# Check for missing values: count the nulls in each column
cust.isnull().sum()

user_id                     0
first_name              11259
surname                     0
gender                      0
state                       0
age                         0
date_joined                 0
number_of_dependents        0
family_status               0
income                      0
dtype: int64

In [9]:
# Verify the data type before the change
# (first_name is the column reported by the mixed-type check)
print(cust['first_name'].dtype)

object


In [10]:
# Impute missing values with 'Unknown'
# Missing first names get an explicit placeholder, so no customer rows are dropped.
cust['first_name'] = cust['first_name'].fillna('Unknown')

In [11]:
# Check for mixed data types again
# Nothing is printed when every column holds values of a single Python type.
for col in cust.columns:
    # Counting the distinct value types avoids building a filtered copy of the
    # whole frame just to measure its length, and also works on an empty frame.
    if cust[col].map(type).nunique() > 1:
        print(col)

In [12]:
# Re-check for missing values after the imputation: count the nulls in each column
cust.isnull().sum()

user_id                 0
first_name              0
surname                 0
gender                  0
state                   0
age                     0
date_joined             0
number_of_dependents    0
family_status           0
income                  0
dtype: int64

In [13]:
# Check for duplicates
# duplicated() compares entire rows, so this counts fully identical customer records only.
cust.duplicated().sum()

np.int64(0)

In [14]:
# Confirm the consistency checks left the row and column counts unchanged
cust.shape

(206209, 10)

## Combining customer data with prepared data

In [15]:
# Check customer dataframe before merging
cust.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,number_of_dependents,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [16]:
# Shape of the customer dataframe going into the merge
cust.shape

(206209, 10)

In [17]:
# Check orders_products dataframe (the prepared data the customer columns will be joined onto)
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,user_avg_item_price,spending_habit,median_days_between_orders,order_frequency_flag
0,2539329,1,1,2,8,,True,196,1,0,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2539329,1,1,2,8,,True,14084,2,0,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,2539329,1,1,2,8,,True,12427,3,0,...,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2539329,1,1,2,8,,True,26088,4,0,...,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,2539329,1,1,2,8,,True,26405,5,0,...,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [18]:
# Shape of the orders_products dataframe going into the merge
ords_prods_merge.shape

(32434489, 24)

In [19]:
# Check the data type of user_id in both dataframes
# (printed in the order: orders/products frame, then customer frame)
for frame in (ords_prods_merge, cust):
    print(frame['user_id'].dtype)

int64
int64


In [20]:
# Merge the dataframes (inner join)
# how='inner' is the pandas default; it is spelled out here to match the intent.
# validate='many_to_one' makes pandas raise a MergeError if any user_id occurs
# more than once in cust, which would otherwise silently duplicate order rows.
# NOTE: on an inner join the indicator column can only ever read 'both';
# rows without a match are dropped rather than labelled.
ords_prods_cust_merge = ords_prods_merge.merge(
    cust,
    on='user_id',
    how='inner',
    validate='many_to_one',
    indicator=True,
)

In [21]:
# Tally the merge indicator values.
# Because the merge is an inner join, every surviving row is labelled 'both';
# rows without a match on either side are dropped, so they cannot show up here.
ords_prods_cust_merge['_merge'].value_counts()

_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64

In [22]:
# Drop the _merge column
# Assigning the result back (instead of inplace=True) together with
# errors='ignore' makes this cell safe to re-run once the column is already gone.
ords_prods_cust_merge = ords_prods_cust_merge.drop(columns=['_merge'], errors='ignore')

In [23]:
# Check merged dataframe: the customer columns should now follow the order/product columns
ords_prods_cust_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,order_frequency_flag,first_name,surname,gender,state,age,date_joined,number_of_dependents,family_status,income
0,2539329,1,1,2,8,,True,196,1,0,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2539329,1,1,2,8,,True,14084,2,0,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,2539329,1,1,2,8,,True,12427,3,0,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2539329,1,1,2,8,,True,26088,4,0,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,2539329,1,1,2,8,,True,26405,5,0,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


## Pre-Export Validation

In [24]:
# Confirm the merged dataframe's row and column counts before export
ords_prods_cust_merge.shape

(32434489, 33)

In [25]:
# List every column in the merged dataframe
ords_prods_cust_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'price_label', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'user_avg_item_price', 'spending_habit', 'median_days_between_orders',
       'order_frequency_flag', 'first_name', 'surname', 'gender', 'state',
       'age', 'date_joined', 'number_of_dependents', 'family_status',
       'income'],
      dtype='object')

In [26]:
# Check for mixed data types in the merged dataframe
# A column is mixed when its values are not all of the same Python type
# (for example str values alongside float NaN placeholders).
for col in ords_prods_cust_merge.columns:
    # Counting the distinct value types avoids building a filtered copy of this
    # very large frame just to measure its length.
    if ords_prods_cust_merge[col].map(type).nunique() > 1:
        print(col)

product_name
price_label


In [27]:
# Check for missing values in the merged dataframe: count the nulls in each column
ords_prods_cust_merge.isnull().sum()

order_id                            0
user_id                             0
order_number                        0
orders_day_of_week                  0
order_hour_of_day                   0
days_since_prior_order        2078068
first_order                         0
product_id                          0
add_to_cart_order                   0
reordered                           0
product_name                    30200
aisle_id                        30200
department_id                   30200
prices                          35327
price_label                     30200
busiest_day                         0
busiest_days                        0
busiest_period_of_day               0
max_order                           0
loyalty_flag                        0
user_avg_item_price                 0
spending_habit                      0
median_days_between_orders          0
order_frequency_flag                0
first_name                          0
surname                             0
gender      

In [28]:
# Fill missing Product Names with 'Unknown'
# The rows are kept; only the null product_name entries receive the placeholder text.
ords_prods_cust_merge['product_name'] = ords_prods_cust_merge['product_name'].fillna('Unknown')

In [29]:
# Fill missing Price Labels with 'Not assigned'
# The rows are kept; only the null price_label entries receive the placeholder text.
ords_prods_cust_merge['price_label'] = ords_prods_cust_merge['price_label'].fillna('Not assigned')

In [30]:
# Re-check only the two columns that were just filled (both should report 0 nulls)
ords_prods_cust_merge[['product_name', 'price_label']].isnull().sum()

product_name    0
price_label     0
dtype: int64

In [31]:
# Re-check for mixed data types after filling the text columns
# Nothing is printed when every column holds values of a single Python type.
for col in ords_prods_cust_merge.columns:
    # Counting the distinct value types avoids building a filtered copy of this
    # very large frame just to measure its length.
    if ords_prods_cust_merge[col].map(type).nunique() > 1:
        print(col)

In [32]:
# Shape: confirm the fills did not change the row or column count
ords_prods_cust_merge.shape

(32434489, 33)

In [33]:
# Check for missing values one last time: count the nulls remaining in each column
ords_prods_cust_merge.isnull().sum()

order_id                            0
user_id                             0
order_number                        0
orders_day_of_week                  0
order_hour_of_day                   0
days_since_prior_order        2078068
first_order                         0
product_id                          0
add_to_cart_order                   0
reordered                           0
product_name                        0
aisle_id                        30200
department_id                   30200
prices                          35327
price_label                         0
busiest_day                         0
busiest_days                        0
busiest_period_of_day               0
max_order                           0
loyalty_flag                        0
user_avg_item_price                 0
spending_habit                      0
median_days_between_orders          0
order_frequency_flag                0
first_name                          0
surname                             0
gender      

### NOTES:
- days_since_prior_order (2,078,068 NaNs): Normal. Impossible to have a prior order on your first order
- prices (35,327 NaNs):
  - 30,200 rows where the product_id did not match the products table
  - 5,127 rows where outliers (prices > $100) were intentionally omitted
- aisle_id & department_id (30,200 NaNs): product_id was not found

## Export new dataframe

In [34]:
# Export the merged dataframe
# Written as a pickle into the project's 'Prepared Data' folder.
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all.pkl')
ords_prods_cust_merge.to_pickle(export_file)