## Table of Contents:
#### 01. Importing the libraries
#### 02. Importing the dataset
#### 03. Wrangling the dataframe
#### 04. Checking data quality and consistency
#### 05. Combining the dataframes
#### 06. Exporting the dataframe

## 01. Importing the libraries

In [1]:
# importing the libraries

import pandas as pd
import numpy as np
import os

## 02. Importing the dataset

In [2]:
# defining the path

path=r'/Users/sanju/Documents/Jul 2023 Instacart Basket Analysis'

In [3]:
# importing the 'customers.csv' datset

df=pd.read_csv(os.path.join(path,'02 Data','Original Data','customers.csv'))

In [4]:
# checking the import

df.head(10)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
5,133128,Cynthia,Noble,Female,Kentucky,43,1/1/2017,2,married,49643
6,152052,Chris,Walton,Male,Montana,20,1/1/2017,0,single,61746
7,168851,Joseph,Hickman,Male,South Carolina,30,1/1/2017,0,single,63712
8,69965,Jeremy,Vang,Male,Texas,47,1/1/2017,1,married,162432
9,82820,Shawn,Chung,Male,Virginia,26,1/1/2017,2,married,32072


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [6]:
df.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [7]:
# importing the 'orders_products_aggregated.pkl' dataset

df_agg=pd.read_pickle(os.path.join(path,'02 Data','Prepared Data','orders_products_aggregated.pkl'))

In [8]:
# checking the import

df_agg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 24 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int32   
 1   user_id                 int32   
 2   order_number            int8    
 3   orders_day_of_week      int8    
 4   order_hour_of_day       int8    
 5   days_since_prior_order  float16 
 6   product_id              int32   
 7   add_to_cart_order       int32   
 8   reordered               int8    
 9   _merge                  category
 10  product_name            object  
 11  aisle_id                int16   
 12  department_id           int16   
 13  prices                  float32 
 14  price_range_loc         object  
 15  busiest_day             object  
 16  busiest_days            object  
 17  busiest_period_of_day   object  
 18  max_order               int8    
 19  loyalty_flag            object  
 20  avg_price               float32 
 21  spendi

## 03. Wrangling the dataframe

In [9]:
# renaming the 'STATE' column

df.rename(columns={'STATE':'state'},inplace=True)

In [26]:
# renaming the 'surnam' column

df.rename(columns={'Surnam':'surname'},inplace=True)

In [11]:
# renaming the 'n_dependants' column

df.rename(columns={'n_dependants':'no_of_dependants'},inplace=True)

In [12]:
# checking the renamed columns

df.columns

Index(['user_id', 'First Name', 'Surnam', 'Gender', 'state', 'Age',
       'date_joined', 'no_of_dependants', 'fam_status', 'income'],
      dtype='object')

## 04. Checking data quality and consistency

In [13]:
# checking for any missing values in the dataframe

df.isnull().sum()

user_id                 0
First Name          11259
Surnam                  0
Gender                  0
state                   0
Age                     0
date_joined             0
no_of_dependants        0
fam_status              0
income                  0
dtype: int64

#### since the missing values are from the 'First Name' column, so I am keeping it unchanged as it will not interfere in the analysis and we also have their user_id and surnames

In [14]:
# checking for any duplicates

df_dups=df[df.duplicated()]

In [15]:
df_dups

Unnamed: 0,user_id,First Name,Surnam,Gender,state,Age,date_joined,no_of_dependants,fam_status,income


In [16]:
# checking for mixed data types

for col in df.columns.tolist():
    weird=(df[[col]].applymap(type)!=df[[col]].iloc[0].apply(type)).any(axis=1)
    if len(df[weird])>0:
        print(col)
                                                                    

First Name


In [17]:
# changing the data type of the 'First Name' column

df['First Name']=df['First Name'].astype('str')

In [18]:
# checking the result

for col in df.columns.tolist():
    weird=(df[[col]].applymap(type)!=df[[col]].iloc[0].apply(type)).any(axis=1)
    if len(df[weird])>0:
        print(col)

## 05. Combining the dataframes

In [19]:
# changing the datatypes of the customers 'df' dataframe

df['user_id']=df['user_id'].astype('int32')
df['Age']=df['Age'].astype('int8')
df['no_of_dependants']=df['no_of_dependants'].astype('int8')
df['income']=df['income'].astype('int32')

In [20]:
# checking the result

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           206209 non-null  int32 
 1   First Name        206209 non-null  object
 2   Surnam            206209 non-null  object
 3   Gender            206209 non-null  object
 4   state             206209 non-null  object
 5   Age               206209 non-null  int8  
 6   date_joined       206209 non-null  object
 7   no_of_dependants  206209 non-null  int8  
 8   fam_status        206209 non-null  object
 9   income            206209 non-null  int32 
dtypes: int32(2), int8(2), object(6)
memory usage: 11.4+ MB


In [27]:
# combining the customers dataframe with the previously prepared Instacart dataframe

df_merged=df.merge(df_agg,on='user_id')

In [28]:
# checking the merge

df_merged.head()

Unnamed: 0,user_id,First Name,surname,Gender,state,Age,date_joined,no_of_dependants,fam_status,income,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_order_days,order_frequency_flag
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Regularly Busy,Busiest Day,Most orders,8,New Customer,7.988889,Low spender,19.0,Regular customer
1,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Regularly Busy,Regularly Busy,Most orders,8,New Customer,7.988889,Low spender,19.0,Regular customer
2,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Regularly Busy,Busiest Day,Most orders,8,New Customer,7.988889,Low spender,19.0,Regular customer
3,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Regularly Busy,Regularly Busy,Most orders,8,New Customer,7.988889,Low spender,19.0,Regular customer
4,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Regularly Busy,Least Busy,Most orders,8,New Customer,7.988889,Low spender,19.0,Regular customer


In [23]:
df_merged.describe()

Unnamed: 0,user_id,Age,no_of_dependants,income,order_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,aisle_id,department_id,prices,max_order,avg_price,median_order_days
count,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,30328763.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404854.0
mean,102937.2,49.46527,1.501896,99437.73,1710745.0,17.1423,2.738867,13.42515,,25598.66,8.352547,0.5895873,71.19612,9.919792,11.98023,33.05217,11.98022,
std,59466.1,18.48558,1.118865,43057.27,987298.8,17.53532,2.090077,4.24638,0.0,14084.0,7.127071,0.4919087,38.21139,6.281485,495.6436,25.15525,83.23649,0.0
min,1.0,18.0,0.0,25903.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,51422.0,33.0,1.0,67004.0,855947.0,5.0,1.0,10.0,5.0,13544.0,3.0,0.0,31.0,4.0,4.2,13.0,7.387299,6.0
50%,102616.0,49.0,2.0,96618.0,1711049.0,11.0,3.0,13.0,8.0,25302.0,6.0,1.0,83.0,9.0,7.4,26.0,7.824786,8.0
75%,154389.0,65.0,3.0,127912.0,2565499.0,24.0,5.0,16.0,15.0,37947.0,11.0,1.0,107.0,16.0,11.3,47.0,8.254023,13.0
max,206209.0,81.0,3.0,593901.0,3421083.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0,134.0,21.0,99999.0,99.0,25005.43,30.0


## 06. Exporting the dataframe

In [29]:
df_merged.to_pickle(os.path.join(path,'02 Data','Prepared Data','ords_prods_cust_merged.pkl'))