# Part 1 - Data Visualization

## This script includes the following points:

1. Importing Data
2. Wrangle Data
3. Data Quality and Consistency Check
4. Combine Dataframes
5. Export dataframe

# 01. Importing Data

In [79]:
# Importing Libraries

import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [80]:
# Project root folder; the data files read and written in this notebook are
# located relative to it with os.path.join.
# NOTE(review): this is an absolute, machine-specific path - adjust it before
# running the notebook on another computer.

path = r'/Users/robson/Desktop/CareerFoundry/Data Immersion/Achivement 4/19-04-2024 Instacart Basket Analysis'

In [81]:
# Load the raw customer table from '02 Data/original_data/customers.csv'

df_customers = pd.read_csv(os.path.join(path, '02 Data', 'original_data', 'customers.csv'))

In [82]:
# Load the previously prepared orders + products dataframe from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.

df_ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'prepared_data', 'order_prods_merge_aggregated.pkl'))

# 02. Wrangle Data

In [83]:
# Preview the first rows to inspect the raw column names and sample values

df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [84]:
# Count how often each 'date_joined' value occurs, to inspect which dates are
# present and how they are written

df_customers['date_joined'].value_counts()

date_joined
9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
             ... 
9/1/2018      141
1/22/2018     140
11/24/2017    139
7/18/2019     138
8/6/2018      128
Name: count, Length: 1187, dtype: int64

In [85]:
# Standardise the customer column names to lowercase snake_case.
# ('Surnam' is how the column is spelled in the raw file.)

column_renames = {
    'First Name': 'first_name',
    'Surnam': 'surname',
    'Gender': 'gender',
    'STATE': 'us_state',
    'Age': 'age',
    'fam_status': 'marital_status',
}
df_customers.rename(columns=column_renames, inplace=True)

In [86]:
# Preview the first rows again to confirm the columns carry their new names

df_customers.head()

Unnamed: 0,user_id,first_name,surname,gender,us_state,age,date_joined,n_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [87]:
# Inspect each column's dtype and non-null count to decide which columns
# need a type conversion

df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   user_id         206209 non-null  int64 
 1   first_name      194950 non-null  object
 2   surname         206209 non-null  object
 3   gender          206209 non-null  object
 4   us_state        206209 non-null  object
 5   age             206209 non-null  int64 
 6   date_joined     206209 non-null  object
 7   n_dependants    206209 non-null  int64 
 8   marital_status  206209 non-null  object
 9   income          206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [88]:
# Cast user_id to string (object dtype): it is an identifier, not a quantity,
# so it has no numerical meaning

df_customers['user_id'] = df_customers['user_id'].astype('str')

In [89]:
# Convert date_joined from text to datetime.
# The raw values are month/day/year strings (e.g. '9/17/2018', '12/19/2017'),
# so the format is stated explicitly instead of being inferred: this removes
# any day/month ambiguity and raises an error if a value does not match.

df_customers['date_joined'] = pd.to_datetime(df_customers['date_joined'], format='%m/%d/%Y')

In [90]:
# Check that the conversions were successful by displaying the dtypes of the
# two converted columns

df_customers[['user_id','date_joined']].dtypes

user_id                object
date_joined    datetime64[ns]
dtype: object

#### I renamed some columns to clearer, more consistent names so that they do not confuse anyone who reads the data. I also kept all the columns, because if one of them became necessary in the future I would otherwise need to start all over again. Finally, since user_id has no numerical meaning, I changed it to a string (object) data type, and I converted date_joined to datetime format. 

# 03. Data Quality and Consistency Check 

In [91]:
# Show rows that are exact duplicates of an earlier row (all columns equal).
# NOTE(review): this compares entire rows; a repeated user_id whose other
# fields differ would not be flagged here.

df_customers[df_customers.duplicated()]

Unnamed: 0,user_id,first_name,surname,gender,us_state,age,date_joined,n_dependants,marital_status,income


##### No duplicates were found

In [92]:
# Count the missing (null) values in each column

df_customers.isnull().sum()

user_id               0
first_name        11259
surname               0
gender                0
us_state              0
age                   0
date_joined           0
n_dependants          0
marital_status        0
income                0
dtype: int64

In [93]:
# Display the rows whose first_name is missing.
# isnull() already returns a boolean mask, so it is used directly as the filter.

df_customers[df_customers['first_name'].isnull()]

Unnamed: 0,user_id,first_name,surname,gender,us_state,age,date_joined,n_dependants,marital_status,income
53,76659,,Gilbert,Male,Colorado,26,2017-01-01,2,married,41709
73,13738,,Frost,Female,Louisiana,39,2017-01-01,0,single,82518
82,89996,,Dawson,Female,Oregon,52,2017-01-01,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,2017-01-01,1,married,155673
105,29778,,Dawson,Female,Utah,63,2017-01-01,3,married,151819
...,...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,2020-03-31,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,2020-04-01,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,2020-04-01,1,married,45275
206162,187532,,Floyd,Female,California,39,2020-04-01,0,single,56325


#### I decided to keep all the NaN values 

In [94]:
# Print the name of every column that holds values of more than one Python type.
# Each value is mapped to its type; a column is mixed when that mapping
# contains more than one distinct type. Working on the column as a Series
# avoids building a filtered copy of the whole dataframe for every column,
# and an empty dataframe simply prints nothing.

for col in df_customers.columns:
    if df_customers[col].map(type).nunique() > 1:
        print(col)

first_name


##### The only mixed-type column is 'first_name', which means the missing values are the cause: NaN is stored as a float, while the names are strings. So I decided not to change it; otherwise each NaN would turn into the string 'nan'. 

##### But, if it was necessary to change it, I'd use the following code:

##### df_customers['first_name'] = df_customers['first_name'].astype('str')

# 04. Combine Dataframes

In [95]:
# List the customer columns and dtypes to look for a column shared with the
# orders/products dataframe

df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         206209 non-null  object        
 1   first_name      194950 non-null  object        
 2   surname         206209 non-null  object        
 3   gender          206209 non-null  object        
 4   us_state        206209 non-null  object        
 5   age             206209 non-null  int64         
 6   date_joined     206209 non-null  datetime64[ns]
 7   n_dependants    206209 non-null  int64         
 8   marital_status  206209 non-null  object        
 9   income          206209 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 15.7+ MB


In [96]:
# Preview the first customer rows

df_customers.head()

Unnamed: 0,user_id,first_name,surname,gender,us_state,age,date_joined,n_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,2017-01-01,1,married,40374


In [97]:
# List the orders/products columns and dtypes to look for a column shared
# with the customer dataframe

df_ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 24 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 int64   
 2   order_number            int64   
 3   order_day_of_week       int64   
 4   order_hour_of_day       int64   
 5   days_since_prior_order  float64 
 6   first_order             bool    
 7   product_id              int64   
 8   add_to_cart_order       int64   
 9   reordered               int64   
 10  product_name            object  
 11  aisle_id                int64   
 12  department_id           int64   
 13  prices                  float64 
 14  _merge                  category
 15  price_range_loc         object  
 16  busiest_days            object  
 17  busiest_period_of_day   object  
 18  max_order               int64   
 19  loyalty_flag            object  
 20  avg_order               float64 
 21  spendi

In [98]:
# Preview the first orders/products rows

df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,_merge,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_order,spending_flag,median_prior_order,order_frequency_flag
0,2539329,1,1,2,8,,True,196,1,0,...,both,Mid-range product,Regularly Busy,Average orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent Customer
1,2398795,1,2,3,7,15.0,False,196,1,1,...,both,Mid-range product,Slowest Days,Average orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent Customer
2,473747,1,3,3,12,21.0,False,196,1,1,...,both,Mid-range product,Slowest Days,Most orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent Customer
3,2254736,1,4,4,7,29.0,False,196,1,1,...,both,Mid-range product,Slowest Days,Average orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent Customer
4,431534,1,5,4,15,28.0,False,196,1,1,...,both,Mid-range product,Slowest Days,Most orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent Customer


In [99]:
# Cast the orders 'user_id' column from int64 to string (object dtype).
# NOTE(review): presumably done so the key has the same dtype in both
# dataframes before merging - confirm df_customers['user_id'] is also string.

df_ords_prods_merge['user_id'] = df_ords_prods_merge['user_id'].astype('str')

In [100]:
# Check that 'user_id' is now stored with object dtype (shown as dtype('O'))

df_ords_prods_merge['user_id'].dtype

dtype('O')

#### Both dataframes will be joined on the column 'user_id'.

In [101]:
# merge(..., indicator=True) raises an error when a '_merge' column already
# exists, so the indicator left over from the earlier merge is dropped first.
# errors='ignore' keeps this cell safe to re-run after the column is gone.

df_ords_prods_merge.drop(columns='_merge', inplace=True, errors='ignore')

In [102]:
# Join the customer attributes onto every order/product row using 'user_id'.
# how='inner' is the pandas default, written out here for clarity: only rows
# whose user_id exists in both dataframes are kept, so the '_merge' indicator
# column added by indicator=True can only contain 'both'.

df_merged = df_ords_prods_merge.merge(
    df_customers,
    how='inner',
    on='user_id',
    indicator=True,
)

In [103]:
# check if all the columns were added 

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 33 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   order_id                int64         
 1   user_id                 object        
 2   order_number            int64         
 3   order_day_of_week       int64         
 4   order_hour_of_day       int64         
 5   days_since_prior_order  float64       
 6   first_order             bool          
 7   product_id              int64         
 8   add_to_cart_order       int64         
 9   reordered               int64         
 10  product_name            object        
 11  aisle_id                int64         
 12  department_id           int64         
 13  prices                  float64       
 14  price_range_loc         object        
 15  busiest_days            object        
 16  busiest_period_of_day   object        
 17  max_order               int64         
 18  

In [104]:
# Count the values of the '_merge' indicator column.
# NOTE(review): the merge above uses the default inner join, which keeps only
# matched rows, so every row is tagged 'both' by construction. To verify that
# no order rows were lost, also compare the row count of df_merged with the
# row count of df_ords_prods_merge.

df_merged['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

# 05. Export Dataframe

In [105]:
# Save the merged dataframe as a pickle file in '02 Data/prepared_data'

df_merged.to_pickle(os.path.join(path, '02 Data', 'prepared_data', 'ords_prods_customer_merge.pkl'))