# 4.9 Intro to Data Visualization with Python PART 1

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import os

In [3]:
# Project root folder. Set the INSTACART_PATH environment variable to run this
# notebook on another machine; when it is unset, the original local folder is used.
path = os.environ.get(
    'INSTACART_PATH',
    r'/Users/rachelgrigiac/Documents/CareerFoundry/Instacart Basket Analysis',
)

In [4]:
# Load the customer records from the project's original-data folder
customers = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
)

In [5]:
# Preview the first five rows of the customer data as loaded
customers.head(n=5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
# (rows, columns) of the customer data as loaded
customers.shape

(206209, 10)

## Data Wrangling

The 'First Name', 'Surnam', and 'date_joined' columns are not needed for the analysis, so I am dropping them to make the analysis faster and easier.

In [None]:
# Drop the name and join-date columns, which the analysis does not use.
# 'Surnam' is spelled exactly as it appears in the CSV header.
# errors='ignore' keeps this cell safe to re-run: without it, a second run
# raises KeyError because the columns are already gone from `customers`.
customers = customers.drop(
    columns=['First Name', 'Surnam', 'date_joined'],
    errors='ignore',
)

In [9]:
# Confirm the dropped columns no longer appear in the first five rows
customers.head(n=5)

Unnamed: 0,user_id,Gender,STATE,Age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665
1,33890,Female,New Mexico,36,0,single,59285
2,65803,Male,Idaho,35,2,married,99568
3,125935,Female,Iowa,40,0,single,42049
4,130797,Female,Maryland,26,1,married,40374


In [10]:
# (rows, columns) after dropping the unused columns
customers.shape

(206209, 7)

In [13]:
# Summary statistics for the numeric columns; a 'count' equal to the row total
# in every column means none of them has missing values, and min/max give a
# quick range check. Text columns are not included in this output.
customers.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


The count is consistent across all columns, indicating no missing values in the numeric columns. 'user_id' ranges from 1 to the total number of rows, 'age' from 18 to 81, 'n_dependants' from 0 to 3, and 'income' from 25903 to 593901. These ranges all make sense. No outliers have been detected.

In [16]:
# Rename the capitalised columns so that every column name is lowercase.
# One non-inplace call replaces three inplace ones: the mapping is visible in
# a single place and the result is rebound explicitly to `customers`.
customers = customers.rename(
    columns={
        'Age': 'age',
        'Gender': 'gender',
        'STATE': 'state',
    }
)

In [17]:
# Confirm the renamed columns in the first five rows
customers.head(n=5)

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665
1,33890,Female,New Mexico,36,0,single,59285
2,65803,Male,Idaho,35,2,married,99568
3,125935,Female,Iowa,40,0,single,42049
4,130797,Female,Maryland,26,1,married,40374


In [23]:
# Check the data type of each remaining column
customers.dtypes

user_id          int64
gender          object
state           object
age              int64
n_dependants     int64
fam_status      object
income           int64
dtype: object

## Data Consistency Check

In [30]:
# Number of null entries in the 'gender' column
customers['gender'].isna().sum()

0

In [28]:
# Number of null entries in the 'state' column
customers['state'].isna().sum()

0

In [29]:
# Number of null entries in the 'fam_status' column
customers['fam_status'].isna().sum()

0

In [31]:
# Collect every row that is a full duplicate of an earlier row
df_dups = customers.loc[customers.duplicated()]

In [32]:
# Display the duplicated rows; an empty frame means there are none
df_dups

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income


No duplicates found.

## Combining Data

In [33]:
# Load the prepared orders/products dataframe.
# NOTE(review): read_pickle can execute arbitrary code from the file, so only
# load pickles produced by this project.
ords_prods_merge = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_grouped.pkl')
)

In [34]:
# List the columns and their data types in the loaded orders/products dataframe
ords_prods_merge.dtypes

order_id                     int64
user_id                      int64
user_order_number            int64
orders_day_of_week           int64
order_hour_of_day            int64
days_since_prior_order     float64
first_order                   bool
product_id                   int64
add_to_cart_order            int64
reordered                    int64
product_name                object
aisle_id                     int64
department_id                int64
prices                     float64
_merge                    category
price_range_loc             object
busiest_day                 object
busiest_days                object
busiest_period_of_day       object
max_order                    int64
loyalty_flag                object
average_price              float64
spending_flag               object
order_frequency            float64
frequency_flag              object
dtype: object

In [48]:
# Remove the '_merge' indicator left over from an earlier merge; pandas refuses
# to create a new indicator column while one with that name already exists.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
ords_prods_merge = ords_prods_merge.drop(columns=['_merge'], errors='ignore')

In [50]:
# (rows, columns) before merging; the row count is the baseline to compare against after the merge
ords_prods_merge.shape

(32404859, 24)

In [44]:
# Rows per user; the length of the result is the number of distinct users
ords_prods_merge.loc[:, 'user_id'].value_counts()

user_id
201268    3704
129928    3637
164055    3061
186704    2936
176478    2921
          ... 
188345       3
70320        3
203875       2
124615       2
91567        1
Name: count, Length: 206209, dtype: int64

There are 206209 unique values for 'user_id', which equals the number of rows in the customers dataframe.

The best way to combine the two dataframes is the merge() function, since we are going to join them on a common column ('user_id').

In [51]:
# Combine the dataframes: attach each customer's attributes to every order row.
# how='inner' is the pandas default, spelled out so the join type is explicit.
# validate='many_to_one' makes pandas raise MergeError if a user_id occurs more
# than once in `customers`, which would otherwise silently duplicate order rows.
# NOTE: with an inner join the '_merge' indicator can only ever be 'both';
# unmatched rows are dropped rather than flagged, so compare the row count
# before and after the merge to confirm that nothing was lost.
ords_prods_cust = ords_prods_merge.merge(
    customers,
    on='user_id',
    how='inner',
    validate='many_to_one',
    indicator=True,
)

In [52]:
# Inspect the first five merged rows to confirm the customer columns were appended
ords_prods_cust.head(n=5)

Unnamed: 0,order_id,user_id,user_order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,spending_flag,order_frequency,frequency_flag,gender,state,age,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both
2,473747,1,3,3,12,21.0,False,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both
4,431534,1,5,4,15,28.0,False,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both


In [53]:
# (rows, columns) after the merge; an unchanged row count means no order rows were lost or duplicated
ords_prods_cust.shape

(32404859, 31)

In [57]:
# Tally the merge indicator values
ords_prods_cust.loc[:, '_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [58]:
# Drop the '_merge' indicator column now that the merge has been checked.
# errors='ignore' keeps the cell safe to re-run after the column is already gone.
ords_prods_cust = ords_prods_cust.drop(columns=['_merge'], errors='ignore')

In [59]:
# Save the merged dataframe to the prepared-data folder as a pickle
ords_prods_cust.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_cust.pkl')
)