# 4.10: Coding Etiquette & Excel Reporting

## Contents

### 01. Importing libraries
### 02. Importing data
### 03. Quick data consistency check
### 04. Create a profiling variable based on age, income, certain goods in the “department_id” column, and number of dependents (Step 5a)

## 01. Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

## 02. Importing data

In [2]:
# Project root folder used to build every data path in this notebook.
# The default is the original local folder; set the INSTACART_PROJECT_PATH
# environment variable to run the notebook on another machine without editing this cell.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\Sulo\Desktop\Careerfoundry\11.09.2023 Instacart Basket Analysis',
)

In [3]:
# Build the location of the prepared pickle file, then load it into a data frame.
# Note: pickle files should only be read from trusted sources.
prepared_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_update_v2.pkl')
ords_prods_updated = pd.read_pickle(prepared_pickle_file)

## 03. Quick data consistency check

In [4]:
# Quick data consistency check: preview the first rows to confirm the data loaded as expected
ords_prods_updated.head()

Unnamed: 0,order_id,user_id,order_amount,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,order_frequency_flag,gender,state,age,date_joined,n_dependants,fam_status,income,region,activity_flag
0,2539329,1,1,2,8,7.0,196,1,0,Soda,...,Regular customer.,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Regular customer.,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Regular customer.,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Regular customer.,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Regular customer.,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer


In [5]:
# Follow-up check: number of rows and columns in the data frame
ords_prods_updated.shape

(30964564, 31)

In [6]:
# List the column names available in the data frame
ords_prods_updated.columns

Index(['order_id', 'user_id', 'order_amount', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_days',
       'busiest_period_of_day_loc', 'max_order', 'loyalty_flag', 'avg_order',
       'spending_flag', 'med_days', 'order_frequency_flag', 'gender', 'state',
       'age', 'date_joined', 'n_dependants', 'fam_status', 'income', 'region',
       'activity_flag'],
      dtype='object')

## 04. Create a profiling variable based on age, income, certain goods in the “department_id” column, and number of dependents (Step 5a)

### Age profile

In [7]:
# Summary statistics for 'age' (min, quartiles, max) to inform the age-group cut-offs used below
ords_prods_updated['age'].describe()

count    3.096456e+07
mean     4.946803e+01
std      1.848528e+01
min      1.800000e+01
25%      3.300000e+01
50%      4.900000e+01
75%      6.500000e+01
max      8.100000e+01
Name: age, dtype: float64

In [8]:
# Label customers aged 39 or younger as 'Young Adult'.
# The assignment creates the 'age_profile' column if it does not exist yet.
is_young_adult = ords_prods_updated['age'].le(39)
ords_prods_updated.loc[is_young_adult, 'age_profile'] = 'Young Adult'

In [9]:
# Label customers older than 39 and up to (and including) 59 as 'Middle-aged Adult' in 'age_profile'
customer_age = ords_prods_updated['age']
is_middle_aged_adult = customer_age.gt(39) & customer_age.le(59)
ords_prods_updated.loc[is_middle_aged_adult, 'age_profile'] = 'Middle-aged Adult'

In [10]:
# Label customers older than 59 as 'Old Adult' in 'age_profile'
is_old_adult = ords_prods_updated['age'].gt(59)
ords_prods_updated.loc[is_old_adult, 'age_profile'] = 'Old Adult'

In [11]:
# Check the result: count rows per age group; dropna = False also counts any rows left without a label (NaN)
ords_prods_updated['age_profile'].value_counts(dropna = False)

age_profile
Young Adult          10665110
Old Adult            10574504
Middle-aged Adult     9724950
Name: count, dtype: int64

In [12]:
# Preview the first rows to confirm the 'age_profile' column is present
ords_prods_updated.head()

Unnamed: 0,order_id,user_id,order_amount,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,gender,state,age,date_joined,n_dependants,fam_status,income,region,activity_flag,age_profile
0,2539329,1,1,2,8,7.0,196,1,0,Soda,...,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Female,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult


### Income profile

In [13]:
# Summary statistics for 'income' (min, quartiles, max) to inform the income-group cut-offs used below
ords_prods_updated['income'].describe()

count    3.096456e+07
mean     9.967587e+04
std      4.314187e+04
min      2.590300e+04
25%      6.729200e+04
50%      9.676500e+04
75%      1.281020e+05
max      5.939010e+05
Name: income, dtype: float64

In [14]:
# Label customers with an income of 50000 or less as 'Low-income'.
# The assignment creates the 'income_profile' column if it does not exist yet.
is_low_income = ords_prods_updated['income'].le(50000)
ords_prods_updated.loc[is_low_income, 'income_profile'] = 'Low-income'

In [15]:
# Label customers with an income above 50000 and up to (and including) 150000 as 'Middle-income' in 'income_profile'
customer_income = ords_prods_updated['income']
is_middle_income = customer_income.gt(50000) & customer_income.le(150000)
ords_prods_updated.loc[is_middle_income, 'income_profile'] = 'Middle-income'

In [16]:
# Label customers with an income above 150000 as 'High-income' in 'income_profile'
is_high_income = ords_prods_updated['income'].gt(150000)
ords_prods_updated.loc[is_high_income, 'income_profile'] = 'High-income'

In [17]:
# Check the result: count rows per income group; dropna = False also counts any rows left without a label (NaN)
ords_prods_updated['income_profile'].value_counts(dropna = False)

income_profile
Middle-income    23707476
High-income       3894534
Low-income        3362554
Name: count, dtype: int64

In [18]:
# Preview the first rows to confirm the 'income_profile' column is present
ords_prods_updated.head()

Unnamed: 0,order_id,user_id,order_amount,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,state,age,date_joined,n_dependants,fam_status,income,region,activity_flag,age_profile,income_profile
0,2539329,1,1,2,8,7.0,196,1,0,Soda,...,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult,Low-income
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult,Low-income
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult,Low-income
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult,Low-income
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Alabama,31,2/17/2019,3,married,40423,South,High-activity customer,Young Adult,Low-income


In [None]:
# Write the data frame to a new pickle file in the 'Prepared Data' folder
export_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_updated_v3.pkl')
ords_prods_updated.to_pickle(export_pickle_file)