In [1]:
# import libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Import dataframe with customer names scrubbed, unnecessary columns removed, and regions in place of states
df_all_scrub = pd.read_pickle('/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Prepared Data/OPC_ALL_scrub.pkl')

In [3]:
df_all_scrub.shape

(32404859, 23)

In [4]:
df_all_scrub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 23 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   product_id             int64  
 1   department_id          int64  
 2   prices                 float64
 3   user_id                int64  
 4   order_number           int64  
 5   order_day_of_week      int64  
 6   order_hour_of_day      int64  
 7   days_since_last_order  float64
 8   reordered              int64  
 9   price_range            object 
 10  busiest days           object 
 11  busiest day            object 
 12  busiest_period_of_day  object 
 13  loyalty_flag           object 
 14  spending_flag          object 
 15  freq_flag              object 
 16  Gender                 object 
 17  state                  object 
 18  Age                    int64  
 19  num_dependents         int64  
 20  fam_status             object 
 21  income                 int64  
 22  Region          

In [5]:
# Changing data type for several columns to save memory
columns_to_convert = ['Region', 'fam_status', 'Gender', 'freq_flag', 'spending_flag', 'loyalty_flag', 'busiest day', 'price_range']

for column in columns_to_convert:
  df_all_scrub[column] = df_all_scrub[column].astype('category')

In [6]:
df_all_scrub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 23 columns):
 #   Column                 Dtype   
---  ------                 -----   
 0   product_id             int64   
 1   department_id          int64   
 2   prices                 float64 
 3   user_id                int64   
 4   order_number           int64   
 5   order_day_of_week      int64   
 6   order_hour_of_day      int64   
 7   days_since_last_order  float64 
 8   reordered              int64   
 9   price_range            category
 10  busiest days           object  
 11  busiest day            category
 12  busiest_period_of_day  object  
 13  loyalty_flag           category
 14  spending_flag          category
 15  freq_flag              category
 16  Gender                 category
 17  state                  object  
 18  Age                    int64   
 19  num_dependents         int64   
 20  fam_status             category
 21  income                 int64 

## Exclusion Flag for Low-Activity Customers

In [8]:
## calculate number of orders for each customer
orders_per_customer = df_all_scrub.groupby('user_id')['order_number'].count()

In [9]:
## Filter to keep only customers with 5 or more orders
frequent_customers = orders_per_customer[orders_per_customer >= 6].index

In [10]:
## New dataframe excluding customers with 5 or fewer orders

In [11]:
df_all_top = df_all_scrub[df_all_scrub['user_id'].isin(frequent_customers)]

In [12]:
df_all_top.shape

(32398590, 23)

In [13]:
df_all_scrub.shape

(32404859, 23)

6,269 records were removed as a result of the low-activity exclusion flag.

## Customer Profiles

In [16]:
df_all_top = df_all_top.rename(columns={'Age': 'age',})

In [17]:
## Creates two different profiles, one based on age and income, the other based on age and dependents

def get_age_group(age):
   
    if age <= 30:
        return 'Young Adult'
    elif age <= 60:
        return 'Middle-Aged Adult'
    else:
        return 'Older Adult'

def get_income_group(income):
    if income <= 67000:
        return 'Low-Income'
    elif income <= 170000:
        return 'Middle-Income'
    else:
        return 'High-Income'

def get_dependent_status(row):
    if row['num_dependents'] > 0:
        if row['fam_status'] in ('single', 'divorced/widowed'):
            return 'Single Parent'
        else:
            return 'Parent'
    else:
        return 'No Dependents'

In [18]:
# Apply the functions to create separate columns
df_all_top['age_group'] = df_all_top['age'].apply(get_age_group)

In [19]:
df_all_top['income_group'] = df_all_top['income'].apply(get_income_group)

In [20]:
df_all_top['num_dependents'].value_counts(dropna=False)

num_dependents
3    8133496
0    8095929
2    8089574
1    8079591
Name: count, dtype: int64

In [21]:
df_all_top['fam_status'].value_counts(dropna=False)

fam_status
married                             22752378
single                               5324657
divorced/widowed                     2771272
living with parents and siblings     1550283
Name: count, dtype: int64

In [41]:
df_all_top.head()

Unnamed: 0,product_id,department_id,prices,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order,reordered,price_range,...,freq_flag,Gender,state,age,num_dependents,fam_status,income,Region,age_group,income_group
0,1,19,5.8,138,28,6,11,3.0,0,mid-range product,...,frequent customer,Male,Minnesota,81,1,married,49620,Midwest,Older Adult,Low-Income
1,1,19,5.8,138,30,6,17,20.0,1,mid-range product,...,frequent customer,Male,Minnesota,81,1,married,49620,Midwest,Older Adult,Low-Income
2,1,19,5.8,709,2,0,21,6.0,0,mid-range product,...,frequent customer,Female,Vermont,66,2,married,158302,Northeast,Older Adult,Middle-Income
3,1,19,5.8,764,1,3,13,,0,mid-range product,...,frequent customer,Female,Wisconsin,40,3,married,31308,Midwest,Middle-Aged Adult,Low-Income
4,1,19,5.8,764,3,4,17,9.0,1,mid-range product,...,frequent customer,Female,Wisconsin,40,3,married,31308,Midwest,Middle-Aged Adult,Low-Income


In [45]:
df_all_top = df_all_top.drop(['age', 'income'], axis=1)

In [None]:
df_all_top['dependent_status'] = df_all_top.apply(get_dependent_status, axis=1)

In [None]:
# Combine the columns to create the final profile columns
df_all_top['income_profile'] = df_all_top['income_group'] + ' ' + df_all_top['age_group']
df_all_top['dependent_profile'] = df_all_top['age_group'] + ', ' + df_all_top['dependent_status']

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_all_top.head()