In [2]:
# File name: Exercise 4.10 - Creating Customer Profiles
# Author: Sam Abrams
# Created: 12/27/24
# Description: This notebook drops personal identifying information along with other non-essential columns, then focuses on creating profiles that group customers into defined profiles based on age, income, and number of dependents.

## Initial Notebook Setup

In [None]:
# import libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df_all_top = pd.read_pickle("C:/Users/Sam/Documents/Data Analytics Projects/02 Data/ALL_order_prod_cust_dataframe.pkl")

In [None]:
df_all_top.shape

(32404859, 35)

In [None]:
df_all = df_all_top

In [None]:
df_all.shape

(32404859, 35)

I wanted to change the name of the dataframe to avoid confusion, hence the df_all name above.

## Handling PII Concerns and Dropping Unnecessary Columns

The code below will create a new dataframe that doesn't contain the first and last names of the customers. User_ID will be the primary identifier for each customer, and the pertinent demographic info has been retained.

In [None]:
df_all_scrub = df_all.drop(['first_name', 'last_name', 'date_joined', 'add_to_cart_order', '_merge', 'source', 'aisle_id', 'product_name', 'order_id', 'max_order', 'price_range_loc', 'median_order_freq', 'average_price'], axis=1)

In [None]:
df_all_scrub.head(50)

Unnamed: 0,product_id,department_id,prices,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order,reordered,price_range,...,busiest_period_of_day,loyalty_flag,spending_flag,freq_flag,Gender,state,Age,num_dependents,fam_status,income
0,1,19,5.8,138,28,6,11,3.0,0,mid-range product,...,Most Orders,Regular customer,low spender,frequent customer,Male,Minnesota,81,1,married,49620
1,1,19,5.8,138,30,6,17,20.0,1,mid-range product,...,Average Orders,Regular customer,low spender,frequent customer,Male,Minnesota,81,1,married,49620
2,1,19,5.8,709,2,0,21,6.0,0,mid-range product,...,Average Orders,New customer,low spender,frequent customer,Female,Vermont,66,2,married,158302
3,1,19,5.8,764,1,3,13,,0,mid-range product,...,Most Orders,New customer,low spender,frequent customer,Female,Wisconsin,40,3,married,31308
4,1,19,5.8,764,3,4,17,9.0,1,mid-range product,...,Average Orders,New customer,low spender,frequent customer,Female,Wisconsin,40,3,married,31308
5,1,19,5.8,777,16,1,7,26.0,0,mid-range product,...,Average Orders,Regular customer,low spender,Regular customer,Female,Hawaii,51,2,married,57797
6,1,19,5.8,825,3,2,14,30.0,0,mid-range product,...,Most Orders,New customer,low spender,Regular customer,Male,District of Columbia,20,3,living with parents and siblings,34171
7,1,19,5.8,910,12,3,10,30.0,0,mid-range product,...,Most Orders,Regular customer,low spender,frequent customer,Female,Tennessee,65,0,divorced/widowed,44856
8,1,19,5.8,1052,10,1,20,19.0,0,mid-range product,...,Average Orders,Regular customer,low spender,frequent customer,Male,Oregon,72,1,married,153843
9,1,19,5.8,1052,15,1,12,15.0,1,mid-range product,...,Most Orders,Regular customer,low spender,frequent customer,Male,Oregon,72,1,married,153843


## Assigning Regions to Customers

In [None]:
df_all_scrub['state'].value_counts(dropna=False)

state
Pennsylvania            667082
California              659783
Rhode Island            656913
Georgia                 656389
New Mexico              654494
Arizona                 653964
North Carolina          651900
Oklahoma                651739
Alaska                  648495
Minnesota               647825
Massachusetts           646358
Wyoming                 644255
Virginia                641421
Missouri                640732
Texas                   640394
Colorado                639280
Maine                   638583
North Dakota            638491
Alabama                 638003
Kansas                  637538
Louisiana               637482
Delaware                637024
South Carolina          636754
Oregon                  636425
Arkansas                636144
Nevada                  636139
New York                635983
Montana                 635265
South Dakota            633772
Illinois                633024
Hawaii                  632901
Washington              632852
Mi

In [None]:
def assign_region(state):
    if state in ['Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 'New Jersey']:
        return 'Northeast'
    elif state in ['Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri']:
        return 'Midwest'
    elif state in ['Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky', 'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas', 'Louisiana']:
        return 'South'
    elif state in ['Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona', 'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii']:
        return 'West'

In [None]:
df_all_scrub['Region'] = df_all_scrub['state'].apply(assign_region)

In [None]:
df_all_scrub.head()

Unnamed: 0,product_id,department_id,prices,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order,reordered,price_range,...,loyalty_flag,spending_flag,freq_flag,Gender,state,Age,num_dependents,fam_status,income,Region
0,1,19,5.8,138,28,6,11,3.0,0,mid-range product,...,Regular customer,low spender,frequent customer,Male,Minnesota,81,1,married,49620,Midwest
1,1,19,5.8,138,30,6,17,20.0,1,mid-range product,...,Regular customer,low spender,frequent customer,Male,Minnesota,81,1,married,49620,Midwest
2,1,19,5.8,709,2,0,21,6.0,0,mid-range product,...,New customer,low spender,frequent customer,Female,Vermont,66,2,married,158302,Northeast
3,1,19,5.8,764,1,3,13,,0,mid-range product,...,New customer,low spender,frequent customer,Female,Wisconsin,40,3,married,31308,Midwest
4,1,19,5.8,764,3,4,17,9.0,1,mid-range product,...,New customer,low spender,frequent customer,Female,Wisconsin,40,3,married,31308,Midwest


In [None]:
df_all_scrub = df_all_scrub.drop('state', axis=1)

In [None]:
df_all_scrub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 22 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   product_id             int64  
 1   department_id          int64  
 2   prices                 float64
 3   user_id                int64  
 4   order_number           int64  
 5   order_day_of_week      int64  
 6   order_hour_of_day      int64  
 7   days_since_last_order  float64
 8   reordered              int64  
 9   price_range            object 
 10  busiest days           object 
 11  busiest day            object 
 12  busiest_period_of_day  object 
 13  loyalty_flag           object 
 14  spending_flag          object 
 15  freq_flag              object 
 16  Gender                 object 
 17  Age                    int64  
 18  num_dependents         int64  
 19  fam_status             object 
 20  income                 int64  
 21  Region                 object 
dtypes: float64(2), i

## Creating exclusion flag for, and subsequently removing, low-activity customers

In [None]:
## calculate the number of orders for each customer
orders_per_customer = df_all_scrub.groupby('user_id')['order_number'].count()

In [None]:
## Filter to keep only customers with 5 or more orders
frequent_customers = orders_per_customer[orders_per_customer >= 6].index

In [None]:
## New dataframe with only customers who have 6 or more orders, thereby excluding customers with 5 or less
df_all_top = df_all_scrub[df_all_scrub['user_id'].isin(frequent_customers)]

MemoryError: Unable to allocate 1.69 GiB for an array with shape (7, 32398590) and data type object

In [None]:
df_all_scrub.shape

In [None]:
df_all_top.shape

6,269 records were removed as a result of the low-activity flag.

## Changing Column Data Types

The dataframe was challenging the extent of my RAM, so I'm changing a few data types from object to category in hopes of decreasing the load.

In [None]:
columns_to_convert = ['Region', 'fam_status', 'Gender', 'freq_flag', 'spending_flag', 'loyalty_flag', 'busiest day', 'price_range']

for column in columns_to_convert:
    df_all_top[column] = df_all_top[column].astype('category')

In [None]:
df_all_top.info()

In [None]:
df_all_top = df_all_top.rename(columns={'Age':'age',})

## Creating Customer Profiles

In [None]:
## Creates two different profiles, one based on age and income, the other based on age and dependents

def get_age_group(age):

    if age <= 30:
        return 'Young Adult'
    elif age <= 60:
        return 'Middle-Aged Adult'
    else:
        return 'Older Adult'

def get_income_group(income):
    if income <= 67000:
        return 'Low-Income'
    elif income <= 170000:
        return 'Middle-Income'
    else:
        return 'High-Income'

def get_dependent_status(row):
    if row['num_dependents'] > 0:
        if row['fam_status'] in ('single', 'divorced/widowed'):
            return 'Single Parent'
        else:
            return 'Parent'
    else:
        return 'No Dependents'

In [None]:
df_all_top['age_group'] = df_all_top['age'].apply(get_age_group)

In [None]:
df_all_top['income_group'] = df_all_top['income'].apply(get_income_group)

In [None]:
df_all_top['dependent_status'] = df_all_top.apply(get_dependent_status, axis=1)

In [None]:
df_all_top.head(20)

In [None]:
# Combine the columns to create the final profile columns
df_all_top['income_profile'] = df_all_top['income_group'] + ' ' + df_all_top['age_group']
df_all_top['dependent_profile'] = df_all_top['age_group'] + ', ' + df_all_top['dependent_status']

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
df_all_top.head()

In [None]:
df_all_top.info()

In [None]:
columns_to_convert2 = ['age_group', 'income_group', 'dependent_status', 'income_profile', 'dependent_profile']

for column in columns_to_convert2:
    df_all_top[column] = df_all_top[column].astype('category')

In [None]:
df_all_top.to_pickle('C:/Users/Sam/Documents/Data Analytics Projects/02 Data/OPC_scrubbed_profiled.pkl')

I wanted to save the data here because it took so long to create this new database with the customer profiles, and I didn't want to risk losing that file.

## Visualizations for Distribution of Profiles

In [None]:
df_OPC = pd.read_pickle('C:/Users/Sam/Documents/Data Analytics Projects/02 Data/OPC_scrubbed_profiled.pkl')