In [2]:
# File name: Exercise 4.10 - Profile Visualizations
# Author: Sam Abrams
# Created: 12/27/24
# Description: This notebook contains visualizations relating to the customer profiles created in Part 1 of task 4.10 - the notebook was split into two to help with memory issues.

## Notebook Setup

In [3]:
# import libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

In [30]:
# Load the prepared orders/products/customers dataframe from its pickle file.
# NOTE(review): the recorded run of this cell ended with "EOFError: Ran out of input" -
# confirm the pickle file is complete before relying on the cells below.
profiles_pickle_path = '/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Prepared Data/ALL_order_prod_cust_dataframe.pkl'
df_profiles = pd.read_pickle(profiles_pickle_path)

EOFError: Ran out of input

In [None]:
# Preview the first rows of the loaded dataframe.
df_profiles.head()

In [None]:
# Print a concise summary (columns and dtypes) of the loaded dataframe.
df_profiles.info()

## Visualizations of Customer Profile Distribution

### Income Profiling

In [None]:
# Count the rows belonging to each income profile and draw the counts as bars
income_profile_counts = df_profiles['income_profile'].value_counts()
income_profile_counts.plot(kind='bar')

Key Takeaway: High-income individuals aren't ordering products at the same rate as middle- and low-income customers, although this could be due to the number of high-income individuals in the data. At any rate, the primary customers are middle- and low-income individuals.

In [None]:
## Total of 'prices' per income profile, drawn as a bar chart
profile_spending = df_profiles.groupby('income_profile')['prices'].sum()
spending_habits_by_income = profile_spending.plot.bar(color='skyblue', edgecolor='black')
spending_habits_by_income.set_xlabel('Income Profile')
spending_habits_by_income.set_ylabel('Total Spending')
spending_habits_by_income.set_title('Total Spending by Customer Income Profile')

The graph above shows that middle-income customers are spending the most on Instacart orders. High- and low-income customers, interestingly enough, are spending about the same.

In [None]:
# Write the income spending bar chart to a PNG file.
# NOTE(review): this is a Windows-style absolute path, while the data is loaded from a
# '/Users/samabrams/...' path - confirm the target folder exists on the machine running this.
income_spending_png = 'C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/spending_habits_by_income_profile.png'
spending_habits_by_income.figure.savefig(income_spending_png)

### Dependency Profiles

In [None]:
# Count the rows belonging to each dependent profile and draw the counts as bars
dependent_profile_counts = df_profiles['dependent_profile'].value_counts()
dependent_profile_counts.plot(kind='bar')

Parents (or anyone with dependents) appear much more likely to order from Instacart than those who don't have dependents.

In [None]:
## Total of 'prices' per dependent profile (sorted by profile label), drawn as a bar chart
dependent_price_groups = df_profiles.groupby('dependent_profile')['prices']
dep_profile_spending = dependent_price_groups.sum().sort_index()
spending_habits_by_dependents = dep_profile_spending.plot.bar(color='blue', edgecolor='black')
spending_habits_by_dependents.set_xlabel('Dependent Profile')
spending_habits_by_dependents.set_ylabel('Total Spending')
spending_habits_by_dependents.set_title('Total Spending by Customer Dependent Profile')

Parents also spend the most money ordering from Instacart.

In [None]:
# Write the dependent-profile spending bar chart to a PNG file
dependent_spending_png = 'C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/spending_habits_by_dependent_profile.png'
spending_habits_by_dependents.figure.savefig(dependent_spending_png)

## Creating Profile Dataframes

I'm going to make two dictionaries, one containing income-based profiles and the other containing dependent-based profiles.

### Income Profiles

In [None]:
# Dictionary of INCOME profile dataframes, keyed by dataframe name.
# Each entry holds the rows of df_profiles whose 'income_profile' equals the paired label.
income_profile_labels = {
    ## young adult income profiles
    'df_high_income_YA': 'High-Income Young Adult',
    'df_middle_income_YA': 'Middle-Income Young Adult',
    'df_low_income_YA': 'Low-Income Young Adult',
    ## middle-aged adult income profiles
    'df_high_income_MA': 'High-Income Middle-Aged Adult',
    'df_middle_income_MA': 'Middle-Income Middle-Aged Adult',
    'df_low_income_MA': 'Low-Income Middle-Aged Adult',
    ## older adult income profiles
    'df_high_income_OA': 'High-Income Older Adult',
    'df_middle_income_OA': 'Middle-Income Older Adult',
    'df_low_income_OA': 'Low-Income Older Adult',
}

income_profile_dfs = {
    frame_name: df_profiles[df_profiles['income_profile'] == profile_label]
    for frame_name, profile_label in income_profile_labels.items()
}

In [None]:
# Pull each INCOME profile dataframe out of the dictionary into its own variable,
# one age group (young / middle-aged / older adult) per statement.
df_HI_YA, df_MI_YA, df_LI_YA = (
    income_profile_dfs[frame_name]
    for frame_name in ('df_high_income_YA', 'df_middle_income_YA', 'df_low_income_YA')
)
df_HI_MA, df_MI_MA, df_LI_MA = (
    income_profile_dfs[frame_name]
    for frame_name in ('df_high_income_MA', 'df_middle_income_MA', 'df_low_income_MA')
)
df_HI_OA, df_MI_OA, df_LI_OA = (
    income_profile_dfs[frame_name]
    for frame_name in ('df_high_income_OA', 'df_middle_income_OA', 'df_low_income_OA')
)

In [None]:
# Row and column counts of the high-income young adult dataframe
df_HI_YA.shape

In [None]:
# Row and column counts of the middle-income young adult dataframe
df_MI_YA.shape

In [None]:
# Row and column counts of the low-income young adult dataframe
df_LI_YA.shape

### Dependent Profiles

In [None]:
# Row counts per dependent profile label; also shows the exact label strings
# that the filters in the next cell must match.
df_profiles['dependent_profile'].value_counts()

In [None]:
# Create dictionary for DEPENDENT profile dataframes, keyed by dataframe name.
# Each entry holds the rows of df_profiles whose 'dependent_profile' equals the paired label.
# Every age group gets its own "No Dependents" label; filtering the middle-aged and older
# adult entries on the Young Adult label would make all three "no_deps" dataframes identical.
# NOTE(review): the Middle-Aged/Older "No Dependents" labels follow the pattern of the
# 'Parent' labels - confirm them against df_profiles['dependent_profile'].value_counts().
dependent_profile_labels = {
    'df_parent_YA': 'Young Adult, Parent',
    'df_parent_MA': 'Middle-Aged Adult, Parent',
    'df_parent_OA': 'Older Adult, Parent',
    'df_no_deps_YA': 'Young Adult, No Dependents',
    'df_no_deps_MA': 'Middle-Aged Adult, No Dependents',
    'df_no_deps_OA': 'Older Adult, No Dependents',
}

# Add dataframes to dictionary
dependent_profile_dfs = {
    frame_name: df_profiles[df_profiles['dependent_profile'] == profile_label]
    for frame_name, profile_label in dependent_profile_labels.items()
}

In [None]:
# Pull each DEPENDENT profile dataframe out of the dictionary into its own variable
df_parent_YA, df_parent_MA, df_parent_OA = (
    dependent_profile_dfs[frame_name]
    for frame_name in ('df_parent_YA', 'df_parent_MA', 'df_parent_OA')
)
df_no_deps_YA, df_no_deps_MA, df_no_deps_OA = (
    dependent_profile_dfs[frame_name]
    for frame_name in ('df_no_deps_YA', 'df_no_deps_MA', 'df_no_deps_OA')
)

In [None]:
# List the dataframe names stored in the dependent-profile dictionary
print(dependent_profile_dfs.keys())

In [None]:
# List the dataframe names stored in the income-profile dictionary
print(income_profile_dfs.keys())

In [None]:
# Re-check the row and column counts of the high-income young adult dataframe
df_HI_YA.shape

## Usage Frequency and Expenditure Aggregate Statistics

In [None]:
# Print the dataframe summary again to look up the column names used by the metric functions below
df_profiles.info()

In [None]:
# Defining functions to calculate profile metrics

## Descriptive statistics for spending, using the 'prices' column
def get_descriptive_stats(df):
    """Return the summary statistics of the 'prices' column.

    The result is whatever ``describe()`` reports for ``df['prices']``
    (entries such as 'mean', 'min', '25%', '50%', '75%' and 'max').
    """
    price_values = df['prices']
    price_summary = price_values.describe()
    return price_summary

## Total spending for each profile
def get_total_spending(df):
    """Return the sum of the 'prices' column of ``df``."""
    price_values = df['prices']
    spending_total = price_values.sum()
    return spending_total

## Finds the most ordered department for the profile
def get_most_ordered_department(df):
    """Return the mode of the 'department_id' column of ``df``.

    The value comes straight from ``Series.mode()``, so it is a Series
    (one entry per most-frequent value) rather than a single scalar.
    """
    department_ids = df['department_id']
    department_modes = department_ids.mode()
    return department_modes

## Average days between orders for the profile
def get_order_frequency(df):
    """Return the mean of the 'days_since_last_order' column of ``df``."""
    gaps_between_orders = df['days_since_last_order']
    mean_gap = gaps_between_orders.mean()
    return mean_gap

In [None]:
# Main function to analyze profile data
def analyze_profile_data(profile_dfs):
    """Build a one-row-per-profile table of spending and ordering metrics.

    Parameters
    ----------
    profile_dfs : dict
        Maps a profile name to the dataframe holding that profile's rows.
        Each dataframe is passed to get_descriptive_stats, get_total_spending,
        get_most_ordered_department and get_order_frequency.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``profile_dfs`` with the columns
        'customer_profile', 'max_price', 'average_price', 'min_price',
        '25th_percentile_price', 'median_price', '75th_percentile_price',
        'total_spending', 'most_ordered_department' and
        'avg_days_between_orders'.
    """
    results = []
    for profile_name, df in profile_dfs.items():
        desc_stats = get_descriptive_stats(df)
        total_spending = get_total_spending(df)
        most_ordered_department = get_most_ordered_department(df)
        avg_days_between_orders = get_order_frequency(df)

        # Series.mode() hands back a Series (several values on ties, none for
        # empty input). Keep a single scalar so the result column holds
        # department ids instead of nested Series objects.
        if isinstance(most_ordered_department, pd.Series):
            if most_ordered_department.empty:
                most_ordered_department = np.nan
            else:
                most_ordered_department = most_ordered_department.iloc[0]

        # Create a dictionary to store the results for the current profile
        profile_results = {
            'customer_profile': profile_name,
            'max_price': desc_stats['max'],
            'average_price': desc_stats['mean'],
            'min_price': desc_stats['min'],
            '25th_percentile_price': desc_stats['25%'],
            'median_price': desc_stats['50%'],
            '75th_percentile_price': desc_stats['75%'],
            'total_spending': total_spending,
            'most_ordered_department': most_ordered_department,
            'avg_days_between_orders': avg_days_between_orders}

        # Append the dictionary to the results list
        results.append(profile_results)

    # Create a DataFrame from the results
    df_profile_stats = pd.DataFrame(results)

    return df_profile_stats

In [None]:
## Run the profile analysis on both dictionaries of profile dataframes
income_profile_stats = analyze_profile_data(profile_dfs=income_profile_dfs)
dependent_profile_stats = analyze_profile_data(profile_dfs=dependent_profile_dfs)

In [None]:
# Print both statistics tables as centre-aligned markdown, without the index column
markdown_options = dict(index=False, numalign="center", stralign="center")

print("Income Profile Stats:")
print(income_profile_stats.to_markdown(**markdown_options))

print("\nDependent Profile Stats:")
print(dependent_profile_stats.to_markdown(**markdown_options))

In [None]:
# Display the income profile statistics (up to the first 20 rows)
income_profile_stats.head(20)

In [None]:
# Display the dependent profile statistics (up to the first 10 rows)
dependent_profile_stats.head(10)

In [None]:
# Preview the first rows of the middle-aged parent dataframe
df_parent_MA.head()

In [None]:
# Row and column counts of the middle-aged parent dataframe
df_parent_MA.shape

In [None]:
# Row and column counts of the older adult parent dataframe
df_parent_OA.shape

### Charts for Total Spending by Profile

In [None]:
## First pass at the total-spending bar chart, with default styling
sns.barplot(data=income_profile_stats, x='customer_profile', y='total_spending')

In [None]:
## Clean up the visuals: axis labels, a title and rotated tick labels
income_profile_spending_chart = sns.barplot(x='customer_profile', y='total_spending', data = income_profile_stats)

plt.xlabel('Income Profile')
plt.ylabel('Total Spending')
plt.title('Total Spending by Income Profile')
plt.xticks(rotation=45, ha='right')
# No legend call: the barplot is drawn without `hue`, so there are no labelled series to list.

plt.show()

In [None]:
## Now to do the same for the dependent profiles
dependent_profile_spending_chart = sns.barplot(x='customer_profile', y='total_spending', data = dependent_profile_stats)

plt.xlabel('Dependent Profile')
plt.ylabel('Total Spending')
plt.title('Total Spending by Dependent Profile')
plt.xticks(rotation=45, ha='right')
# No legend call: the barplot is drawn without `hue`, so there are no labelled series to list.

plt.show()

In [None]:
## Save the dependent-profile total spending chart as a PNG
dependent_total_spending_png = "C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/Total spending by dependent profile.png"
dependent_profile_spending_chart.figure.savefig(dependent_total_spending_png)

In [None]:
## Save the income-profile total spending chart as a PNG
income_total_spending_png = "C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/Total spending by income profile.png"
income_profile_spending_chart.figure.savefig(income_total_spending_png)

### Chart for Usage Frequency by Profile

In [None]:
# Bar chart of the average days between orders for each income profile
income_profile_frequency_chart = sns.barplot(x='customer_profile', y='avg_days_between_orders', data = income_profile_stats)

plt.xlabel('Income Profile')
plt.ylabel('Average Days Between Orders')
plt.title('Average Order Frequency for each Income Profile')
plt.xticks(rotation=45, ha='right')
# No legend call: the barplot is drawn without `hue`, so there are no labelled series to list.

plt.show()

In [None]:
# Bar chart of the average days between orders for each dependent profile
dependent_profile_frequency_chart = sns.barplot(x='customer_profile', y='avg_days_between_orders', data = dependent_profile_stats)

plt.xlabel('Dependent Profile')
plt.ylabel('Average Days Between Orders')
# The title names the dependent profiles, matching the data and the x-axis label of this chart
plt.title('Average Order Frequency for each Dependent Profile')
plt.xticks(rotation=45, ha='right')
# No legend call: the barplot is drawn without `hue`, so there are no labelled series to list.

plt.show()

There is no significant difference in order frequency across the six dependent profiles, but high-income customers do tend to order more frequently than middle- or low-income customers.

In [None]:
## Save both order-frequency charts as PNG files
dependent_frequency_png = "C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/AVG order frequency by dependent profile.png"
income_frequency_png = "C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/AVG order frequency by income profile.png"
dependent_profile_frequency_chart.figure.savefig(dependent_frequency_png)
income_profile_frequency_chart.figure.savefig(income_frequency_png)

## Grouping Customer Profiles by Region

In [None]:
# Count the rows for every income profile / region combination
income_profile_region = pd.crosstab(index=df_profiles['income_profile'], columns=df_profiles['Region'])

# Express each profile's regional counts as a percentage of that profile's row total
income_profile_totals = income_profile_region.sum(axis=1)
income_profile_region_pct = income_profile_region.div(income_profile_totals, axis=0) * 100



In [None]:
# Same approach for the dependent profiles:
# count the rows for every dependent profile / region combination
dependent_profile_region = pd.crosstab(index=df_profiles['dependent_profile'], columns=df_profiles['Region'])

# Express each profile's regional counts as a percentage of that profile's row total
dependent_profile_totals = dependent_profile_region.sum(axis=1)
dependent_profile_region_pct = dependent_profile_region.div(dependent_profile_totals, axis=0) * 100



In [None]:
# Stacked bar chart: one bar per income profile, split by regional percentage
region_breakdown_by_income = income_profile_region_pct.plot.bar(stacked=True, figsize=(10, 6))
region_breakdown_by_income.set_title('Regional Distribution of Income-Based Customer Profiles')
region_breakdown_by_income.set_ylabel('Percentage')
region_breakdown_by_income.set_xlabel('Customer Income Profile')
plt.xticks(rotation=45, ha='right')
region_breakdown_by_income.legend(title='Region', loc='upper right')
plt.show()

It appears that most profiles have the highest percentage of customers in the South region. Another interesting trend is that high-income older adults are noticeably more present in the Midwest.

In [None]:
# Stacked bar chart: one bar per dependent profile, split by regional percentage
region_breakdown_by_dependent = dependent_profile_region_pct.plot.bar(stacked=True, figsize=(10, 6))
region_breakdown_by_dependent.set_title('Regional Distribution of Dependent-Based Customer Profiles')
region_breakdown_by_dependent.set_ylabel('Percentage')
region_breakdown_by_dependent.set_xlabel('Customer Dependent Profile')
plt.xticks(rotation=45, ha='right')
region_breakdown_by_dependent.legend(title='Region', loc='upper right')
plt.show()

In [None]:
## Save both regional breakdown charts as PNG files
dependent_region_png = "C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/region breakdown by dependent profile.png"
income_region_png = "C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/region breakdown by income profile.png"
region_breakdown_by_dependent.figure.savefig(dependent_region_png)
region_breakdown_by_income.figure.savefig(income_region_png)

## Analyzing Most Ordered Department by Profile

In [None]:
# Count the rows for every profile / department combination
income_profile_department = pd.crosstab(index=df_profiles['income_profile'], columns=df_profiles['department_id'])
dependent_profile_department = pd.crosstab(index=df_profiles['dependent_profile'], columns=df_profiles['department_id'])

# For each profile (row), pick the department column holding the largest count
income_most_ordered_dept = income_profile_department.idxmax(axis='columns')
dependent_most_ordered_dept = dependent_profile_department.idxmax(axis='columns')


In [None]:
# Bar chart whose bar heights are the most ordered department ID of each income profile
income_top_departments = income_most_ordered_dept.plot.bar(figsize=(10, 6))
income_top_departments.set_title('Most Ordered Department by Income Profile')
income_top_departments.set_ylabel('Department ID')
income_top_departments.set_xlabel('Income Profile')
plt.xticks(rotation=45, ha='right')

# Write each bar's height 10 points above the top-centre of the bar
for p in income_top_departments.patches:
    bar_top_centre = (p.get_x() + p.get_width() / 2., p.get_height())
    income_top_departments.annotate(
        str(p.get_height()),
        bar_top_centre,
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

plt.show()

In [None]:
# Bar chart whose bar heights are the most ordered department ID of each dependent profile
dependent_top_departments = dependent_most_ordered_dept.plot.bar(figsize=(10, 6))
dependent_top_departments.set_title('Most Ordered Department by Dependent Profile')
dependent_top_departments.set_ylabel('Department ID')
dependent_top_departments.set_xlabel('Dependent Profile')
plt.xticks(rotation=45, ha='right')

# Write each bar's height 10 points above the top-centre of the bar
for p in dependent_top_departments.patches:
    bar_top_centre = (p.get_x() + p.get_width() / 2., p.get_height())
    dependent_top_departments.annotate(
        str(p.get_height()),
        bar_top_centre,
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

plt.show()

For both sets of profiles, the most commonly ordered department is 4, which, according to the project brief, is produce.

In [None]:
# Export both top-department charts as PNG files
dependent_top_departments_png = "C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/top department by dependent profile.png"
income_top_departments_png = "C:/Users/Sam/Documents/Data Analytics Projects/04 Analysis/Visualizations/top department by income profile.png"
dependent_top_departments.figure.savefig(dependent_top_departments_png)
income_top_departments.figure.savefig(income_top_departments_png)