# Step 4: Visualizations

Section: Step 4: Visualizations - Box Plots, Histograms, and Density Plots

**Part of:** [marketing_campaign_082825_working.ipynb](./marketing_campaign_082825_working.ipynb)

In [None]:
# Setup and data loading
from utils import (
    ProjectConfig,
    save_intermediate_results,
    load_intermediate_results,
    reset_plot_settings,
)
import seaborn as sns
import matplotlib.pyplot as plt

# Project configuration object; it is handed to the load/save helpers used in this notebook.
config = ProjectConfig()
# Load data from previous notebook
# NOTE(review): judging by the file name this is the output of step 3, and the ".pkl"
# extension suggests a pickle file — confirm in utils and only load trusted files.
df = load_intermediate_results('data_from_05_step_3.pkl', config)

# Visualizations - Box Plots, Histograms, and Density Plots

In [None]:
# Plot reset, use between sections:
# NOTE(review): reset_plot_settings is imported from the project's utils module; presumably
# it restores the default plotting style — confirm against its implementation.
reset_plot_settings()

In [None]:
# Visualizations - Box Plots, Histograms, and Density Plots
# One figure row per variable: boxplot on the left, density-scaled histogram
# with a KDE curve on the right.

variables = ['Age','Income','Total_Purchases','Total_Spending']
titles = ['Age','Income','Total Purchases','Total Spending']
units = ['Years','USD','Number','USD']

# Create a 4x2 subplot grid (4 variables x 2 plot types)
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(12,12))

for i, (var, title, unit) in enumerate(zip(variables, titles, units)):
    box_ax, hist_ax = axes[i]

    # Left column: boxplot of the raw values, unit on the y-axis
    sns.boxplot(y=df[var], ax=box_ax, color='skyblue')
    box_ax.set_title(f'{title} Boxplot')
    box_ax.set_ylabel(unit)

    # Right column: histogram normalised to a density so the KDE curve
    # drawn on top of it shares the same scale
    sns.histplot(data=df, x=var, ax=hist_ax, bins=30, color='salmon', stat='density')
    sns.kdeplot(data=df, x=var, ax=hist_ax, color='navy', linewidth=2)
    hist_ax.set_title(f'{title} Histogram with KDE')
    hist_ax.set_xlabel(unit)

# Adjust layout to prevent overlap, then write the figure to disk before showing it
plt.tight_layout()
fig.savefig('figures/Original_Age_and_Income_Plots.png', dpi=300, bbox_inches='tight')
#save_project_figure(
#    "Original_Age_and_Income_Plots",
#    "Original Age and Income Plots",
#    config,
#)
plt.show()


In [None]:
def find_outliers(df, column, *, factor=1.5):
    """Print the IQR outlier fences for ``column`` and the records outside them.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[column]``. For each
    fence the function prints the fence value, the number of records strictly
    beyond it and, when there is at least one, the offending values together
    with their index labels.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the numeric column to inspect.
        factor: IQR multiplier applied to both fences. Defaults to 1.5.

    Returns:
        None. The report is written to stdout only.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Comparisons with NaN evaluate to False, so missing values are never
    # reported on either side.
    below = values[values < lower_bound]
    above = values[values > upper_bound]

    print(f'Lower Bound: {lower_bound}')
    n_below = below.count()
    print(f'Number of records below Lower Bound: {n_below}')
    if n_below > 0:
        print(below)

    print(f'Upper Bound: {upper_bound}')
    n_above = above.count()
    print(f'Number of records above Upper Bound: {n_above}')
    if n_above > 0:
        print(above)

# Report the outlier bounds and out-of-range records for each numeric variable in turn.
variables = ['Age','Income','Total_Purchases','Total_Spending']
for var in variables:
    # Blank line followed by the column name acts as a header for each report
    print(f'\n{var}')
    find_outliers(df, var)

# Results from the plots and outlier calculations show that, for the variables Total_Purchases and
# Total_Spending, it does not make sense to remove any outliers. While the data is skewed, the outlier
# data is relevant.

# For the variables Age and Income, the results are different.
# - For Age, the 3 outlier values seem to be erroneous, since the ages are above 93.
#   The suggestion is to remove these rows.
# - For Income, 1 of the 8 values seems to be erroneous, or at least drastically different from the rest
#   of the data.
#   The suggestion is to remove the rows with Income outliers above the upper bound.

In [None]:
# Remove the outliers for Age and Income

def remove_outliers(df, column, *, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[column]``. Values equal
    to a fence are kept. Rows where ``column`` is NaN are dropped, because a
    NaN never compares as being inside the range.

    Args:
        df: DataFrame that contains ``column``. It is not modified.
        column: Name of the numeric column to filter on.
        factor: IQR multiplier applied to both fences. Defaults to 1.5.

    Returns:
        A DataFrame with only the rows inside the fences, keeping the
        original index labels and all columns.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # between() includes both endpoints by default, i.e. lower <= value <= upper
    return df[values.between(lower_bound, upper_bound)]

# Filter one column at a time, reassigning df after each pass: the Income
# bounds are therefore computed on the data that already had its Age
# outliers removed.
variables = ['Age', 'Income']
for var in variables:
    df = remove_outliers(df, var)

# Final check: print, per variable, how many records hold a negative value
# (age, income, total purchases and total spending)
variables = ['Age', 'Income', 'Total_Purchases', 'Total_Spending']
for var in variables:
    print((df[var] < 0).sum())

In [None]:
# Visualizations - Box Plots, Histograms, and Density Plots post removal of outlier values
# Same layout as the pre-removal figure: one row per variable, boxplot on the
# left, density-scaled histogram with a KDE curve on the right.

reset_plot_settings()

variables = ['Age','Income','Total_Purchases','Total_Spending']
titles = ['Age','Income','Total Purchases','Total Spending']
units = ['Years','USD','Number','USD']

# Create a 4x2 subplot grid (4 variables x 2 plot types)
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(12,12))

for i, (var, title, unit) in enumerate(zip(variables, titles, units)):
    box_ax, hist_ax = axes[i]

    # Left column: boxplot of the filtered values, unit on the y-axis
    sns.boxplot(y=df[var], ax=box_ax, color='skyblue')
    box_ax.set_title(f'{title} Boxplot')
    box_ax.set_ylabel(unit)

    # Right column: histogram normalised to a density so the KDE curve
    # drawn on top of it shares the same scale
    sns.histplot(data=df, x=var, ax=hist_ax, bins=30, color='salmon', stat='density')
    sns.kdeplot(data=df, x=var, ax=hist_ax, color='navy', linewidth=2)
    hist_ax.set_title(f'{title} Histogram with KDE')
    hist_ax.set_xlabel(unit)

# Adjust layout to prevent overlap, then write the figure to disk before showing it
plt.tight_layout()
fig.savefig('figures/Age_and_Income_Plots_Post_Outlier_Removal.png', dpi=300, bbox_inches='tight')
plt.show()

# After removing outliers from Age and Income, the box and histogram plots look
# more like a normal curve. However, Total Purchases and Total Spending are still
# very heavily right-skewed.

In [None]:
# Median Income for every Marital_Status (rows) x Education (columns)
# combination, rounded to 2 decimals. The table is built and printed once;
# a second, identical copy of this computation and its output was removed.
median_pivot = df.pivot_table(
    values="Income", index="Marital_Status", columns="Education", aggfunc="median"
).round(2)

print("\nMedian Income Pivot Table:")
print(median_pivot)
# The categorical median has been retained after removing the outliers except for the Divorced-Bachelors category where it has gone from 55635.00 to 55599.00 as seen in the updated pivot table.

In [None]:

# Save results for next notebook
# df at this point is the frame after the Age and Income outlier filtering above.
save_intermediate_results(df, 'data_from_06_step_4.pkl', config)
# save_intermediate_results(analysis_results, 'analysis_results.pkl', config)
print('✓ Results saved for next notebook')