In [None]:
# Workshop: Creating Custom Visualizations with Seaborn

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

#1 Load the data

# Read the three workshop CSV extracts (customers, products, sales) from the
# current working directory, in that order, into one DataFrame each.
customers_df, products_df, sales_df = map(
    pd.read_csv,
    (
        "customers_wk9_thurs.csv",
        "products_wk9_thurs.csv",
        "sales_wk9_thurs.csv",
    ),
)

In [None]:
# Print the column names, dtypes and non-null counts of each loaded frame.
for loaded_frame in (customers_df, products_df, sales_df):
    loaded_frame.info()

In [None]:
# 2.  convert date columns to proper DateTime format using pd.to_datetime()

# Each frame has exactly one date column; parse it in place with pd.to_datetime().
date_columns = (
    (customers_df, 'registration_date'),
    (products_df, 'launch_date'),
    (sales_df, 'transaction_date'),
)
for date_frame, date_column in date_columns:
    date_frame[date_column] = pd.to_datetime(date_frame[date_column])

In [None]:
#3. Extract useful components from the sales transaction dates: year, month, and day of week

# Parse the transaction dates once and reuse the result for every derived
# column (previously the same column was re-parsed four times).
transaction_dates = pd.to_datetime(sales_df['transaction_date'])

sales_df['sales_year'] = transaction_dates.dt.year
sales_df['sales_month'] = transaction_dates.dt.month
# .dt.dayofweek is numeric with Monday=0 ... Sunday=6; .dt.day_name() is the
# matching English weekday name used later for grouping and axis labels.
sales_df['sales_day_of_week'] = transaction_dates.dt.dayofweek
sales_df['sales_day_name'] = transaction_dates.dt.day_name()

# Preview the new columns (last expression, so the notebook renders it).
sales_df.head()

In [None]:
# Step 4: Prepare the data for each chart in the dashboard.
#Hint: You’ll need to merge `sales` with `products` to get category info, group data using `.groupby()`, and count using `.value_counts()`.

# Chart 1 data: total revenue per product category, largest first.
# pd.merge defaults to an inner join, so only sales rows whose product_id
# appears in products_df contribute to the totals.
sales_with_category = pd.merge(
    sales_df,
    products_df[['product_id', 'category']],
    on='product_id',
)
category_revenue = (
    sales_with_category
    .groupby('category')['total_amount']
    .sum()
    .sort_values(ascending=False)
)

# Chart 2 data: number of customers per state (value_counts sorts descending).
state_customer_counts = customers_df['state'].value_counts()

# Chart 3 data: mean transaction amount per weekday, in calendar order.
# Weekdays with no transactions are absent after the groupby; reindex adds
# them back as NaN and fillna turns those gaps into 0.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_avg_amount = (
    sales_df
    .groupby('sales_day_name')['total_amount']
    .mean()
    .reindex(day_order)
    .fillna(0)
)

# Chart 4 data: the raw transaction amounts, used for the histogram that
# shows how transaction values are spread.
transaction_amounts = sales_df['total_amount']

# Step 5: Apply a professional seaborn and matplotlib style.
#Hint: Use `sns.set_theme(style='whitegrid', context='talk')` and customize `plt.rcParams.update()` for cleaner visuals.
# Apply the seaborn theme and the matplotlib overrides in a single call:
# set_theme() first applies the context/style/palette and then updates
# matplotlib's rcParams with the `rc` mapping, so these overrides win.
sns.set_theme(
    context='talk',
    style='whitegrid',
    rc={
        'figure.facecolor': 'white',   # white background for the whole figure
        'axes.facecolor': 'white',     # white background inside each chart
        'font.family': 'DejaVu Sans',  # readable, widely available font
        'axes.spines.top': False,      # no top border (cleaner look)
        'axes.spines.right': False,    # no right border (cleaner look)
    },
)


In [None]:
# Display key statistics first 

# Headline figures shown ahead of the dashboard. Money values are formatted
# as currency (thousands separator, two decimals) so both lines read the same
# way; previously the total printed as "$ <raw float>" and the average had no
# currency sign at all.
total_revenue = sales_df['total_amount'].sum()
average_transaction = sales_df['total_amount'].mean()

print(f" Total Revenue: ${total_revenue:,.2f}")
print(f" Average Transaction: ${average_transaction:,.2f}")
print(f" Product Categories: {products_df['category'].nunique()}")
print(f" States Represented: {customers_df['state'].nunique()}")

In [None]:
# Step 6: Build the 2x2 subplot layout with `plt.subplots()`.
#Hint: Use `(2, 2)` for two rows and two columns. Use `fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(...)` to access each plot area.

# One 20x15 inch figure holding a 2x2 grid of axes:
# ax1 = top-left, ax2 = top-right, ax3 = bottom-left, ax4 = bottom-right.
fig = plt.figure(figsize=(20, 15))
(ax1, ax2), (ax3, ax4) = fig.subplots(2, 2)
fig.suptitle(
    'Retail Chain Performance: Executive Summary Dashboard',
    fontsize=18,
    fontweight='bold',
)

# Step 7: Plot the following charts, one per subplot:

# Top row: two horizontal bar charts drawn with sns.barplot().
#   - Top left:  revenue by product category.
#   - Top right: the six states with the most customers.
top_states = state_customer_counts.head(6)

# (axes, data series, title, x-axis label, y-axis label) for each panel.
bar_panels = (
    (ax1, category_revenue, 'Revenue by Product Category',
     'Total Revenue ($)', 'Product Category'),
    (ax2, top_states, 'Customer Distribution by State (Top 6)',
     'Number of Customers', 'State'),
)
for panel_ax, panel_series, panel_title, panel_xlabel, panel_ylabel in bar_panels:
    # Values on x and labels on y make the bars horizontal.
    sns.barplot(x=panel_series.values, y=panel_series.index, color='skyblue', ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)


# Bottom left: line chart of the average transaction amount per weekday.
# The line is plotted against integer positions 0..n-1; the weekday names
# are attached afterwards as tick labels.
day_positions = np.arange(len(daily_avg_amount))
ax3.plot(
    day_positions,
    daily_avg_amount.values,
    color='skyblue',
    marker='o',
    markersize=6,
    linewidth=2,
)

ax3.set_xticks(day_positions)
ax3.set_xticklabels(daily_avg_amount.index, rotation=45)

ax3.set_title('Average Transaction Value by Day of Week', fontsize=14, fontweight='bold')
ax3.set(xlabel='Day of the Week', ylabel='Average Transaction Amount in ($)')

# Bottom right: histogram of transaction amounts with a dashed vertical
# reference line at the mean, so viewers can see what a "typical" value is.
mean_amount = transaction_amounts.mean()

sns.histplot(data=transaction_amounts, bins=15, color='skyblue', alpha=0.7, ax=ax4)
ax4.set_title('Transaction Amount Distribution', fontsize=14, fontweight='bold')
ax4.set(xlabel='Transaction Amount ($)', ylabel='Count')

ax4.axvline(
    mean_amount,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Mean: ${mean_amount:.2f}',
)

# The mean line is the only labelled artist, so the legend explains just it.
ax4.legend()

# Tighten the spacing between the four panels, then render the dashboard.
fig.tight_layout()
plt.show()

In [None]:
# Data Preparation for Layered Analysis ** As per the answer-key wkshop-learner.ipynb **

print("\n Preparing customer spending analysis...")

# Per-customer spending summary: total amount and number of purchases.
customer_spending = (
    sales_df
    .groupby('customer_id')['total_amount']
    .agg(total_spent='sum', purchase_count='count')
    .reset_index()
)

# Left join keeps every customer, including those with no sales rows.
customer_analysis = customers_df.merge(customer_spending, on='customer_id', how='left')

# Customers without purchases come out of the join as NaN; treat them as 0.
customer_analysis[['total_spent', 'purchase_count']] = (
    customer_analysis[['total_spent', 'purchase_count']].fillna(0)
)

# Age groups for categorical analysis.
# Bins are left-closed (right=False) so the intervals match their labels:
#   [0, 30) 'Under 30', [30, 46) '30-45', [46, 61) '46-60', [61, inf) 'Over 60'.
# With the previous right-closed bins [0, 30, 45, 60, 100] a 30-year-old was
# labelled 'Under 30', and ages of 0 or above 100 fell outside every bin (NaN).
# NOTE(review): boundaries assume whole-number ages - confirm against the data.
customer_analysis['age_group'] = pd.cut(
    customer_analysis['age'],
    bins=[0, 30, 46, 61, np.inf],
    right=False,
    labels=['Under 30', '30-45', '46-60', 'Over 60'],
)

print(" Data preparation complete:")
print(f"   - Customer spending analysis: {len(customer_analysis)} customers")
print(f"   - Age groups created: {customer_analysis['age_group'].value_counts().to_dict()}")
print(f"   - Spending range: ${customer_analysis['total_spent'].min():.0f} - ${customer_analysis['total_spent'].max():.0f}")

In [None]:
# Advanced layered visualization ** As per the answer-key wkshop-learner.ipynb **

# Create a larger figure to accommodate multiple layers and legend
# Layered income-vs-spending figure: every "layer" below draws onto the same Axes.
#
# The seaborn theme and rcParams overrides are applied once, in the Step 5
# styling code, and are deliberately NOT re-applied here. Calling
# sns.set_theme() after plt.subplots() cannot restyle axes that already exist,
# but it does reset matplotlib's global rcParams (dropping the custom font and
# spine overrides), which made this cell render differently when re-run.
fig, ax = plt.subplots(figsize=(14, 8))


# LAYER 1: Foundation Scatter Plot

# Scatter plot of income vs total spending, colour-coded by age group
scatter = sns.scatterplot(data=customer_analysis,
                          x='income',
                          y='total_spent',
                          hue='age_group',
                          alpha=0.7,
                          s=80,
                          ax=ax)


# LAYER 2: Statistical Trend Line

# Regression line for the overall income-spending relationship.
# scatter=False draws only the fit (and its confidence band), not the points again.
sns.regplot(data=customer_analysis,
            x='income',
            y='total_spent',
            scatter=False,
            color='darkred',
            line_kws={'linewidth': 3},
            ax=ax)


# LAYER 3: Business Context References

# Thresholds: mean total spend, and the 75th percentile of income
target_spending = customer_analysis['total_spent'].mean()
high_income_threshold = customer_analysis['income'].quantile(0.75)

# Horizontal line for the target spending threshold
ax.axhline(target_spending, color='green', linestyle='--', linewidth=2,
           alpha=0.8, label=f'Target Spending: ${target_spending:.0f}')

# Vertical line for the high-income threshold
ax.axvline(high_income_threshold, color='orange', linestyle='--', linewidth=2,
           alpha=0.8, label=f'High Income: ${high_income_threshold:,.0f}')


# LAYER 4: Statistical Annotations

# Correlation coefficient between income and total spending
correlation = customer_analysis['income'].corr(customer_analysis['total_spent'])

# Key business metrics
is_high_income = customer_analysis['income'] > high_income_threshold
is_high_spender = customer_analysis['total_spent'] > target_spending

high_income_customers = int(is_high_income.sum())
high_spenders = int(is_high_spender.sum())

# Conversion rate = share of high-income customers who are also high spenders.
# The numerator must be restricted to the high-income group; dividing ALL high
# spenders by the high-income count mixes two populations and can exceed 100%.
high_income_high_spenders = int((is_high_income & is_high_spender).sum())
conversion_rate = (
    high_income_high_spenders / high_income_customers * 100
    if high_income_customers > 0
    else 0
)

# Correlation annotation in the top-left (axes-fraction coordinates)
ax.text(0.05, 0.95, f'Income-Spending Correlation: {correlation:.3f}',
        transform=ax.transAxes, fontsize=12, fontweight='bold',
        bbox=dict(boxstyle="round,pad=0.5", facecolor='lightblue', alpha=0.8))

# Business insights annotation in the top-right
insight_text = f'High Income Customers: {high_income_customers}\nHigh Spenders: {high_spenders}\nConversion Rate: {conversion_rate:.1f}%'
ax.text(0.95, 0.95, insight_text, transform=ax.transAxes, fontsize=11,
        bbox=dict(boxstyle="round,pad=0.5", facecolor='wheat', alpha=0.8),
        verticalalignment='top', horizontalalignment='right')


# LAYER 5: Performance Insights

# Average spending per age group (observed=True skips empty categories)
age_group_avg = customer_analysis.groupby('age_group', observed=True)['total_spent'].mean()
top_age_group = age_group_avg.idxmax()
top_avg_spending = age_group_avg.max()

# The highest spender within the top age group anchors the annotation arrow
top_customer = customer_analysis[customer_analysis['age_group'] == top_age_group].nlargest(1, 'total_spent')

if not top_customer.empty:
    ax.annotate(f'Top Age Group:\n{top_age_group}\nAvg: ${top_avg_spending:.0f}',
                xy=(top_customer['income'].iloc[0], top_customer['total_spent'].iloc[0]),
                xytext=(20, 20), textcoords='offset points',
                bbox=dict(boxstyle="round,pad=0.3", facecolor='yellow', alpha=0.7),
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.1', color='black'),
                fontsize=10, fontweight='bold')


# Title
ax.set_title('Customer Income vs Spending Analysis\nLayered Business Intelligence Dashboard',
             fontsize=16, fontweight='bold', pad=20)

# Axis labels with business context
ax.set_xlabel('Annual Income ($)', fontsize=12, fontweight='bold')
ax.set_ylabel('Total Amount Spent ($)', fontsize=12, fontweight='bold')

# Format both axes as whole-dollar currency
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))

# Rebuild the legend outside the plot area so it also lists the two
# threshold lines added after the scatter plot
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Apply layout and display
plt.tight_layout()
plt.show()