# Sales Driver Analysis

Exploratory analysis to understand what drives sales for RMC Retail locations.

**Approach:**
1. Load trade area features for RMC stores
2. Join with sales data
3. Summary statistics for all variables
4. Correlation analysis between features and sales
5. Visualizations to identify key drivers

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Configuration: the trade-area table is addressed by its three-part
# name, <catalog>.<schema>.<table>.
catalog = "geo_site_selection"
gold_schema = "gold"
trade_area_table = ".".join(
    (catalog, gold_schema, "rmc_urbanicity_based_isochrones_enriched")
)

## Load Trade Area Features

In [None]:
# Read the enriched trade-area feature table, then report its row count,
# its schema, and a five-row preview.
# NOTE(review): the "Total stores" label assumes one row per store - confirm.
trade_area_features = spark.table(trade_area_table)

store_total = trade_area_features.count()
print(f"Total stores: {store_total}")
print("\nSchema:")
trade_area_features.printSchema()

preview_rows = trade_area_features.limit(5)
display(preview_rows)

## Generate or Join Sales Data

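If sales data exists in a separate table, update the cell below to join it (as written, it looks for `geo_site_selection.bronze.rmc_sales_data` and joins on `store_number`).
Otherwise, the cell generates synthetic sales from a fixed formula over the trade area features, for demonstration only; correlations computed on synthetic sales simply reflect that formula.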

In [None]:
# Build `data`: trade-area features plus sales columns.
#
# Preferred path: join the real sales table on `store_number`.
# Fallback path: if Spark cannot resolve the table (or the join column),
# fabricate synthetic sales from a fixed formula and persist them to the
# gold layer for demonstration.
#
# Only AnalysisException triggers the fallback. Any other failure
# (interrupt, cluster/network error, ...) propagates instead of silently
# overwriting the gold sales table with synthetic numbers.
from pyspark.sql.utils import AnalysisException

sales_table_exists = False

try:
    sales_df = spark.table(f"{catalog}.bronze.rmc_sales_data")
    print("Sales data found, joining...")

    # Join with trade area features
    data = trade_area_features.join(sales_df, "store_number", "inner")

    # Flag is set only once both the lookup and the join succeeded.
    sales_table_exists = True

except AnalysisException as exc:
    print("No sales table found. Generating synthetic sales data for demonstration...")
    # Surface the underlying cause so a broken join is not mistaken for
    # a missing table.
    print(f"  Reason: {exc}")
    print("\nSales Formula Story:")
    print("- Target: Urban areas with younger, affluent demographics")
    print("- POI density is important (vibrant neighborhoods)")
    print("- Distance from KEY competitors matters (further = better)")
    print("- Other general competitors are acceptable\n")

    # Simplified formula that clearly varies with inputs.
    # NOTE(review): only the competitor distances are coalesced to 0; a
    # null in any other input column makes annual_sales null for that
    # store - confirm those inputs are never null.
    data = (trade_area_features
        .withColumn(
            "annual_sales",
            (
                # Base: 300k
                300000 +

                # Urbanicity category bonus
                F.when(F.col("urbanicity_category") == "urban", 200000)
                 .when(F.col("urbanicity_category") == "suburban", 100000)
                 .otherwise(50000) +

                # Demographics - direct scaling
                (F.col("male_18_to_24") + F.col("female_18_to_24")) * 30 +
                (F.col("income_100k_125k") + F.col("income_125k_150k") + F.col("income_150k_200k") + F.col("income_200k_plus")) * 5 +
                (F.col("bachelors_degree") + F.col("masters_degree")) * 3 +

                # POI impact
                F.col("total_poi_count") * 100 +

                # Competitor distance bonus (missing distance counts as 0)
                F.coalesce(F.col("distance_to_valuemart_miles"), F.lit(0)) * 10000 +
                F.coalesce(F.col("distance_to_quickshop_market_miles"), F.lit(0)) * 5000 +

                # Population impact
                (F.col("total_population") / 100)
            ).cast("long")
        )
        .withColumn("monthly_sales", (F.col("annual_sales") / 12).cast("long"))
    )

    # Save sales to gold layer (overwrites any existing table and schema)
    sales_table = f"{catalog}.{gold_schema}.rmc_retail_location_sales"

    sales_only = data.select(
        "store_number",
        "store_type",
        "city",
        "state",
        "urbanicity_category",
        "annual_sales",
        "monthly_sales"
    ).withColumn("created_timestamp", F.current_timestamp())

    (
        sales_only
        .write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(sales_table)
    )

    print(f"\n✓ Sales data saved to {sales_table}")

# Sanity check: report the row count, then show selected inputs next to
# the resulting sales columns, highest annual sales first.
print(f"\nData prepared with {data.count()} stores")
print("\nDEBUG - Checking for variation in inputs and outputs:")

debug_columns = [
    "store_number",
    "city",
    "urbanicity_category",
    "total_population",
    "male_18_to_24",
    "female_18_to_24",
    "income_100k_125k",
    "total_poi_count",
    "distance_to_valuemart_miles",
    "annual_sales",
    "monthly_sales",
]
debug_view = data.select(*debug_columns).orderBy(F.col("annual_sales").desc())
display(debug_view)

## Summary Statistics

In [None]:
# Collect the names of all columns whose Spark type name is one of
# long / double / integer / float, then list them alphabetically.
numeric_type_names = {'long', 'double', 'integer', 'float'}
numeric_cols = [
    fld.name
    for fld in data.schema.fields
    if fld.dataType.typeName() in numeric_type_names
]

print(f"Total numeric features: {len(numeric_cols)}\n")
print("Numeric columns:")
for column_name in sorted(numeric_cols):
    print(f"  - {column_name}")

In [None]:
# Spark's summary() output for sales and a handful of headline features.
key_metric_columns = [
    "annual_sales",
    "total_population",
    "median_household_income",
    "total_poi_count",
    "total_competitor_count",
    "urbanicity_score",
    "population_density",
    "area_sqkm",
]
key_metric_summary = data.select(*key_metric_columns).summary()
display(key_metric_summary)

In [None]:
# Per-urbanicity-category rollup: store count, avg/min/max annual sales,
# and the average of four trade-area features, all rounded to whole
# numbers and ordered by average annual sales (highest first).
sales_aggs = [
    F.round(agg_fn("annual_sales"), 0).alias(f"{prefix}_annual_sales")
    for prefix, agg_fn in (("avg", F.avg), ("min", F.min), ("max", F.max))
]
feature_avgs = [
    F.round(F.avg(source), 0).alias(alias)
    for source, alias in (
        ("total_population", "avg_population"),
        ("median_household_income", "avg_income"),
        ("total_poi_count", "avg_poi_count"),
        ("total_competitor_count", "avg_competitor_count"),
    )
]

urbanicity_summary = (
    data.groupBy("urbanicity_category")
    .agg(F.count("store_number").alias("store_count"), *sales_aggs, *feature_avgs)
    .orderBy(F.desc("avg_annual_sales"))
)
display(urbanicity_summary)

## Correlation Analysis

In [None]:
# Pull a fixed shortlist of columns into pandas and compute their
# pairwise correlation matrix; later cells reuse `pdf`, `corr_matrix`
# and `sales_corr`.
correlation_cols = [
    "annual_sales",
    "total_population",
    "median_household_income",
    "per_capita_income",
    "total_poi_count",
    "total_competitor_count",
    "urbanicity_score",
    "population_density",
    "area_sqkm",
    "h3_cell_count",
    "drive_time_minutes"
]

# Keep only the shortlisted columns that are actually present in `data`
available_columns = set(data.columns)
existing_corr_cols = [name for name in correlation_cols if name in available_columns]

# Collect to the driver as a pandas DataFrame
pdf = data.select(existing_corr_cols).toPandas()

# Pairwise correlations (pandas default method)
corr_matrix = pdf.corr()

# Correlation of every column with annual sales, strongest positive first
sales_corr = corr_matrix['annual_sales'].sort_values(ascending=False)
print("Correlations with Annual Sales:\n")
print(sales_corr)

In [None]:
# Annotated heatmap of the full correlation matrix, with the diverging
# colour map centred on zero.
heatmap_options = {
    "annot": True,
    "cmap": 'coolwarm',
    "center": 0,
    "fmt": '.2f',
    "linewidths": 0.5,
    "cbar_kws": {"shrink": 0.8},
}

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of every feature's correlation with annual sales
# (the sales column's self-correlation is removed first). Later cells
# reuse `sales_corr_sorted`.
sales_corr_sorted = sales_corr.drop('annual_sales').sort_values(ascending=False)

plt.figure(figsize=(10, 6))
bar_ax = sales_corr_sorted.plot.barh(color='steelblue')
bar_ax.set_title('Feature Correlations with Annual Sales', fontsize=14, pad=15)
bar_ax.set_xlabel('Correlation Coefficient', fontsize=12)
bar_ax.set_ylabel('Feature', fontsize=12)
# Thin vertical rule at zero separates positive from negative correlations
bar_ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
bar_ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## Detailed Feature Analysis

In [None]:
# Correlate every `poi_count_*` column with annual sales; print the top
# 15 and chart the top 20. Does nothing but report when no such column
# exists.
poi_cols = [name for name in data.columns if name.startswith('poi_count_')]

if not poi_cols:
    print("No POI category columns found")
else:
    print(f"Analyzing {len(poi_cols)} POI categories...\n")

    # Correlations of each POI category with sales, strongest positive first
    poi_pdf = data.select(['annual_sales', *poi_cols]).toPandas()
    poi_vs_sales = poi_pdf.corr()['annual_sales']
    poi_corr = poi_vs_sales.drop('annual_sales').sort_values(ascending=False)

    print("POI Category Correlations with Sales:\n")
    print(poi_corr.head(15))

    # Bar chart of the 20 highest-ranked categories
    plt.figure(figsize=(10, 8))
    top_poi = poi_corr.head(20)
    top_poi.plot(kind='barh', color='coral')
    plt.title('Top 20 POI Categories - Correlation with Sales', fontsize=14, pad=15)
    plt.xlabel('Correlation Coefficient', fontsize=12)
    plt.ylabel('POI Category', fontsize=12)
    plt.tight_layout()
    plt.show()

In [None]:
# Correlate "demographic" columns with annual sales. A column counts as
# demographic when it is numeric (per `numeric_cols`) and its lower-cased
# name contains any of the keywords below as a substring.
demo_keywords = ['population', 'income', 'household', 'education', 'employment', 'age', 'race', 'commute']
numeric_lookup = set(numeric_cols)

demo_cols = ['annual_sales']
for name in data.columns:
    lowered = name.lower()
    if name in numeric_lookup and any(kw in lowered for kw in demo_keywords):
        demo_cols.append(name)

# Proceed only if at least one demographic column was found
if len(demo_cols) > 1:
    demo_pdf = data.select(demo_cols).toPandas()
    demo_vs_sales = demo_pdf.corr()['annual_sales']
    demo_corr = demo_vs_sales.drop('annual_sales').sort_values(ascending=False)

    print(f"Analyzing {len(demo_corr)} demographic features...\n")
    print("Top Demographic Correlations with Sales:\n")
    print(demo_corr.head(20))

    # Bar chart of the 20 highest-ranked features
    plt.figure(figsize=(10, 8))
    top_demo = demo_corr.head(20)
    top_demo.plot(kind='barh', color='mediumseagreen')
    plt.title('Top 20 Demographic Features - Correlation with Sales', fontsize=14, pad=15)
    plt.xlabel('Correlation Coefficient', fontsize=12)
    plt.ylabel('Demographic Feature', fontsize=12)
    plt.tight_layout()
    plt.show()

## Scatter Plots - Key Drivers

In [None]:
# 2x2 grid of scatter plots: annual sales against four candidate drivers.
#
# `pdf` only contains the shortlisted columns that exist in `data`, so
# any of the four features may be absent. Every panel is therefore
# guarded the same way: a missing feature hides its panel and is
# reported, instead of raising KeyError.
scatter_specs = [
    # (column in pdf, x-axis label, panel title, marker colour)
    ('total_population', 'Total Population in Trade Area',
     'Population Impact on Sales', 'steelblue'),
    ('median_household_income', 'Median Household Income ($)',
     'Income Impact on Sales', 'coral'),
    ('total_poi_count', 'Total POI Count in Trade Area',
     'POI Density Impact on Sales', 'mediumseagreen'),
    ('total_competitor_count', 'Total Competitor Count in Trade Area',
     'Competition Impact on Sales', 'crimson'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
fig.suptitle('Sales vs Key Features', fontsize=16, y=1.00)

# axes.flat walks the grid row by row, matching the order of scatter_specs
for ax, (feature, x_label, panel_title, marker_color) in zip(axes.flat, scatter_specs):
    if feature not in pdf.columns:
        ax.set_visible(False)
        print(f"Skipping scatter plot: column '{feature}' is not available")
        continue

    ax.scatter(pdf[feature], pdf['annual_sales'], alpha=0.6, color=marker_color)
    ax.set_xlabel(x_label, fontsize=11)
    ax.set_ylabel('Annual Sales ($)', fontsize=11)
    ax.set_title(panel_title, fontsize=12)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Key Insights Summary

In [None]:
# Print a plain-text summary: strongest positive and negative
# correlations with sales, average sales per urbanicity category, and
# overall sales statistics.


def _fmt_dollars(value):
    """Format a dollar amount with thousands separators; 'n/a' if missing."""
    return "n/a" if value is None else f"${value:,.0f}"


print("=" * 80)
print("KEY INSIGHTS - SALES DRIVERS FOR RMC RETAIL")
print("=" * 80)

# Undefined correlations (NaN, e.g. from a constant column) are excluded,
# and each list is filtered by sign so a positive or NaN value is never
# reported as a "negative factor" and the two lists cannot overlap.
ranked_corr = sales_corr_sorted.dropna()

# Top 3 positive correlations (largest first)
top_3_positive = ranked_corr[ranked_corr > 0].head(3)
print("\nTop 3 Positive Drivers:")
if top_3_positive.empty:
    print("  (none)")
for i, (feature, corr) in enumerate(top_3_positive.items(), 1):
    print(f"  {i}. {feature}: {corr:.3f}")

# Top 3 negative correlations (most negative first)
top_3_negative = ranked_corr[ranked_corr < 0].sort_values().head(3)
print("\nTop 3 Negative Factors:")
if top_3_negative.empty:
    print("  (none)")
for i, (feature, corr) in enumerate(top_3_negative.items(), 1):
    print(f"  {i}. {feature}: {corr:.3f}")

# Performance by urbanicity
print("\nPerformance by Urbanicity:")
urbanicity_stats = data.groupBy("urbanicity_category").agg(
    F.count("store_number").alias("stores"),
    F.round(F.avg("annual_sales"), 0).alias("avg_sales")
).orderBy(F.desc("avg_sales")).collect()

for row in urbanicity_stats:
    category = row['urbanicity_category']
    # A null category forms its own group; label it rather than crash
    label = category.title() if category is not None else "Unknown"
    print(f"  {label}: {_fmt_dollars(row['avg_sales'])} avg ({row['stores']} stores)")

# Overall stats (aggregates are None when they cannot be computed,
# e.g. the standard deviation of a single row)
overall_stats = data.select(
    F.avg("annual_sales").alias("avg_sales"),
    F.min("annual_sales").alias("min_sales"),
    F.max("annual_sales").alias("max_sales"),
    F.stddev("annual_sales").alias("std_sales")
).collect()[0]

print("\nOverall Sales Performance:")
print(f"  Average: {_fmt_dollars(overall_stats['avg_sales'])}")
print(f"  Range: {_fmt_dollars(overall_stats['min_sales'])} - {_fmt_dollars(overall_stats['max_sales'])}")
print(f"  Std Dev: {_fmt_dollars(overall_stats['std_sales'])}")

print("\n" + "=" * 80)

## Export Results

In [None]:
# Turn the feature-vs-sales correlations into a Spark DataFrame ranked by
# absolute correlation and display it. (Nothing is written to a table
# here.)
#
# Undefined correlations (NaN) are stored as null: Spark treats NaN as
# larger than any number, so NaN rows would otherwise head the
# descending ranking; nulls sort last.
correlation_results = []
for feature, corr_value in sales_corr_sorted.items():
    is_undefined = pd.isna(corr_value)
    correlation_results.append({
        'feature': feature,
        'correlation_with_sales': None if is_undefined else float(corr_value),
        'abs_correlation': None if is_undefined else abs(float(corr_value))
    })

# An explicit schema fixes the column order and types, and keeps
# createDataFrame working when there are no rows or when the first rows
# contain nulls (cases where schema inference fails).
corr_schema = "feature string, correlation_with_sales double, abs_correlation double"
corr_df = spark.createDataFrame(correlation_results, schema=corr_schema)
corr_df = corr_df.orderBy(F.desc('abs_correlation'))

display(corr_df)

In [None]:
# Optional: Save full dataset with sales for modeling
# Uncomment to save
# output_table = f"{catalog}.{gold_schema}.rmc_sales_with_features"
# (
#     data
#     .write
#     .format("delta")
#     .mode("overwrite")
#     .option("overwriteSchema", "true")
#     .saveAsTable(output_table)
# )
# print(f"Data saved to {output_table}")