In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Seed Points Sales Prediction
# MAGIC
# MAGIC Predicts sales for seed point expansion locations using the same formula as RMC stores.
# MAGIC Selects top 25% performers for expansion recommendation.

%md
## Parameters

In [None]:
from pyspark.sql import functions as F

# Declare the text widgets together with their default values.
for _widget_name, _widget_default in (
    ("catalog", "geo_site_selection"),
    ("gold_schema", "gold"),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# Read back the values the notebook is running with.
catalog = dbutils.widgets.get("catalog")
gold_schema = dbutils.widgets.get("gold_schema")

# Both the input and the output table live under <catalog>.<gold_schema>.
_gold_namespace = f"{catalog}.{gold_schema}"
seed_points_table = f"{_gold_namespace}.gold_seed_point_isochrones_features"
output_table = f"{_gold_namespace}.gold_seed_points_expansion_top_25"

print(f"Input: {seed_points_table}")
print(f"Output: {output_table}")

%md
## Load Seed Points Trade Area Features

In [None]:
# Read the seed point feature table named in the parameters cell.
seed_points = spark.table(seed_points_table)

# Report the row count, then preview the first few rows.
seed_point_total = seed_points.count()
print(f"Total seed points: {seed_point_total}")

display(seed_points.limit(5))

%md
## Predict Sales

Using the same formula as RMC retail locations:

- Urban areas with young, affluent demographics
- POI density (vibrant neighborhoods)
- Distance from key competitors (ValueMart, QuickShop)

In [None]:
# Score every seed point with the same sales prediction formula as RMC stores.
# Each term of the formula is built as a named column expression first; the
# terms are then summed in the same order as the original expression.

# Demographic inputs (direct scaling).
young_adults = F.col("male_18_to_24") + F.col("female_18_to_24")
income_100k_plus = (
    F.col("income_100k_125k")
    + F.col("income_125k_150k")
    + F.col("income_150k_200k")
    + F.col("income_200k_plus")
)
degree_holders = F.col("bachelors_degree") + F.col("masters_degree")

# Competitor distances: a null distance is replaced by 0, so it adds no bonus.
valuemart_distance = F.coalesce(F.col("distance_to_valuemart_miles"), F.lit(0))
quickshop_distance = F.coalesce(F.col("distance_to_quickshop_market_miles"), F.lit(0))

annual_sales_expr = (
    300000                                 # base: 300k
    + young_adults * 30                    # demographics
    + income_100k_plus * 5
    + degree_holders * 3
    + F.col("total_poi_count") * 100       # POI impact
    + valuemart_distance * 10000           # competitor distance bonus
    + quickshop_distance * 5000
    + F.col("total_population") / 100      # population impact
)

seed_points_with_sales = (
    seed_points
    .withColumn("predicted_annual_sales", annual_sales_expr.cast("long"))
    # Monthly figure is derived from the already-truncated annual value.
    .withColumn(
        "predicted_monthly_sales",
        (F.col("predicted_annual_sales") / 12).cast("long"),
    )
)

print(f"Sales predictions generated for {seed_points_with_sales.count()} seed points")

# Preview the ten highest predictions.
preview_columns = [
    "store_number",
    "city",
    "total_population",
    "total_poi_count",
    "distance_to_valuemart_miles",
    "predicted_annual_sales",
    "predicted_monthly_sales",
]
display(
    seed_points_with_sales
    .select(*preview_columns)
    .orderBy(F.col("predicted_annual_sales").desc())
    .limit(10)
)

%md
## Select Top 25%

In [None]:
# Calculate the 75th percentile threshold and the total row count in a single
# aggregation, so the prediction DataFrame is scanned once instead of twice.
threshold_stats = seed_points_with_sales.agg(
    F.expr("percentile_approx(predicted_annual_sales, 0.75)").alias("p75"),
    F.count(F.lit(1)).alias("total_count"),
).first()
percentile_75 = threshold_stats["p75"]
total_count = threshold_stats["total_count"]

# percentile_approx yields NULL when there is no non-null prediction (empty
# input, or every prediction is null). Fail with an explicit message instead
# of an opaque TypeError from the number formatting below.
if percentile_75 is None:
    raise ValueError(
        "Cannot select the top 25%: no non-null predicted_annual_sales values "
        f"were found among {total_count} seed points."
    )

print(f"75th percentile threshold: ${percentile_75:,}")

# Filter to top 25%: keep rows at or above the threshold.
top_25_percent = seed_points_with_sales.filter(
    F.col("predicted_annual_sales") >= percentile_75
)

top_count = top_25_percent.count()

print(f"Top 25%: {top_count} locations out of {total_count} total")

display(
    top_25_percent.select(
        "store_number",
        "latitude",
        "longitude",
        "city",
        "state",
        # "urbanicity_category",
        "total_population",
        "total_poi_count",
        "distance_to_valuemart_miles",
        "predicted_annual_sales",
        "predicted_monthly_sales",
    ).orderBy(F.desc("predicted_annual_sales"))
)

%md
## Write to Gold

In [None]:
# Tag the selected rows with a processing timestamp column.
top_25_final = top_25_percent.withColumn(
    "processing_timestamp", F.current_timestamp()
)

# Configure the Delta writer: overwrite mode, with schema overwrite enabled.
gold_writer = (
    top_25_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)

# Write to the gold layer table.
gold_writer.saveAsTable(output_table)

print(f"\n✓ Written {top_count} top performing seed points to {output_table}")

%md
## Summary Statistics

In [None]:
# Aggregate summary of the table written above: row count, the spread of the
# predicted annual sales, and averages of the main input features.
summary_query = f"""
  SELECT
    COUNT(*) as total_locations,
    ROUND(AVG(predicted_annual_sales), 0) as avg_predicted_sales,
    ROUND(MIN(predicted_annual_sales), 0) as min_predicted_sales,
    ROUND(MAX(predicted_annual_sales), 0) as max_predicted_sales,
    ROUND(AVG(total_population), 0) as avg_population,
    ROUND(AVG(total_poi_count), 0) as avg_poi_count,
    ROUND(AVG(distance_to_valuemart_miles), 2) as avg_distance_valuemart
  FROM {output_table}
"""

summary_stats = spark.sql(summary_query)
display(summary_stats)