# Generate Seed Points for Expansion

Generate potential new store locations from the top 25% of H3 cells, ranked by total POI count, in the gold `h3_features_gold` table.

These locations represent areas with high commercial activity and could be good candidates for new store openings.


In [None]:
# Load the gold-layer H3 feature table
h3_features_df = spark.table(f"{catalog}.gold.h3_features_gold")

# Approximate 75th percentile of total_poi_count; cells at or above this value
# are treated as the top quartile.
poi_percentile = (
    h3_features_df
    .select(F.expr("percentile_approx(total_poi_count, 0.75)").alias("p75"))
    .first()["p75"]
)

print(f"Total POI count threshold (75th percentile): {poi_percentile}")

# Keep cells at or above the threshold. The positivity check still matters when
# the threshold itself is 0, so that cells without any POIs are excluded.
top_h3_cells = h3_features_df.filter(
    (F.col("total_poi_count") >= poi_percentile)
    & (F.col("total_poi_count") > 0)
)

print(f"Number of H3 cells in top 25%: {top_h3_cells.count()}")


In [None]:
# Derive each cell's centroid and its coordinates. Order matters: every entry
# may reference the columns added by the entries before it.
centroid_columns = [
    ("center_wkt", "h3_centeraswkt(h3_cell_id)"),
    ("center_point", "ST_GeomFromWKT(center_wkt, 4326)"),
    ("latitude", "ST_Y(center_point)"),
    ("longitude", "ST_X(center_point)"),
]

h3_with_centroids = top_h3_cells
for column_name, sql_expression in centroid_columns:
    h3_with_centroids = h3_with_centroids.withColumn(column_name, F.expr(sql_expression))

# Census block groups supply the geographic attributes attached by the join below
blockgroups_geo = (
    spark.table(f"{catalog}.bronze.census_blockgroups")
    .select("geoid", "name", "state_fips", "county_fips", "geometry")
)

# Left spatial join: every cell is kept, and block-group columns stay NULL when
# no block-group geometry contains the centroid. The block-group side is
# broadcast to the executors.
h3_with_geo = h3_with_centroids.join(
    F.broadcast(blockgroups_geo),
    on=F.expr("ST_Contains(geometry, center_point)"),
    how="left",
)

preview_columns = ["h3_cell_id", "latitude", "longitude", "name", "state_fips", "total_poi_count"]
display(h3_with_geo.select(*preview_columns).limit(10))


In [None]:
# Format seed points to match RMC locations structure
# Each top-quartile H3 cell becomes one synthetic "New Expansion" store row.
from pyspark.sql.window import Window

# Number stores by descending POI count. h3_cell_id is a tie-breaker: without it,
# cells sharing the same POI count could receive different row numbers each time
# this lazy DataFrame is evaluated (e.g. the preview below vs. any later action).
# Note: a window without partitionBy pulls all rows into a single partition.
window_spec = Window.orderBy(F.desc("total_poi_count"), F.col("h3_cell_id"))

# Text before the first comma of the block-group name. regexp_extract returns ""
# (not NULL) when the pattern does not match, and NULL when `name` is NULL, so the
# fallback must be keyed on a non-empty extraction rather than on `name` alone.
city_from_name = F.regexp_extract(F.col("name"), r"^(.*?),", 1)

seed_points_expansion = (h3_with_geo
    .withColumn("row_num", F.row_number().over(window_spec))
    .withColumn("store_number", (20000 + F.col("row_num")).cast("string"))  # Start at 20001
    .withColumn("store_type", F.lit("New Expansion"))
    .withColumn("address", F.concat(F.col("store_number"), F.lit(" Expansion Blvd")))
    .withColumn("city",
                # NULL or empty extraction -> condition is not true -> default applies
                F.when(city_from_name != "", city_from_name)
                .otherwise("Boston"))  # Default to Boston if no match
    .withColumn("state", F.lit("MA"))
    .withColumn("zip_code", F.lit("02101"))  # Placeholder
    # Synthetic phone number "(774) 8xx-xxxx"/"(774) 9xx-xxxx" derived from the row number
    .withColumn("phone_number", F.concat(
        F.lit("(774) "),
        F.lpad((800 + (F.col("row_num") % 200)).cast("string"), 3, "0"),
        F.lit("-"),
        F.lpad((F.col("row_num") * 17 % 10000).cast("string"), 4, "0")
    ))
    .withColumn("store_hours", F.lit("Monday: 08:00-22:00|Tuesday: 08:00-22:00|Wednesday: 08:00-22:00|Thursday: 08:00-22:00|Friday: 08:00-22:00|Saturday: 08:00-22:00|Sunday: 08:00-22:00"))
    .withColumn("store_services", F.lit("Planned Location"))
    .withColumn("country", F.lit("United States"))
    # NOTE(review): this yields a FIPS code when the join matched but a county
    # name otherwise -- confirm which form downstream consumers expect.
    .withColumn("county",
                F.when(F.col("county_fips").isNotNull(), F.col("county_fips"))
                .otherwise("Suffolk County"))
    .withColumn("country_code", F.lit("US"))
    .withColumn("geo_accuracy", F.lit("H3_CENTROID"))
    .select(
        "store_number", "store_type", "address", "city", "state", "zip_code",
        "phone_number", "latitude", "longitude", "store_hours", "store_services",
        "country", "county", "country_code", "geo_accuracy"
    )
)

display(seed_points_expansion.limit(20))


In [None]:
# Persist the seed points as a bronze table, replacing any previous contents
seed_points_table = f"{catalog}.bronze.seed_points_expansion"

seed_points_expansion.write.saveAsTable(seed_points_table, mode="overwrite")

print(f"Written seed points expansion to {seed_points_table}")

# Sanity-check the written table: row count, distinct cities / store types,
# and the latitude/longitude bounding box of the seed points.
verification_sql = f"""
    SELECT 
        COUNT(*) as total_seed_points,
        COUNT(DISTINCT city) as unique_cities,
        COUNT(DISTINCT store_type) as unique_store_types,
        MIN(latitude) as min_lat,
        MAX(latitude) as max_lat,
        MIN(longitude) as min_lon,
        MAX(longitude) as max_lon
    FROM {seed_points_table}
"""
display(spark.sql(verification_sql))


In [None]:
# Visualize seed points on map
# Reads the persisted table and returns the columns needed to plot and label
# each seed point. store_number is stored as a string, so it is cast for the
# sort; a plain string sort would place "100000" before "20001" once the
# numbering grows past five digits.
display(spark.sql(f"""
    SELECT 
        store_number,
        store_type,
        city,
        state,
        latitude,
        longitude,
        store_services,
        geo_accuracy
    FROM {seed_points_table}
    ORDER BY CAST(store_number AS INT)
"""))
