In [15]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from sedona.spark import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time

spark = (
    SparkSession.builder
    .appName("Query5")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)
        
sedona = SedonaContext.create(spark)

start_time=time.time()

precincts_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)\
    .withColumnRenamed("DIVISION","division")\
    .withColumn("geom", ST_Point(F.col("X"),F.col("Y")))\
    .select("division","geom")

crime_data_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True, inferSchema=True
)
crime_data_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    header=True, inferSchema=True
)

# Combine both datasets
crime_data = crime_data_2010_2019.union(crime_data_2020_present)

crime_df = (
    crime_data
    .filter((col("LON") != 0) | (col("LAT") != 0))
    .withColumn("crime_geom", ST_Point(F.col("LON"), F.col("LAT")))
    .select("crime_geom")
)


joined_df = precincts_df.crossJoin(crime_df) \
    .withColumn("distance", ST_DistanceSphere(col("geom"), F.col("crime_geom")))

window_spec = Window.partitionBy("crime_geom").orderBy(F.col("distance"))

closest_division_df = joined_df.withColumn("rank", row_number().over(window_spec))\
    .filter(F.col("rank") == 1) \
    .select("division", "crime_geom", "distance") 
    
result_df = closest_division_df.groupBy("division") \
    .agg(
        F.avg("distance").alias("avg_distance"),
        F.count("crime_geom").alias("#"),
    )\
    .orderBy("#",ascending=False)    
 
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----+
|        division|      avg_distance|   #|
+----------------+------------------+----+
|         PACIFIC| 3617.773433715092|7654|
|        FOOTHILL|4811.3949733459485|7318|
|     WEST VALLEY|3363.5469483212823|6469|
|        VAN NUYS|3430.7651448008714|6453|
|WEST LOS ANGELES|3950.8905508901676|6193|
|         TOPANGA|3818.0253382083038|5971|
| NORTH HOLLYWOOD|3194.6900764893703|5748|
|        WILSHIRE| 2556.598961706194|5731|
|       NORTHEAST| 3695.256227099795|5726|
|          HARBOR|4047.4501072731873|5554|
|         MISSION| 3553.593615521549|5415|
|       HOLLYWOOD|2840.1645352992105|4933|
|      HOLLENBECK|3365.6869973683083|4618|
|       SOUTHWEST|2295.1702816435154|4482|
|      DEVONSHIRE|3083.4413145949866|4437|
|       SOUTHEAST|3321.5950885851876|4347|
|     77TH STREET|1931.1552614873917|3370|
|          NEWTON|1638.2180223355472|3009|
|         RAMPART|1926.0054134765983|2817|
|         OLYMPIC|1726.8573247441022|2760|
+----------

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from sedona.spark import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time

spark = (
    SparkSession.builder
    .appName("Query5")
    .config("spark.executor.instances", "4")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)


        
sedona = SedonaContext.create(spark)

start_time=time.time()

precincts_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)\
    .withColumnRenamed("DIVISION","division")\
    .withColumn("geom", ST_Point(F.col("X"),F.col("Y")))\
    .select("division","geom")

crime_data_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True, inferSchema=True
)
crime_data_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    header=True, inferSchema=True
)

# Combine both datasets
crime_data = crime_data_2010_2019.union(crime_data_2020_present)

crime_df = (
    crime_data
    .filter((col("LON") != 0) | (col("LAT") != 0))
    .withColumn("crime_geom", ST_Point(F.col("LON"), F.col("LAT")))
    .select("crime_geom")
)


joined_df = precincts_df.crossJoin(crime_df) \
    .withColumn("distance", ST_DistanceSphere(col("geom"), F.col("crime_geom")))

window_spec = Window.partitionBy("crime_geom").orderBy(F.col("distance"))

closest_division_df = joined_df.withColumn("rank", row_number().over(window_spec))\
    .filter(F.col("rank") == 1) \
    .select("division", "crime_geom", "distance") 
    
result_df = closest_division_df.groupBy("division") \
    .agg(
        F.avg("distance").alias("avg_distance"),
        F.count("crime_geom").alias("#"),
    )\
    .orderBy("#",ascending=False)    
 
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----+
|        division|      avg_distance|   #|
+----------------+------------------+----+
|         PACIFIC|3617.7734337150914|7654|
|        FOOTHILL|  4811.39497334595|7318|
|     WEST VALLEY|3363.5469483212823|6469|
|        VAN NUYS| 3430.765144800872|6453|
|WEST LOS ANGELES|3950.8905508901676|6193|
|         TOPANGA| 3818.025338208304|5971|
| NORTH HOLLYWOOD|3194.6900764893708|5748|
|        WILSHIRE|2556.5989617061937|5731|
|       NORTHEAST| 3695.256227099794|5726|
|          HARBOR|4047.4501072731864|5554|
|         MISSION|  3553.59361552155|5415|
|       HOLLYWOOD|2840.1645352992114|4933|
|      HOLLENBECK|3365.6869973683083|4618|
|       SOUTHWEST| 2295.170281643516|4482|
|      DEVONSHIRE|3083.4413145949866|4437|
|       SOUTHEAST| 3321.595088585188|4347|
|     77TH STREET|1931.1552614873917|3370|
|          NEWTON|1638.2180223355476|3009|
|         RAMPART|1926.0054134765985|2817|
|         OLYMPIC| 1726.857324744102|2760|
+----------

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from sedona.spark import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time

spark = (
    SparkSession.builder
    .appName("Query5")
    .config("spark.executor.instances", "8")
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)
        
sedona = SedonaContext.create(spark)

start_time=time.time()

precincts_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)\
    .withColumnRenamed("DIVISION","division")\
    .withColumn("geom", ST_Point(F.col("X"),F.col("Y")))\
    .select("division","geom")

crime_data_2010_2019 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True, inferSchema=True
)
crime_data_2020_present = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    header=True, inferSchema=True
)

# Combine both datasets
crime_data = crime_data_2010_2019.union(crime_data_2020_present)

crime_df = (
    crime_data
    .filter((col("LON") != 0) | (col("LAT") != 0))
    .withColumn("crime_geom", ST_Point(F.col("LON"), F.col("LAT")))
    .select("crime_geom")
)


joined_df = precincts_df.crossJoin(crime_df) \
    .withColumn("distance", ST_DistanceSphere(col("geom"), F.col("crime_geom")))

window_spec = Window.partitionBy("crime_geom").orderBy(F.col("distance"))

closest_division_df = joined_df.withColumn("rank", row_number().over(window_spec))\
    .filter(F.col("rank") == 1) \
    .select("division", "crime_geom", "distance") 
    
result_df = closest_division_df.groupBy("division") \
    .agg(
        F.avg("distance").alias("avg_distance"),
        F.count("crime_geom").alias("#"),
    )\
    .orderBy("#",ascending=False)    
 
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----+
|        division|      avg_distance|   #|
+----------------+------------------+----+
|         PACIFIC|3617.7734337150914|7654|
|        FOOTHILL| 4811.394973345951|7318|
|     WEST VALLEY| 3363.546948321281|6469|
|        VAN NUYS| 3430.765144800872|6453|
|WEST LOS ANGELES|3950.8905508901676|6193|
|         TOPANGA|3818.0253382083038|5971|
| NORTH HOLLYWOOD|3194.6900764893708|5748|
|        WILSHIRE| 2556.598961706193|5731|
|       NORTHEAST| 3695.256227099795|5726|
|          HARBOR|4047.4501072731873|5554|
|         MISSION| 3553.593615521549|5415|
|       HOLLYWOOD|2840.1645352992114|4933|
|      HOLLENBECK|3365.6869973683083|4618|
|       SOUTHWEST|2295.1702816435154|4482|
|      DEVONSHIRE|3083.4413145949866|4437|
|       SOUTHEAST|3321.5950885851885|4347|
|     77TH STREET|1931.1552614873917|3370|
|          NEWTON|1638.2180223355472|3009|
|         RAMPART| 1926.005413476599|2817|
|         OLYMPIC|1726.8573247441022|2760|
+----------