# Query 4: First configuration

In [7]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1778,application_1765289937462_1762,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1720,application_1765289937462_1704,pyspark,idle,Link,Link,,
1769,application_1765289937462_1753,pyspark,idle,Link,Link,,
1773,application_1765289937462_1757,pyspark,idle,Link,Link,,
1776,application_1765289937462_1760,pyspark,idle,Link,Link,,
1777,application_1765289937462_1761,pyspark,idle,Link,Link,,
1778,application_1765289937462_1762,pyspark,idle,Link,Link,,✔


In [8]:
# Import only the entry point that is actually used instead of a wildcard
# import, so the notebook namespace is not filled with unrelated Sedona names.
from sedona.spark import SedonaContext

# Wrap the session that sparkmagic exposes as `spark` in a Sedona context.
# The query cell later calls ST_Point / ST_DistanceSphere through `spark.sql`;
# presumably this call is what registers those functions on the session --
# confirm against the Sedona documentation.
sedona = SedonaContext.create(spark)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, row_number, desc
from pyspark.sql.window import Window
import time

# Explicit schema for LA_Police_Stations.csv.
# X and Y feed ST_Point(X, Y) in the distance query, and Division is the
# station name reported in the final result table.
police_station_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("X", DoubleType()),
        ("Y", DoubleType()),
        ("FID", IntegerType()),
        ("Division", StringType()),
        ("Location", StringType()),
        ("PREC", IntegerType()),
    )
])

# Explicit schema shared by both LA crime CSV files (2010-2019 and 2020-2025).
# Only DR_NO, LAT and LON are referenced by the query in this notebook; the
# remaining fields are declared so every CSV column has a typed slot.
crime_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    # Was "Status Descent" (copy-paste slip from "Vict Descent"). The field is
    # the description paired with "Status", like the other "* Desc" columns.
    StructField("Status Desc", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType()),
])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Query 4, timed end to end: load the CSVs, find the nearest police station
# for every crime, then report the average distance and crime count per
# station.

# Common S3 prefix of all input files.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

# Nothing is cached yet in a fresh session; called anyway so this cell runs
# the same steps as the other configuration runs in this notebook.
spark.catalog.clearCache()

# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by wall-clock adjustments the way time.time() can.
start = time.perf_counter()

police_stations_df = spark.read.csv(
    DATA_ROOT + "/LA_Police_Stations.csv",
    header=True,
    schema=police_station_schema,
)

data1 = spark.read.csv(
    DATA_ROOT + "/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=True,
    schema=crime_schema,
)

data2 = spark.read.csv(
    DATA_ROOT + "/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=True,
    schema=crime_schema,
)

# Both files share crime_schema, so a positional union is safe.
crime_df = data1.union(data2)

# SQL Implementation
# Create views
police_stations_df.createOrReplaceTempView("stations")
crime_df.createOrReplaceTempView("crimes")

# Drop crimes located on "null island" (0, 0) and crimes with a missing
# coordinate. The IS NOT NULL guards are required: under SQL three-valued
# logic `NOT (LON=0 AND LAT=0)` alone is TRUE for a row where one coordinate
# is NULL and the other is non-zero, and such a row would get a NULL distance
# and be counted against an arbitrary station.
query0 = "SELECT * FROM crimes WHERE LAT IS NOT NULL AND LON IS NOT NULL AND NOT (LON=0 AND LAT=0)"
no_null_crime_df = spark.sql(query0)
# Re-register under the same name; no_null_crime_df was already built from the
# unfiltered view, so from here on "crimes" means the filtered rows.
no_null_crime_df.createOrReplaceTempView("crimes")

# First query: cross join every crime with every station, compute the
# distance (divided by 1000 -- presumably metres to kilometres, confirm against
# the ST_DistanceSphere docs), then keep the closest station per DR_NO.
query1 = "SELECT Division, Distance FROM \
    (SELECT *, ROW_NUMBER() OVER (PARTITION BY DR_NO ORDER BY Distance) AS row_num FROM \
        (SELECT DR_NO, Division,\
        ST_DistanceSphere(ST_Point(LON, LAT),ST_Point(X, Y))/1000 AS Distance \
        FROM crimes,stations) AS res1 \
    ) WHERE row_num=1"

intermediate_res = spark.sql(query1)
intermediate_res.createOrReplaceTempView("closest_stations")

# Second query: get average distance and total count of crimes per station
query2 = "SELECT Division, ROUND(AVG(Distance),2) AS average_distance, COUNT(*) AS count\
    FROM closest_stations GROUP BY Division ORDER BY count DESC"
res = spark.sql(query2)
# show() is the action that actually executes the whole pipeline.
res.show(21)

end = time.perf_counter()
print("Elapsed time: ",end-start)
# Printed after the timer stops so plan formatting is not part of the timing.
res.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+----------------+------+
|        Division|average_distance| count|
+----------------+----------------+------+
|       HOLLYWOOD|            2.07|224124|
|        VAN NUYS|            2.94|208129|
|       SOUTHWEST|            2.19|189119|
|        WILSHIRE|            2.59|186383|
|     77TH STREET|            1.72|170620|
| NORTH HOLLYWOOD|            2.64|168096|
|         OLYMPIC|            1.73|162805|
|         PACIFIC|            3.85|162027|
|         CENTRAL|            0.99|154689|
|         RAMPART|            1.53|153204|
|       SOUTHEAST|            2.44|143803|
|     WEST VALLEY|            3.02|136622|
|        FOOTHILL|            4.26|132482|
|         TOPANGA|             3.3|131054|
|          HARBOR|             3.7|127071|
|      HOLLENBECK|            2.68|116235|
|WEST LOS ANGELES|            2.79|115969|
|          NEWTON|            1.64|111392|
|       NORTHEAST|            3.62|108243|
|         MISSION|            3.68| 97926|
|      DEVO

# Second configuration

In [11]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1779,application_1765289937462_1763,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1720,application_1765289937462_1704,pyspark,idle,Link,Link,,
1769,application_1765289937462_1753,pyspark,idle,Link,Link,,
1773,application_1765289937462_1757,pyspark,idle,Link,Link,,
1776,application_1765289937462_1760,pyspark,idle,Link,Link,,
1777,application_1765289937462_1761,pyspark,idle,Link,Link,,
1779,application_1765289937462_1763,pyspark,idle,Link,Link,,✔


In [12]:
# Import only the entry point that is actually used instead of a wildcard
# import, so the notebook namespace is not filled with unrelated Sedona names.
from sedona.spark import SedonaContext

# Wrap the session that sparkmagic exposes as `spark` in a Sedona context.
# The query cell later calls ST_Point / ST_DistanceSphere through `spark.sql`;
# presumably this call is what registers those functions on the session --
# confirm against the Sedona documentation.
sedona = SedonaContext.create(spark)

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, row_number, desc
from pyspark.sql.window import Window
import time

# Explicit schema for LA_Police_Stations.csv.
# X and Y feed ST_Point(X, Y) in the distance query, and Division is the
# station name reported in the final result table.
police_station_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("X", DoubleType()),
        ("Y", DoubleType()),
        ("FID", IntegerType()),
        ("Division", StringType()),
        ("Location", StringType()),
        ("PREC", IntegerType()),
    )
])

# Explicit schema shared by both LA crime CSV files (2010-2019 and 2020-2025).
# Only DR_NO, LAT and LON are referenced by the query in this notebook; the
# remaining fields are declared so every CSV column has a typed slot.
crime_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    # Was "Status Descent" (copy-paste slip from "Vict Descent"). The field is
    # the description paired with "Status", like the other "* Desc" columns.
    StructField("Status Desc", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType()),
])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Query 4, timed end to end: load the CSVs, find the nearest police station
# for every crime, then report the average distance and crime count per
# station.

# Common S3 prefix of all input files.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

# Drop anything cached in this session before the timer starts.
spark.catalog.clearCache()

# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by wall-clock adjustments the way time.time() can.
start = time.perf_counter()

police_stations_df = spark.read.csv(
    DATA_ROOT + "/LA_Police_Stations.csv",
    header=True,
    schema=police_station_schema,
)

data1 = spark.read.csv(
    DATA_ROOT + "/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=True,
    schema=crime_schema,
)

data2 = spark.read.csv(
    DATA_ROOT + "/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=True,
    schema=crime_schema,
)

# Both files share crime_schema, so a positional union is safe.
crime_df = data1.union(data2)

# SQL Implementation
# Create views
police_stations_df.createOrReplaceTempView("stations")
crime_df.createOrReplaceTempView("crimes")

# Drop crimes located on "null island" (0, 0) and crimes with a missing
# coordinate. The IS NOT NULL guards are required: under SQL three-valued
# logic `NOT (LON=0 AND LAT=0)` alone is TRUE for a row where one coordinate
# is NULL and the other is non-zero, and such a row would get a NULL distance
# and be counted against an arbitrary station.
query0 = "SELECT * FROM crimes WHERE LAT IS NOT NULL AND LON IS NOT NULL AND NOT (LON=0 AND LAT=0)"
no_null_crime_df = spark.sql(query0)
# Re-register under the same name; no_null_crime_df was already built from the
# unfiltered view, so from here on "crimes" means the filtered rows.
no_null_crime_df.createOrReplaceTempView("crimes")

# First query: cross join every crime with every station, compute the
# distance (divided by 1000 -- presumably metres to kilometres, confirm against
# the ST_DistanceSphere docs), then keep the closest station per DR_NO.
query1 = "SELECT Division, Distance FROM \
    (SELECT *, ROW_NUMBER() OVER (PARTITION BY DR_NO ORDER BY Distance) AS row_num FROM \
        (SELECT DR_NO, Division,\
        ST_DistanceSphere(ST_Point(LON, LAT),ST_Point(X, Y))/1000 AS Distance \
        FROM crimes,stations) AS res1 \
    ) WHERE row_num=1"

intermediate_res = spark.sql(query1)
intermediate_res.createOrReplaceTempView("closest_stations")

# Second query: get average distance and total count of crimes per station
query2 = "SELECT Division, ROUND(AVG(Distance),2) AS average_distance, COUNT(*) AS count\
    FROM closest_stations GROUP BY Division ORDER BY count DESC"
res = spark.sql(query2)
# show() is the action that actually executes the whole pipeline.
res.show(21)

end = time.perf_counter()
print("Elapsed time: ",end-start)
# The plan is printed by the first-configuration run; uncomment to inspect it
# for this configuration as well.
#res.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+----------------+------+
|        Division|average_distance| count|
+----------------+----------------+------+
|       HOLLYWOOD|            2.07|224124|
|        VAN NUYS|            2.94|208129|
|       SOUTHWEST|            2.19|189119|
|        WILSHIRE|            2.59|186383|
|     77TH STREET|            1.72|170620|
| NORTH HOLLYWOOD|            2.64|168096|
|         OLYMPIC|            1.73|162805|
|         PACIFIC|            3.85|162027|
|         CENTRAL|            0.99|154689|
|         RAMPART|            1.53|153204|
|       SOUTHEAST|            2.44|143803|
|     WEST VALLEY|            3.02|136622|
|        FOOTHILL|            4.26|132482|
|         TOPANGA|             3.3|131054|
|          HARBOR|             3.7|127071|
|      HOLLENBECK|            2.68|116235|
|WEST LOS ANGELES|            2.79|115969|
|          NEWTON|            1.64|111392|
|       NORTHEAST|            3.62|108243|
|         MISSION|            3.68| 97926|
|      DEVO

# Third configuration

In [14]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1780,application_1765289937462_1764,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1720,application_1765289937462_1704,pyspark,idle,Link,Link,,
1769,application_1765289937462_1753,pyspark,idle,Link,Link,,
1773,application_1765289937462_1757,pyspark,idle,Link,Link,,
1777,application_1765289937462_1761,pyspark,idle,Link,Link,,
1780,application_1765289937462_1764,pyspark,idle,Link,Link,,✔


In [15]:
# Import only the entry point that is actually used instead of a wildcard
# import, so the notebook namespace is not filled with unrelated Sedona names.
from sedona.spark import SedonaContext

# Wrap the session that sparkmagic exposes as `spark` in a Sedona context.
# The query cell later calls ST_Point / ST_DistanceSphere through `spark.sql`;
# presumably this call is what registers those functions on the session --
# confirm against the Sedona documentation.
sedona = SedonaContext.create(spark)

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, row_number, desc
from pyspark.sql.window import Window
import time

# Explicit schema for LA_Police_Stations.csv.
# X and Y feed ST_Point(X, Y) in the distance query, and Division is the
# station name reported in the final result table.
police_station_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("X", DoubleType()),
        ("Y", DoubleType()),
        ("FID", IntegerType()),
        ("Division", StringType()),
        ("Location", StringType()),
        ("PREC", IntegerType()),
    )
])

# Explicit schema shared by both LA crime CSV files (2010-2019 and 2020-2025).
# Only DR_NO, LAT and LON are referenced by the query in this notebook; the
# remaining fields are declared so every CSV column has a typed slot.
crime_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    # Was "Status Descent" (copy-paste slip from "Vict Descent"). The field is
    # the description paired with "Status", like the other "* Desc" columns.
    StructField("Status Desc", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType()),
])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Query 4, timed end to end: load the CSVs, find the nearest police station
# for every crime, then report the average distance and crime count per
# station.

# Common S3 prefix of all input files.
DATA_ROOT = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data"

# Drop anything cached in this session before the timer starts.
spark.catalog.clearCache()

# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by wall-clock adjustments the way time.time() can.
start = time.perf_counter()

police_stations_df = spark.read.csv(
    DATA_ROOT + "/LA_Police_Stations.csv",
    header=True,
    schema=police_station_schema,
)

data1 = spark.read.csv(
    DATA_ROOT + "/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=True,
    schema=crime_schema,
)

data2 = spark.read.csv(
    DATA_ROOT + "/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=True,
    schema=crime_schema,
)

# Both files share crime_schema, so a positional union is safe.
crime_df = data1.union(data2)

# SQL Implementation
# Create views
police_stations_df.createOrReplaceTempView("stations")
crime_df.createOrReplaceTempView("crimes")

# Drop crimes located on "null island" (0, 0) and crimes with a missing
# coordinate. The IS NOT NULL guards are required: under SQL three-valued
# logic `NOT (LON=0 AND LAT=0)` alone is TRUE for a row where one coordinate
# is NULL and the other is non-zero, and such a row would get a NULL distance
# and be counted against an arbitrary station.
query0 = "SELECT * FROM crimes WHERE LAT IS NOT NULL AND LON IS NOT NULL AND NOT (LON=0 AND LAT=0)"
no_null_crime_df = spark.sql(query0)
# Re-register under the same name; no_null_crime_df was already built from the
# unfiltered view, so from here on "crimes" means the filtered rows.
no_null_crime_df.createOrReplaceTempView("crimes")

# First query: cross join every crime with every station, compute the
# distance (divided by 1000 -- presumably metres to kilometres, confirm against
# the ST_DistanceSphere docs), then keep the closest station per DR_NO.
query1 = "SELECT Division, Distance FROM \
    (SELECT *, ROW_NUMBER() OVER (PARTITION BY DR_NO ORDER BY Distance) AS row_num FROM \
        (SELECT DR_NO, Division,\
        ST_DistanceSphere(ST_Point(LON, LAT),ST_Point(X, Y))/1000 AS Distance \
        FROM crimes,stations) AS res1 \
    ) WHERE row_num=1"

intermediate_res = spark.sql(query1)
intermediate_res.createOrReplaceTempView("closest_stations")

# Second query: get average distance and total count of crimes per station
query2 = "SELECT Division, ROUND(AVG(Distance),2) AS average_distance, COUNT(*) AS count\
    FROM closest_stations GROUP BY Division ORDER BY count DESC"
res = spark.sql(query2)
# show() is the action that actually executes the whole pipeline.
res.show(21)

end = time.perf_counter()
print("Elapsed time: ",end-start)
# The plan is printed by the first-configuration run; uncomment to inspect it
# for this configuration as well.
#res.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+----------------+------+
|        Division|average_distance| count|
+----------------+----------------+------+
|       HOLLYWOOD|            2.07|224124|
|        VAN NUYS|            2.94|208129|
|       SOUTHWEST|            2.19|189119|
|        WILSHIRE|            2.59|186383|
|     77TH STREET|            1.72|170620|
| NORTH HOLLYWOOD|            2.64|168096|
|         OLYMPIC|            1.73|162805|
|         PACIFIC|            3.85|162027|
|         CENTRAL|            0.99|154689|
|         RAMPART|            1.53|153204|
|       SOUTHEAST|            2.44|143803|
|     WEST VALLEY|            3.02|136622|
|        FOOTHILL|            4.26|132482|
|         TOPANGA|             3.3|131054|
|          HARBOR|             3.7|127071|
|      HOLLENBECK|            2.68|116235|
|WEST LOS ANGELES|            2.79|115969|
|          NEWTON|            1.64|111392|
|       NORTHEAST|            3.62|108243|
|         MISSION|            3.68| 97926|
|      DEVO