In [None]:
# QUERY 4 - DataFrame API 
# 2 executors, each with 1 core / 2GB memory

import time
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import count, col
from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType, StringType, DoubleType

# Initialize the SparkSession with the executor sizing used for this run.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists, in which case the executor settings below may not be applied --
# confirm this cell runs against a fresh Spark application.
spark = (
    SparkSession.builder
    .appName("DF query 4 execution")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "2")
    .getOrCreate()
)

# SedonaContext.create() registers Sedona's types and functions on the session,
# so the deprecated SedonaRegistrator.registerAll() call is not needed.
sedona = SedonaContext.create(spark)

# Wall-clock timestamp taken before the query is built.
start_time = time.time()

# Communities selected in Query 3, split into the "max" and "min" groups.
communities_max_values = ["Marina Peninsula", "Pacific Palisades", "Palisades Highlands"]
communities_min_values = ["Central", "Vernon Central", "University Park"]

# Location of the census-block GeoJSON on S3.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

# Load the file, emit one row per element of `features`, then expand the
# fields of each feature struct into top-level columns.
features_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
    .selectExpr("explode(features) as features")
)
blocks_df = features_df.select("features.*")

# Promote every field of the nested `properties` struct to its own column and
# keep the geometry alongside; the projection is shared by both groups.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
flattened_columns = [F.col(f"properties.{name}").alias(name) for name in property_names]
flattened_columns.append("geometry")
flat_blocks_df = blocks_df.select(flattened_columns).drop("properties").drop("type")

# Blocks whose COMM value belongs to each group of communities.
communities_filtered_max_df = flat_blocks_df.filter(F.col("COMM").isin(communities_max_values))
communities_filtered_min_df = flat_blocks_df.filter(F.col("COMM").isin(communities_min_values))

# Race and Ethnicity code table: two string columns (code, full description).
races_schema = StructType(
    [StructField(name, StringType()) for name in ("Vict Descent", "Vict Descent Full")]
)

races_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv",
    header=True,
    schema=races_schema,
)

# Crime CSV columns in file order; every column is read as a string except the
# ones listed in crime_double_columns, which are read as doubles.
crime_double_columns = {"Part 1-2", "Premis Cd", "LAT", "LON"}
crime_column_names = [
    "DR_NO", "Date Rptd", "DATE OCC", "TIME OCC",
    "AREA", "AREA NAME", "Rpt Dist No", "Part 1-2",
    "Crm Cd", "Crm Cd Desc", "Mocodes",
    "Vict Age", "Vict Sex", "Vict Descent",
    "Premis Cd", "Premis Desc",
    "Weapon Used Cd", "Weapon Desc",
    "Status", "Status Desc",
    "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4",
    "LOCATION", "Cross Street", "LAT", "LON",
]
crime_schema = StructType([
    StructField(name, DoubleType() if name in crime_double_columns else StringType())
    for name in crime_column_names
])

# Read the crime data with the explicit schema above.
# NOTE(review): the path contains a double slash ("//CrimeData") -- confirm
# this is intentional.
crime_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721//CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True,
    schema=crime_schema,
)

def _victims_within(crimes_df, communities_df):
    """Return the `Vict Descent` of each crime whose point lies within a block of *communities_df*."""
    inside_block = ST_Within(crimes_df.geometry, communities_df.geometry)
    return crimes_df.join(communities_df, inside_block, "inner").select("Vict Descent")


def _count_by_descent(victims_df):
    """Count rows per `Vict Descent` code, largest count first."""
    return (
        victims_df.groupBy("Vict Descent")
        .agg(count("*").alias("#"))
        .orderBy(col("#").desc())
    )


def _with_descent_names(counts_df, codes_df):
    """Replace each descent code with its full description from *codes_df*.

    The join is a left join, so codes missing from *codes_df* are kept with a
    null description. Rows stay ordered by descending count.
    """
    return (
        counts_df.join(codes_df, on="Vict Descent", how="left")
        .select("Vict Descent Full", "#")
        .orderBy(col("#").desc())
        .withColumnRenamed("Vict Descent Full", "Vict Descent")
    )


# Rows must have both coordinates, and characters 7-10 of the DATE OCC string
# must equal "2015".
has_coordinates = F.col("LAT").isNotNull() & F.col("LON").isNotNull()
occurred_in_2015 = F.col("DATE OCC").substr(7, 4) == "2015"

# Filter before projecting: DATE OCC is not among the selected columns, so the
# predicate has to run while that column is still present.
crime_filtered_df = (
    crime_df
    .filter(has_coordinates & occurred_in_2015)
    .select("DR_NO", "Vict Descent", "LAT", "LON")
    .withColumn("geometry", ST_Point(F.col("LON"), F.col("LAT")))
)

# Spatial join: crimes located inside the blocks of each community group.
output_max_df = _victims_within(crime_filtered_df, communities_filtered_max_df)
output_min_df = _victims_within(crime_filtered_df, communities_filtered_min_df)

# Number of victims per descent code in each group.
output_adjusted_max_df = _count_by_descent(output_max_df)
output_adjusted_min_df = _count_by_descent(output_min_df)

# Same counts, labelled with the full descent description.
output_final_max_df = _with_descent_names(output_adjusted_max_df, races_df)
output_final_min_df = _with_descent_names(output_adjusted_min_df, races_df)

output_final_max_df.show()
output_final_min_df.show()

# The show() calls above execute the query, so the elapsed time is read after them.
print(f"Execution time: {time.time() - start_time:.2f} seconds")

Starting Spark application




In [None]:
# QUERY 4 - DataFrame API 
# 2 executors, each with 2 cores / 4GB memory

import time
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import count, col
from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType, StringType, DoubleType

# Initialize the SparkSession with the executor sizing used for this run.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists, in which case the executor settings below may not be applied --
# confirm this cell runs against a fresh Spark application.
spark = (
    SparkSession.builder
    .appName("DF query 4 execution")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.executor.instances", "2")
    .getOrCreate()
)

# SedonaContext.create() registers Sedona's types and functions on the session,
# so the deprecated SedonaRegistrator.registerAll() call is not needed.
sedona = SedonaContext.create(spark)

# Wall-clock timestamp taken before the query is built.
start_time = time.time()

# Communities selected in Query 3, split into the "max" and "min" groups.
communities_max_values = ["Marina Peninsula", "Pacific Palisades", "Palisades Highlands"]
communities_min_values = ["Central", "Vernon Central", "University Park"]

# Location of the census-block GeoJSON on S3.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

# Load the file, emit one row per element of `features`, then expand the
# fields of each feature struct into top-level columns.
features_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
    .selectExpr("explode(features) as features")
)
blocks_df = features_df.select("features.*")

# Promote every field of the nested `properties` struct to its own column and
# keep the geometry alongside; the projection is shared by both groups.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
flattened_columns = [F.col(f"properties.{name}").alias(name) for name in property_names]
flattened_columns.append("geometry")
flat_blocks_df = blocks_df.select(flattened_columns).drop("properties").drop("type")

# Blocks whose COMM value belongs to each group of communities.
communities_filtered_max_df = flat_blocks_df.filter(F.col("COMM").isin(communities_max_values))
communities_filtered_min_df = flat_blocks_df.filter(F.col("COMM").isin(communities_min_values))

# Race and Ethnicity code table: two string columns (code, full description).
races_schema = StructType(
    [StructField(name, StringType()) for name in ("Vict Descent", "Vict Descent Full")]
)

races_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv",
    header=True,
    schema=races_schema,
)

# Crime CSV columns in file order; every column is read as a string except the
# ones listed in crime_double_columns, which are read as doubles.
crime_double_columns = {"Part 1-2", "Premis Cd", "LAT", "LON"}
crime_column_names = [
    "DR_NO", "Date Rptd", "DATE OCC", "TIME OCC",
    "AREA", "AREA NAME", "Rpt Dist No", "Part 1-2",
    "Crm Cd", "Crm Cd Desc", "Mocodes",
    "Vict Age", "Vict Sex", "Vict Descent",
    "Premis Cd", "Premis Desc",
    "Weapon Used Cd", "Weapon Desc",
    "Status", "Status Desc",
    "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4",
    "LOCATION", "Cross Street", "LAT", "LON",
]
crime_schema = StructType([
    StructField(name, DoubleType() if name in crime_double_columns else StringType())
    for name in crime_column_names
])

# Read the crime data with the explicit schema above.
# NOTE(review): the path contains a double slash ("//CrimeData") -- confirm
# this is intentional.
crime_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721//CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True,
    schema=crime_schema,
)

def _victims_within(crimes_df, communities_df):
    """Return the `Vict Descent` of each crime whose point lies within a block of *communities_df*."""
    inside_block = ST_Within(crimes_df.geometry, communities_df.geometry)
    return crimes_df.join(communities_df, inside_block, "inner").select("Vict Descent")


def _count_by_descent(victims_df):
    """Count rows per `Vict Descent` code, largest count first."""
    return (
        victims_df.groupBy("Vict Descent")
        .agg(count("*").alias("#"))
        .orderBy(col("#").desc())
    )


def _with_descent_names(counts_df, codes_df):
    """Replace each descent code with its full description from *codes_df*.

    The join is a left join, so codes missing from *codes_df* are kept with a
    null description. Rows stay ordered by descending count.
    """
    return (
        counts_df.join(codes_df, on="Vict Descent", how="left")
        .select("Vict Descent Full", "#")
        .orderBy(col("#").desc())
        .withColumnRenamed("Vict Descent Full", "Vict Descent")
    )


# Rows must have both coordinates, and characters 7-10 of the DATE OCC string
# must equal "2015".
has_coordinates = F.col("LAT").isNotNull() & F.col("LON").isNotNull()
occurred_in_2015 = F.col("DATE OCC").substr(7, 4) == "2015"

# Filter before projecting: DATE OCC is not among the selected columns, so the
# predicate has to run while that column is still present.
crime_filtered_df = (
    crime_df
    .filter(has_coordinates & occurred_in_2015)
    .select("DR_NO", "Vict Descent", "LAT", "LON")
    .withColumn("geometry", ST_Point(F.col("LON"), F.col("LAT")))
)

# Spatial join: crimes located inside the blocks of each community group.
output_max_df = _victims_within(crime_filtered_df, communities_filtered_max_df)
output_min_df = _victims_within(crime_filtered_df, communities_filtered_min_df)

# Number of victims per descent code in each group.
output_adjusted_max_df = _count_by_descent(output_max_df)
output_adjusted_min_df = _count_by_descent(output_min_df)

# Same counts, labelled with the full descent description.
output_final_max_df = _with_descent_names(output_adjusted_max_df, races_df)
output_final_min_df = _with_descent_names(output_adjusted_min_df, races_df)

output_final_max_df.show()
output_final_min_df.show()

# The show() calls above execute the query, so the elapsed time is read after them.
print(f"Execution time: {time.time() - start_time:.2f} seconds")

In [None]:
# QUERY 4 - DataFrame API 
# 2 executors, each with 4 cores / 8GB memory

import time
from sedona.spark import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import count, col
from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType, StringType, DoubleType

# Initialize the SparkSession with the executor sizing used for this run.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists, in which case the executor settings below may not be applied --
# confirm this cell runs against a fresh Spark application.
spark = (
    SparkSession.builder
    .appName("DF query 4 execution")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "2")
    .getOrCreate()
)

# SedonaContext.create() registers Sedona's types and functions on the session,
# so the deprecated SedonaRegistrator.registerAll() call is not needed.
sedona = SedonaContext.create(spark)

# Wall-clock timestamp taken before the query is built.
start_time = time.time()

# Communities selected in Query 3, split into the "max" and "min" groups.
communities_max_values = ["Marina Peninsula", "Pacific Palisades", "Palisades Highlands"]
communities_min_values = ["Central", "Vernon Central", "University Park"]

# Location of the census-block GeoJSON on S3.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

# Load the file, emit one row per element of `features`, then expand the
# fields of each feature struct into top-level columns.
features_df = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
    .selectExpr("explode(features) as features")
)
blocks_df = features_df.select("features.*")

# Promote every field of the nested `properties` struct to its own column and
# keep the geometry alongside; the projection is shared by both groups.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
flattened_columns = [F.col(f"properties.{name}").alias(name) for name in property_names]
flattened_columns.append("geometry")
flat_blocks_df = blocks_df.select(flattened_columns).drop("properties").drop("type")

# Blocks whose COMM value belongs to each group of communities.
communities_filtered_max_df = flat_blocks_df.filter(F.col("COMM").isin(communities_max_values))
communities_filtered_min_df = flat_blocks_df.filter(F.col("COMM").isin(communities_min_values))

# Race and Ethnicity code table: two string columns (code, full description).
races_schema = StructType(
    [StructField(name, StringType()) for name in ("Vict Descent", "Vict Descent Full")]
)

races_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv",
    header=True,
    schema=races_schema,
)

# Crime CSV columns in file order; every column is read as a string except the
# ones listed in crime_double_columns, which are read as doubles.
crime_double_columns = {"Part 1-2", "Premis Cd", "LAT", "LON"}
crime_column_names = [
    "DR_NO", "Date Rptd", "DATE OCC", "TIME OCC",
    "AREA", "AREA NAME", "Rpt Dist No", "Part 1-2",
    "Crm Cd", "Crm Cd Desc", "Mocodes",
    "Vict Age", "Vict Sex", "Vict Descent",
    "Premis Cd", "Premis Desc",
    "Weapon Used Cd", "Weapon Desc",
    "Status", "Status Desc",
    "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4",
    "LOCATION", "Cross Street", "LAT", "LON",
]
crime_schema = StructType([
    StructField(name, DoubleType() if name in crime_double_columns else StringType())
    for name in crime_column_names
])

# Read the crime data with the explicit schema above.
# NOTE(review): the path contains a double slash ("//CrimeData") -- confirm
# this is intentional.
crime_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721//CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True,
    schema=crime_schema,
)

def _victims_within(crimes_df, communities_df):
    """Return the `Vict Descent` of each crime whose point lies within a block of *communities_df*."""
    inside_block = ST_Within(crimes_df.geometry, communities_df.geometry)
    return crimes_df.join(communities_df, inside_block, "inner").select("Vict Descent")


def _count_by_descent(victims_df):
    """Count rows per `Vict Descent` code, largest count first."""
    return (
        victims_df.groupBy("Vict Descent")
        .agg(count("*").alias("#"))
        .orderBy(col("#").desc())
    )


def _with_descent_names(counts_df, codes_df):
    """Replace each descent code with its full description from *codes_df*.

    The join is a left join, so codes missing from *codes_df* are kept with a
    null description. Rows stay ordered by descending count.
    """
    return (
        counts_df.join(codes_df, on="Vict Descent", how="left")
        .select("Vict Descent Full", "#")
        .orderBy(col("#").desc())
        .withColumnRenamed("Vict Descent Full", "Vict Descent")
    )


# Rows must have both coordinates, and characters 7-10 of the DATE OCC string
# must equal "2015".
has_coordinates = F.col("LAT").isNotNull() & F.col("LON").isNotNull()
occurred_in_2015 = F.col("DATE OCC").substr(7, 4) == "2015"

# Filter before projecting: DATE OCC is not among the selected columns, so the
# predicate has to run while that column is still present.
crime_filtered_df = (
    crime_df
    .filter(has_coordinates & occurred_in_2015)
    .select("DR_NO", "Vict Descent", "LAT", "LON")
    .withColumn("geometry", ST_Point(F.col("LON"), F.col("LAT")))
)

# Spatial join: crimes located inside the blocks of each community group.
output_max_df = _victims_within(crime_filtered_df, communities_filtered_max_df)
output_min_df = _victims_within(crime_filtered_df, communities_filtered_min_df)

# Number of victims per descent code in each group.
output_adjusted_max_df = _count_by_descent(output_max_df)
output_adjusted_min_df = _count_by_descent(output_min_df)

# Same counts, labelled with the full descent description.
output_final_max_df = _with_descent_names(output_adjusted_max_df, races_df)
output_final_min_df = _with_descent_names(output_adjusted_min_df, races_df)

output_final_max_df.show()
output_final_min_df.show()

# The show() calls above execute the query, so the elapsed time is read after them.
print(f"Execution time: {time.time() - start_time:.2f} seconds")