In [None]:
# SQL implementation

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql import functions as F

# Obtain the Spark session used by the SQL variant of query 2
# (getOrCreate hands back an already-active session when one exists).
spark = (
    SparkSession.builder
    .appName("Dataframe query 2 execution with SQL")
    .getOrCreate()
)

# Explicit schema for the crime CSV files, as (column name, Spark type) pairs.
# Order matters: the reader applies the schema to the CSV fields by position.
_crime_columns = [
    ("DR_NO", IntegerType),
    ("Date Rptd", StringType),
    ("DATE OCC", StringType),
    ("TIME OCC", IntegerType),
    ("AREA", IntegerType),
    ("AREA NAME", StringType),
    ("Rpt Dist No", IntegerType),
    ("Part 1-2", IntegerType),
    ("Crm Cd", IntegerType),
    ("Crm Cd Desc", StringType),
    ("Mocodes", StringType),
    ("Vict Age", IntegerType),
    ("Vict Sex", StringType),
    ("Vict Descent", StringType),
    ("Premis Cd", IntegerType),
    ("Premis Desc", StringType),
    ("Weapon Used Cd", IntegerType),
    ("Weapon Desc", StringType),
    ("Status", StringType),
    ("Crm Cd 1", IntegerType),
    ("Crm Cd 2", IntegerType),
    ("Crm Cd 3", IntegerType),
    ("Crm Cd 4", IntegerType),
    ("LOCATION", StringType),
    ("Cross Street", StringType),
    ("LAT", DoubleType),
    ("LON", DoubleType),
]

# Every field keeps StructField's default nullability.
data_schema = StructType(
    [StructField(name, dtype()) for name, dtype in _crime_columns]
)

# Read the two CSV extracts (2010-2019 and 2020-2025) with the explicit schema
# and stack them into a single DataFrame.
# NOTE(review): header=False makes the reader treat the first line of each file
# as data - confirm the CSVs really have no header row.
data1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=False,
    schema=data_schema,
)

data2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=False,
    schema=data_schema,
)

# union() matches columns by position; both inputs share data_schema.
df = data1.union(data2)

# Make the combined rows queryable from SQL as "crimes".
df.createOrReplaceTempView("crimes")

# Extract year of occurrence: parse `DATE OCC` as a timestamp and keep its year.
query0 = (
    "SELECT *, year(to_timestamp(`DATE OCC`, 'yyyy MMM dd hh:mm:ss a')) AS year "
    "FROM crimes;"
)
df_with_year = spark.sql(query0)

# Re-register "crimes" so that it now points at the rows carrying the `year` column.
df_with_year.createOrReplaceTempView("crimes")

# Query 1: number of records per (year, victim descent) pair.
# Rows whose year or descent is NULL are left out before grouping.
query1 = (
    "SELECT year, `Vict Descent`, COUNT(*) AS Total FROM crimes "
    "            WHERE year IS NOT NULL AND `Vict Descent` IS NOT NULL "
    "            GROUP BY year, `Vict Descent`"
)

ordered_by_year_and_ethn = spark.sql(query1)

# Expose the aggregate to the next query under the name "modified_crimes".
ordered_by_year_and_ethn.createOrReplaceTempView("modified_crimes")

# Query 2: for every year, rank the descents by record count, keep the top 3 and
# express each count as a percentage of that year's total (all descents included).
#
# - `Vict Descent` is a secondary sort key in both the window and the final
#   ORDER BY, so ties on Total cannot make the top-3 choice or the output order
#   vary between runs.
# - The per-year total is aliased `year_total` rather than `sum`, so it does not
#   shadow the name of the SUM aggregate.
query2 = """
SELECT year, `Vict Descent`, Total, ROUND(100 * (Total / year_total), 2) AS Percent
FROM (
    SELECT *,
           ROW_NUMBER() OVER (
               PARTITION BY year
               ORDER BY Total DESC, `Vict Descent`
           ) AS rn,
           SUM(Total) OVER (PARTITION BY year) AS year_total
    FROM modified_crimes
) AS t
WHERE rn <= 3
ORDER BY year, Total DESC, `Vict Descent`
"""

res = spark.sql(query2)
res.show(1000, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------------+-----+-------+
|year|Vict Descent|Total|Percent|
+----+------------+-----+-------+
|2010|H           |73558|38.93  |
|2010|W           |53835|28.49  |
|2010|B           |33937|17.96  |
|2011|H           |70845|38.8   |
|2011|W           |51219|28.05  |
|2011|B           |32579|17.84  |
|2012|H           |70338|38.25  |
|2012|W           |51839|28.19  |
|2012|B           |33572|18.26  |
|2013|H           |66741|37.97  |
|2013|W           |48453|27.57  |
|2013|B           |31975|18.19  |
|2014|H           |68763|38.42  |
|2014|W           |47531|26.56  |
|2014|B           |32952|18.41  |
|2015|H           |55978|36.65  |
|2015|W           |44102|28.87  |
|2015|B           |26510|17.35  |
|2016|H           |99135|38.74  |
|2016|W           |63760|24.92  |
|2016|B           |42449|16.59  |
|2017|H           |78308|37.55  |
|2017|W           |52744|25.29  |
|2017|B           |34713|16.65  |
|2018|H           |75958|36.42  |
|2018|W           |52233|25.05  |
|2018|B       

In [None]:
# DataFrame implementation

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, row_number, desc, sum, round

from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Obtain the Spark session used by the DataFrame variant of query 2
# (getOrCreate hands back an already-active session when one exists).
spark = (
    SparkSession.builder
    .appName("Dataframe query 2 execution")
    .getOrCreate()
)

# Explicit schema for the crime CSV files, as (column name, Spark type) pairs.
# Order matters: the reader applies the schema to the CSV fields by position.
_crime_columns = [
    ("DR_NO", IntegerType),
    ("Date Rptd", StringType),
    ("DATE OCC", StringType),
    ("TIME OCC", IntegerType),
    ("AREA", IntegerType),
    ("AREA NAME", StringType),
    ("Rpt Dist No", IntegerType),
    ("Part 1-2", IntegerType),
    ("Crm Cd", IntegerType),
    ("Crm Cd Desc", StringType),
    ("Mocodes", StringType),
    ("Vict Age", IntegerType),
    ("Vict Sex", StringType),
    ("Vict Descent", StringType),
    ("Premis Cd", IntegerType),
    ("Premis Desc", StringType),
    ("Weapon Used Cd", IntegerType),
    ("Weapon Desc", StringType),
    ("Status", StringType),
    ("Crm Cd 1", IntegerType),
    ("Crm Cd 2", IntegerType),
    ("Crm Cd 3", IntegerType),
    ("Crm Cd 4", IntegerType),
    ("LOCATION", StringType),
    ("Cross Street", StringType),
    ("LAT", DoubleType),
    ("LON", DoubleType),
]

# Every field keeps StructField's default nullability.
data_schema = StructType(
    [StructField(name, dtype()) for name, dtype in _crime_columns]
)

# Read the two CSV extracts (2010-2019 and 2020-2025) with the explicit schema
# and stack them into a single DataFrame.
# NOTE(review): header=False makes the reader treat the first line of each file
# as data - confirm the CSVs really have no header row.
data1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=False,
    schema=data_schema,
)

data2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=False,
    schema=data_schema,
)

# union() matches columns by position; both inputs share data_schema.
data = data1.union(data2)

# Extract year of occurrence: parse `DATE OCC` as a timestamp, then keep its year.
occurrence_ts = F.to_timestamp(F.col("DATE OCC"), "yyyy MMM dd hh:mm:ss a")
df_with_year = data.withColumn("year", F.year(occurrence_ts))

# Also registered as the "crimes" temp view; the DataFrame steps visible in
# this cell do not read it back.
df_with_year.createOrReplaceTempView("crimes")

# Count records per (year, victim descent) pair, then discard every group
# that has a NULL in any column (i.e. a missing year or descent).
year_descent_grouped = (
    df_with_year
    .groupBy("year", "Vict Descent")
    .count()
    .na.drop()
)

# Window ranking the descents of each year by record count, largest first.
# `Vict Descent` is a secondary key so row_number() is deterministic on ties.
windowSpec = Window.partitionBy("year").orderBy(F.desc("count"), F.col("Vict Descent"))

# Unordered per-year window: the aggregate over it is the year's total count.
total_window = Window.partitionBy("year")

# Rank the rows and attach the per-year total, keep the top 3 per year, turn the
# counts into percentages of the yearly total, then drop the helper columns.
# The functions are reached through `F` so the block does not depend on the
# names `sum` / `round` having been rebound from the Python builtins.
# The final orderBy fixes the display order (year, then count descending),
# matching the ORDER BY of the SQL implementation; without it the row order of
# show() is not guaranteed.
df_final = (
    year_descent_grouped
    .withColumn("rank", F.row_number().over(windowSpec))
    .withColumn("total_count", F.sum("count").over(total_window))
    .filter(F.col("rank") <= 3)
    .withColumn("percent", F.round(100 * F.col("count") / F.col("total_count"), 2))
    .drop("rank", "total_count")
    .orderBy(F.col("year"), F.desc("count"), F.col("Vict Descent"))
)

df_final.show(1000, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+------------+-----+-------+
|year|Vict Descent|count|percent|
+----+------------+-----+-------+
|2010|H           |73558|38.93  |
|2010|W           |53835|28.49  |
|2010|B           |33937|17.96  |
|2011|H           |70845|38.8   |
|2011|W           |51219|28.05  |
|2011|B           |32579|17.84  |
|2012|H           |70338|38.25  |
|2012|W           |51839|28.19  |
|2012|B           |33572|18.26  |
|2013|H           |66741|37.97  |
|2013|W           |48453|27.57  |
|2013|B           |31975|18.19  |
|2014|H           |68763|38.42  |
|2014|W           |47531|26.56  |
|2014|B           |32952|18.41  |
|2015|H           |55978|36.65  |
|2015|W           |44102|28.87  |
|2015|B           |26510|17.35  |
|2016|H           |99135|38.74  |
|2016|W           |63760|24.92  |
|2016|B           |42449|16.59  |
|2017|H           |78308|37.55  |
|2017|W           |52744|25.29  |
|2017|B           |34713|16.65  |
|2018|H           |75958|36.42  |
|2018|W           |52233|25.05  |
|2018|B       