In [None]:
# QUERY 1: aggravated-assault victims per age group, sorted by count (descending)
# RDD implementation

In [54]:
from pyspark.sql import SparkSession

# Bind the session itself as `spark`: the CSV reads further down in this cell
# call `spark.read`, but the cell previously bound only `sc`, so it worked
# only when something outside the cell had already provided `spark`.
# `getOrCreate()` reuses a running session when there is one, in which case
# the app name below is not applied.
spark = (
    SparkSession.builder
    .appName("RDD query 1 execution")
    .getOrCreate()
)

# SparkContext of that same session, kept under its original name.
sc = spark.sparkContext

# The schema needs these type classes, but this cell itself only imports
# SparkSession; without this import the definition below depends on another
# cell having imported them first.
from pyspark.sql.types import StructField, StructType, IntegerType, DoubleType, StringType

# Explicit schema for the crime CSV files: 27 columns, listed in positional
# order (the RDD code in this cell addresses columns by index, e.g. 9 is
# "Crm Cd Desc" and 11 is "Vict Age"). Each field uses StructField's default
# nullability.
data_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
        ("Vict Sex", StringType),
        ("Vict Descent", StringType),
        ("Premis Cd", IntegerType),
        ("Premis Desc", StringType),
        ("Weapon Used Cd", IntegerType),
        ("Weapon Desc", StringType),
        ("Status", StringType),
        ("Crm Cd 1", IntegerType),
        ("Crm Cd 2", IntegerType),
        ("Crm Cd 3", IntegerType),
        ("Crm Cd 4", IntegerType),
        ("LOCATION", StringType),
        ("Cross Street", StringType),
        ("LAT", DoubleType),
        ("LON", DoubleType),
    )
])

# Load the 2010-2019 extract with the explicit schema. The quote and escape
# characters are both set to the double quote.
# NOTE(review): header=False treats the first line of each file as data —
# confirm the files really have no header row.
df1 = (
    spark.read
    .schema(data_schema)
    .option("header", False)
    .option("quote", '"')
    .option("escape", '"')
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv")
)

# Load the 2020-2025 extract with exactly the same parsing options.
df2 = (
    spark.read
    .schema(data_schema)
    .option("header", False)
    .option("quote", '"')
    .option("escape", '"')
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv")
)

# Drop down to the RDD API and stack the two periods into a single dataset.
data1 = df1.rdd
data2 = df2.rdd
data = data1.union(data2)

# Column 9 is "Crm Cd Desc". The schema leaves it nullable, so check for None
# before the substring test: `in` applied to None raises TypeError.
aggravated_assaults = data.filter(
    lambda x: x[9] is not None and "AGGRAVATED ASSAULT" in x[9]
)

def get_age_group(x):
    """Map one crime record to an ``[age_group, 1]`` pair for counting.

    Parameters
    ----------
    x : indexable record
        Element 11 is read as the victim age ("Vict Age" in the schema used
        by this cell). It may be an int, a value ``int()`` can convert, or
        ``None`` when the age is missing.

    Returns
    -------
    list
        ``[label, 1]`` where ``label`` is ``"child"`` (age < 18),
        ``"young adult"`` (18-24), ``"adult"`` (25-64), ``"senior"``
        (age > 64), or ``"unknown"`` when the age is ``None``.

    Raises
    ------
    ValueError
        If the age is a non-None value that ``int()`` cannot convert.
    """
    raw_age = x[11]
    # A missing age used to crash the whole job with TypeError in int(None);
    # give it its own bucket instead so it stays visible in the result.
    if raw_age is None:
        return ["unknown", 1]

    age = int(raw_age)
    if age < 18:
        return ["child", 1]
    if age <= 24:
        return ["young adult", 1]
    if age <= 64:
        return ["adult", 1]
    return ["senior", 1]

# Tag every record with its age bucket, then add up the 1s per bucket.
assault_age_group = aggravated_assaults.map(get_age_group)
age_group_count = assault_age_group.reduceByKey(lambda left, right: left + right)

# Order the buckets by their count, largest first, and emit [group, count] lists.
sorted_data = (
    age_group_count
    .sortBy(lambda group_and_count: group_and_count[1], ascending=False)
    .map(lambda group_and_count: [group_and_count[0], group_and_count[1]])
)

print(sorted_data.collect())


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

PythonRDD[945] at RDD at PythonRDD.scala:55
[['adult', 121660], ['young adult', 33758], ['child', 16014], ['senior', 6011]]

In [None]:
# Dataframe implementation (no UDF)

In [60]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, lit, count, when, expr

# Get the active SparkSession, or create one named after this query variant.
spark = (
    SparkSession.builder
    .appName("Dataframe query 1 execution (no UDF)")
    .getOrCreate()
)

# Explicit schema for the crime CSV files: 27 columns in positional order,
# each with StructField's default nullability.
data_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
        ("Vict Sex", StringType),
        ("Vict Descent", StringType),
        ("Premis Cd", IntegerType),
        ("Premis Desc", StringType),
        ("Weapon Used Cd", IntegerType),
        ("Weapon Desc", StringType),
        ("Status", StringType),
        ("Crm Cd 1", IntegerType),
        ("Crm Cd 2", IntegerType),
        ("Crm Cd 3", IntegerType),
        ("Crm Cd 4", IntegerType),
        ("LOCATION", StringType),
        ("Cross Street", StringType),
        ("LAT", DoubleType),
        ("LON", DoubleType),
    )
])

# Read the 2010-2019 extract with the explicit schema.
# NOTE(review): header=False treats the first line of each file as data —
# confirm the files really have no header row.
data1 = (
    spark.read
    .schema(data_schema)
    .option("header", False)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv")
)

# Read the 2020-2025 extract the same way.
data2 = (
    spark.read
    .schema(data_schema)
    .option("header", False)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv")
)

# Stack both periods (union is positional; both frames share data_schema) and
# keep only rows whose crime description mentions aggravated assault.
data = data1.union(data2)
aggravated_assaults = data.filter(col("Crm Cd Desc").contains("AGGRAVATED ASSAULT"))

# One aggregation computes all four bucket sizes. `when(cond, True)` has no
# `otherwise`, so non-matching rows yield NULL and `count` ignores them.
age_group_counts = aggravated_assaults.agg(
    count(when(col("Vict Age") < 18, True)).alias("child"),
    count(when(col("Vict Age").between(18, 24), True)).alias("young_adult"),
    count(when(col("Vict Age").between(25, 64), True)).alias("adult"),
    count(when(col("Vict Age") > 64, True)).alias("senior"),
)

# Unpivot the single wide row into (AgeGroup, Count) rows, largest count first.
age_group_counts_melted = (
    age_group_counts
    .selectExpr(
        "stack(4, 'child', child, 'young_adult', young_adult, 'adult', adult, 'senior', senior) as (AgeGroup, Count)"
    )
    .sort(col("Count").desc())
)

age_group_counts_melted.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+------+
|   AgeGroup| Count|
+-----------+------+
|      adult|121660|
|young_adult| 33758|
|      child| 16014|
|     senior|  6011|
+-----------+------+

In [None]:
# Dataframe implementation (with UDF)

In [59]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, udf

# Get the active SparkSession, or create one named after this query variant.
spark = (
    SparkSession.builder
    .appName("Dataframe query 1 execution (with UDF)")
    .getOrCreate()
)

# Explicit schema for the crime CSV files: 27 columns in positional order,
# each with StructField's default nullability.
data_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
        ("Vict Sex", StringType),
        ("Vict Descent", StringType),
        ("Premis Cd", IntegerType),
        ("Premis Desc", StringType),
        ("Weapon Used Cd", IntegerType),
        ("Weapon Desc", StringType),
        ("Status", StringType),
        ("Crm Cd 1", IntegerType),
        ("Crm Cd 2", IntegerType),
        ("Crm Cd 3", IntegerType),
        ("Crm Cd 4", IntegerType),
        ("LOCATION", StringType),
        ("Cross Street", StringType),
        ("LAT", DoubleType),
        ("LON", DoubleType),
    )
])

# Read the 2010-2019 extract with the explicit schema.
# NOTE(review): header=False treats the first line of each file as data —
# confirm the files really have no header row.
data1 = (
    spark.read
    .schema(data_schema)
    .option("header", False)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv")
)

# Read the 2020-2025 extract the same way.
data2 = (
    spark.read
    .schema(data_schema)
    .option("header", False)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv")
)

# Stack both periods; union is positional and both frames share data_schema.
data = data1.union(data2)

# Keep only rows whose crime description mentions aggravated assault, and
# report how many such rows there are.
aggravated_assaults = data.where(col("Crm Cd Desc").contains("AGGRAVATED ASSAULT"))
print(aggravated_assaults.count())

def get_group(x):
    """Classify a victim age into its age-group label.

    Parameters
    ----------
    x : int or None
        Victim age; ``None`` when the age is missing.

    Returns
    -------
    str
        ``"child"`` (x < 18), ``"young adult"`` (18-24), ``"adult"`` (25-64),
        ``"senior"`` (x > 64), or ``"unknown"`` when ``x`` is ``None``.
    """
    # Comparing None with an int raises TypeError in Python 3, so a missing
    # age gets its own label instead of failing the call.
    if x is None:
        return "unknown"
    if x < 18:
        return "child"
    if x <= 24:
        return "young adult"
    if x <= 64:
        return "adult"
    return "senior"

# Wrap the plain Python classifier as a Spark UDF that returns a string label.
get_group_udf = udf(get_group, StringType())

# Label every row with its age group and count the rows per label.
results = (
    aggravated_assaults
    .withColumn("age group", get_group_udf(col("Vict Age")))
    .groupBy(col("age group"))
    .count()
)

# `show()` only prints and returns None, so bind the sorted DataFrame first
# and display it as a separate step; `results_sorted` then stays usable.
results_sorted = results.sort(col("count"), ascending=False)
results_sorted.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

3138130
177443
+-----------+------+
|  age group| count|
+-----------+------+
|      adult|121660|
|young adult| 33758|
|      child| 16014|
|     senior|  6011|
+-----------+------+