# QUERY 1

## RDD Implementation

In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1805,application_1765289937462_1789,pyspark,idle,Link,Link,,
1807,application_1765289937462_1791,pyspark,busy,Link,Link,,
1808,application_1765289937462_1792,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
import time

# After initializing the Spark Session, we can start the timer
start_time = time.time()

# Reading the 2 CSV files needed for the query (no schema is supplied and
# header=False, so every line is treated as a data row)
df1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=False,
    quote='"',
    escape='"'
)

df2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=False,
    quote='"',
    escape='"'
)

# Converting DataFrames to RDDs and merging them to have the LA Crime Data from 2010 to 2025
data1 = df1.rdd
data2 = df2.rdd
data = data1.union(data2)

# Filtering the data to keep only the aggravated assault crimes.
# Position 9 is the "Crm Cd Desc" column in the schema used by the DataFrame
# variants of this query. A missing description would make the `in` test raise
# TypeError and abort the job, so such rows are skipped instead; the DataFrame
# variants also drop them, because `contains` on a null value is not true.
aggravated_assaults = data.filter(
    lambda x: x[9] is not None and "AGGRAVATED ASSAULT" in x[9]
)


# The User Defined Function to categorize ages into groups and count occurrences
def get_age_group(x):
    """Map one crime record to an ``[age group, 1]`` pair for counting.

    Args:
        x: Indexable record whose element 11 holds the victim age ("Vict Age"
            in the schema used by the DataFrame variants of this query). The
            value may be anything ``int`` accepts, or ``None`` when missing.

    Returns:
        A two-element list ``[label, 1]`` where ``label`` is ``"child"``
        (age < 18), ``"young adult"`` (18-24), ``"adult"`` (25-64) or
        ``"senior"`` (everything else). A missing age is labelled
        ``"senior"``, mirroring the ``otherwise`` branch of the DataFrame
        implementation without UDF, so the implementations stay comparable.

    Raises:
        ValueError: If the age is present but cannot be parsed as an integer.
    """
    raw_age = x[11]
    # int(None) raises TypeError and would abort the whole Spark job
    if raw_age is None:
        return ["senior", 1]

    age = int(raw_age)
    # NOTE(review): zero and negative ages are counted as "child" -- confirm
    # how the dataset encodes an unknown victim age.
    if age < 18:
        return ["child", 1]
    elif age <= 24:
        return ["young adult", 1]
    elif age <= 64:
        return ["adult", 1]
    else:
        return ["senior", 1]


# Label every aggravated-assault record with its age group, add up the
# per-record 1s for each label, and order the totals from largest to smallest
sorted_data = (
    aggravated_assaults
    .map(get_age_group)
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)

# collect() triggers the computation and brings the totals back to the driver
print(sorted_data.collect())

# Stop the timer only after the results have been materialized
end_time = time.time()
print(f"Execution Time: {end_time - start_time} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1809,application_1765289937462_1793,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('adult', 121660), ('young adult', 33758), ('child', 16014), ('senior', 6011)]
Execution Time: 37.659892082214355 seconds

## Dataframe implementation (with UDF)

In [3]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1812,application_1765289937462_1796,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1805,application_1765289937462_1789,pyspark,idle,Link,Link,,
1807,application_1765289937462_1791,pyspark,idle,Link,Link,,
1808,application_1765289937462_1792,pyspark,idle,Link,Link,,
1811,application_1765289937462_1795,pyspark,busy,Link,Link,,
1812,application_1765289937462_1796,pyspark,idle,Link,Link,,✔


In [4]:
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType

# Columns of the crime CSV files as (name, Spark type) pairs, listed in the
# order the schema declares them
_crime_columns = [
    ("DR_NO", IntegerType()),
    ("Date Rptd", StringType()),
    ("DATE OCC", StringType()),
    ("TIME OCC", IntegerType()),
    ("AREA", IntegerType()),
    ("AREA NAME", StringType()),
    ("Rpt Dist No", IntegerType()),
    ("Part 1-2", IntegerType()),
    ("Crm Cd", IntegerType()),
    ("Crm Cd Desc", StringType()),
    ("Mocodes", StringType()),
    ("Vict Age", IntegerType()),
    ("Vict Sex", StringType()),
    ("Vict Descent", StringType()),
    ("Premis Cd", IntegerType()),
    ("Premis Desc", StringType()),
    ("Weapon Used Cd", IntegerType()),
    ("Weapon Desc", StringType()),
    ("Status", StringType()),
    ("Crm Cd 1", IntegerType()),
    ("Crm Cd 2", IntegerType()),
    ("Crm Cd 3", IntegerType()),
    ("Crm Cd 4", IntegerType()),
    ("LOCATION", StringType()),
    ("Cross Street", StringType()),
    ("LAT", DoubleType()),
    ("LON", DoubleType()),
]

# Schema handed to spark.read.csv so the columns get explicit names and types
data_schema = StructType(
    [StructField(column_name, column_type) for column_name, column_type in _crime_columns]
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, udf
import time

# The Spark session already exists at this point, so the timer starts here
# and the measurement includes reading the files
start_time = time.time()

# Locations of the two CSV extracts that together make up the crime data
crime_csv_paths = (
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
)

# Load each extract with the predefined schema; header=False treats every line as data
data1, data2 = [
    spark.read.csv(csv_path, header=False, schema=data_schema)
    for csv_path in crime_csv_paths
]

# Stack both periods into a single DataFrame
data = data1.union(data2)

# Keep only the rows whose crime description mentions aggravated assault
is_aggravated_assault = col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")
aggravated_assaults = data.filter(is_aggravated_assault)


# The User Defined Function to categorize ages into groups
def get_group(x):
    """Return the age-group label for a victim age.

    Args:
        x: Victim age as a number, or ``None`` when the age is missing.

    Returns:
        ``"child"`` for ages below 18, ``"young adult"`` for 18-24,
        ``"adult"`` for 25-64 and ``"senior"`` for everything else. A missing
        age is labelled ``"senior"``, mirroring the ``otherwise`` branch of
        the DataFrame implementation without UDF, so both implementations
        stay comparable.
    """
    # Comparing None with an int raises TypeError and would fail the Spark
    # task, so missing ages are handled before any comparison.
    if x is None:
        return "senior"

    # NOTE(review): zero and negative ages are counted as "child" -- confirm
    # how the dataset encodes an unknown victim age.
    if x < 18:
        return "child"
    if 18 <= x <= 24:
        return "young adult"
    if 25 <= x <= 64:
        return "adult"
    return "senior"


# Wrap the Python function as a Spark UDF that produces a string label
get_group_udf = udf(get_group, StringType())

# Applying the UDF and grouping the data by age groups
results = (
    aggravated_assaults
    .withColumn("age group", get_group_udf(col("Vict Age")))
    .groupBy(col("age group"))
    .count()
)

# show() only prints and returns None, so the sorted DataFrame is bound first
# and displayed in a separate step; results_sorted stays usable afterwards.
results_sorted = results.sort(col("count"), ascending=False)
results_sorted.show()

end_time = time.time()
print(f"Execution Time: {end_time - start_time} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+------+
|  age group| count|
+-----------+------+
|      adult|121660|
|young adult| 33758|
|      child| 16014|
|     senior|  6011|
+-----------+------+

Execution Time: 15.84756088256836 seconds

## Dataframe implementation (without UDF)

In [6]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1813,application_1765289937462_1797,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1805,application_1765289937462_1789,pyspark,idle,Link,Link,,
1807,application_1765289937462_1791,pyspark,idle,Link,Link,,
1808,application_1765289937462_1792,pyspark,idle,Link,Link,,
1811,application_1765289937462_1795,pyspark,idle,Link,Link,,
1813,application_1765289937462_1797,pyspark,idle,Link,Link,,✔


In [7]:
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType

# Columns of the crime CSV files as (name, Spark type) pairs, listed in the
# order the schema declares them
_crime_columns = [
    ("DR_NO", IntegerType()),
    ("Date Rptd", StringType()),
    ("DATE OCC", StringType()),
    ("TIME OCC", IntegerType()),
    ("AREA", IntegerType()),
    ("AREA NAME", StringType()),
    ("Rpt Dist No", IntegerType()),
    ("Part 1-2", IntegerType()),
    ("Crm Cd", IntegerType()),
    ("Crm Cd Desc", StringType()),
    ("Mocodes", StringType()),
    ("Vict Age", IntegerType()),
    ("Vict Sex", StringType()),
    ("Vict Descent", StringType()),
    ("Premis Cd", IntegerType()),
    ("Premis Desc", StringType()),
    ("Weapon Used Cd", IntegerType()),
    ("Weapon Desc", StringType()),
    ("Status", StringType()),
    ("Crm Cd 1", IntegerType()),
    ("Crm Cd 2", IntegerType()),
    ("Crm Cd 3", IntegerType()),
    ("Crm Cd 4", IntegerType()),
    ("LOCATION", StringType()),
    ("Cross Street", StringType()),
    ("LAT", DoubleType()),
    ("LON", DoubleType()),
]

# Schema handed to spark.read.csv so the columns get explicit names and types
data_schema = StructType(
    [StructField(column_name, column_type) for column_name, column_type in _crime_columns]
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
from pyspark.sql.functions import col, lit, count, when, expr
import time

# The Spark session already exists at this point, so the timer starts here
# and the measurement includes reading the files
start_time = time.time()

# Locations of the two CSV extracts that together make up the crime data
crime_csv_paths = (
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
)

# Load each extract with the predefined schema; header=False treats every line as data
data1, data2 = [
    spark.read.csv(csv_path, header=False, schema=data_schema)
    for csv_path in crime_csv_paths
]

# Stack both periods into a single DataFrame covering 2010 to 2025
data = data1.union(data2)

# Keep only the rows whose crime description mentions aggravated assault
is_aggravated_assault = col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")
aggravated_assaults = data.filter(is_aggravated_assault)

# Bucket the victim age with native column expressions (no Python UDF).
# Rows that match none of the conditions end up in the "senior" bucket.
victim_age = col("Vict Age")
is_young_adult = (victim_age >= 18) & (victim_age < 25)
is_adult = (victim_age >= 25) & (victim_age < 65)
age_group_label = (
    when(victim_age < 18, "child")
    .when(is_young_adult, "young adult")
    .when(is_adult, "adult")
    .otherwise("senior")
)
updated_df = aggravated_assaults.withColumn("age group", age_group_label)

# Count the assaults per age group and list the largest group first
age_group_count = (
    updated_df
    .groupBy("age group")
    .count()
    .orderBy(col("count").desc())
)
age_group_count.show()

# Stop the timer only after show() has forced the computation
end_time = time.time()
print(f"Execution Time: {end_time - start_time} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+------+
|  age group| count|
+-----------+------+
|      adult|121660|
|young adult| 33758|
|      child| 16014|
|     senior|  6011|
+-----------+------+

Execution Time: 13.86941146850586 seconds