In [27]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1777,application_1765289937462_1761,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1720,application_1765289937462_1704,pyspark,idle,Link,Link,,
1769,application_1765289937462_1753,pyspark,idle,Link,Link,,
1773,application_1765289937462_1757,pyspark,idle,Link,Link,,
1776,application_1765289937462_1760,pyspark,idle,Link,Link,,
1777,application_1765289937462_1761,pyspark,idle,Link,Link,,✔


# RDD implementation

In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
import time

start = time.time()

# Both yearly extracts carry a header row and use '"' for quoting and escaping.
crime_csv_options = dict(header=True, quote='"', escape='"')

df1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    **crime_csv_options
)

df2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    **crime_csv_options
)

data1 = df1.rdd
data2 = df2.rdd
data = data1.union(data2)

# MO_codes.txt holds one "<code> <description>" entry per line. It is free text,
# not CSV: the CSV reader would split a line at every comma, and taking only the
# first field would silently cut the description short. Read it as plain text
# instead (one row per line, a single string column).
mo_df = spark.read.text(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"
)

# (code, description) pairs; blank lines are skipped, as the CSV reader did.
mo_codes = (
    mo_df.rdd
    .map(lambda row: row[0])
    .filter(lambda line: line is not None and line.strip() != "")
    .map(lambda line: line.partition(" "))
    .map(lambda parts: (parts[0], parts[2]))
)

# Field 10 of a crime row is the space-separated list of MO codes
# (assumed to be the "Mocodes" column; confirm against the file header).
# Rows without any MO code are dropped before splitting.
split_mos = data.filter(lambda x: x[10] is not None).flatMap(lambda x: x[10].split(" "))

# Classic word count: (code, 1) pairs summed per code.
reduce_mos = split_mos.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

# (code, (count, description)), most frequent code first.
joined_data = reduce_mos.join(mo_codes)
sorted_data = joined_data.sortBy(lambda x: x[1][0], ascending=False)

print("MO Code | Total number of occurences | MO Description")
# collect() already returns the partitions in sorted order; no coalesce(1) is
# needed, and omitting it keeps the final stage parallel.
for item in sorted_data.collect():
    print(item)

end = time.time()
print("Elapsed time: ",end-start)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

MO Code | Total number of occurences | MO Description
('0344', (1002900, 'Removes vict property'))
('1822', (548422, 'Stranger'))
('0416', (404773, 'Hit-Hit w/ weapon'))
('0329', (377536, 'Vandalized'))
('0913', (278618, 'Victim knew Suspect'))
('2000', (256188, 'Domestic violence'))
('1300', (219082, 'Vehicle involved'))
('0400', (213165, 'Force used'))
('1402', (177470, 'Evidence Booked (any crime)'))
('1609', (131229, 'Smashed'))
('1309', (122108, 'Susp uses vehicle'))
('1202', (120238, 'Victim was aged (60 & over) or blind/physically disabled/unable to care for self'))
('0325', (120159, 'Took merchandise'))
('1814', (118073, 'Susp is/was current/former boyfriend/girlfriend'))
('0444', (116763, 'Pushed'))
('1501', (115589, 'Other MO (see rpt)'))
('1307', (113609, 'Breaks window'))
('0334', (105665, 'Brandishes weapon'))
('2004', (93426, 'Suspect is homeless/transient'))
('0432', (83562, 'Intimidation'))
('0342', (81230, 'Multi-susps overwhelm'))
('0421', (81036, 'Threaten to kill'))

# Dataframe implementation

We restart the Spark session before every execution of the query. Otherwise the recorded times are not accurate (possibly because intermediate DataFrames/RDDs remain cached on the Spark cluster).

In [11]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1554,application_1765289937462_1540,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1545,application_1765289937462_1531,pyspark,idle,Link,Link,,
1553,application_1765289937462_1539,pyspark,idle,Link,Link,,
1554,application_1765289937462_1540,pyspark,idle,Link,Link,,✔


In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, instr, expr, udf
from pyspark.sql import functions as F
import time 

# Explicit schemas, so Spark does not have to infer column types.
# Crime records: one (column name, Spark type) pair per CSV column, in file order.
# NOTE(review): the public LAPD export has a "Status Desc" column right after
# "Status"; confirm that this column list matches the header of the files used here.
crimes_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
        ("Vict Sex", StringType),
        ("Vict Descent", StringType),
        ("Premis Cd", IntegerType),
        ("Premis Desc", StringType),
        ("Weapon Used Cd", IntegerType),
        ("Weapon Desc", StringType),
        ("Status", StringType),
        ("Crm Cd 1", IntegerType),
        ("Crm Cd 2", IntegerType),
        ("Crm Cd 3", IntegerType),
        ("Crm Cd 4", IntegerType),
        ("LOCATION", StringType),
        ("Cross Street", StringType),
        ("LAT", DoubleType),
        ("LON", DoubleType),
    )
])

# MO lookup file: each record is kept as one raw string and split later.
mo_schema = StructType([StructField("MO_data", StringType())])

# Helpers that split one "<code> <description>" entry of the MO lookup file.
def split_mo_1(x):
    """Return the MO code, i.e. the text before the first space of ``x``.

    Returns None when ``x`` is None (a null column value), so the UDF built
    on top of this function does not fail on null rows.
    """
    if x is None:
        return None
    # Only the first token is needed, so stop splitting after one separator.
    return x.split(" ", 1)[0]

def split_mo_2(x):
    """Return the MO description, i.e. everything after the first space of ``x``.

    Returns "" when ``x`` contains no space and None when ``x`` is None.
    """
    if x is None:
        return None
    # partition() keeps the remainder untouched, including repeated spaces.
    return x.partition(" ")[2]

# Expose the two splitters as Spark UDFs that return strings.
get_mo_code = udf(f=split_mo_1, returnType=StringType())
get_mo_desc = udf(f=split_mo_2, returnType=StringType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
start = time.time()

# Load data.
# The crime CSVs start with a header row; with header=False that line would be
# parsed as a data record (nulls in the numeric columns, the column name as an MO code).
crime_csv_options = dict(header=True, schema=crimes_schema, quote='"', escape='"')

df1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    **crime_csv_options
)

df2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    **crime_csv_options
)

data = df1.union(df2)

# MO_codes.txt is free text ("<code> <description>" per line), not CSV. Parsed as
# CSV with a one-column schema, a description would be cut at its first comma.
# Read whole lines instead and skip blank ones, as the CSV reader did.
mo_df = (
    spark.read.text(
        "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"
    )
    .withColumnRenamed("value", "MO_data")
    .where(F.trim(F.col("MO_data")) != "")
)

# Properly split code from description
mo_df = (
    mo_df
    .withColumn("MO_code", get_mo_code(col("MO_data")))
    .withColumn("MO_desc", get_mo_desc(col("MO_data")))
    .drop("MO_data")
)

# One output row per MO code of every crime; rows with a null Mocodes produce none.
split_codes = data.withColumn("code", F.explode(F.split(F.col("Mocodes"), " ")))

# Group by and count codes
group_codes = split_codes.groupBy(col("code")).count()

# Join each code with its description (join strategy left to the optimizer)
join_desc = group_codes.join(mo_df, group_codes.code==mo_df.MO_code).drop("MO_code")

# Finally, sort the data
result = join_desc.orderBy(col("count"),ascending=False)

result.show(1000, truncate=False)
end = time.time()
print("Elapsed time: ",end-start)
# Print the physical plan to see which join strategy was chosen.
result.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-------+---------------------------------------------------------------------------------------+
|code|count  |MO_desc                                                                                |
+----+-------+---------------------------------------------------------------------------------------+
|0344|1002900|Removes vict property                                                                  |
|1822|548422 |Stranger                                                                               |
|0416|404773 |Hit-Hit w/ weapon                                                                      |
|0329|377536 |Vandalized                                                                             |
|0913|278618 |Victim knew Suspect                                                                    |
|2000|256188 |Domestic violence                                                                      |
|1300|219082 |Vehicle involved                                           

# Repeat with shuffle hash join

In [14]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1556,application_1765289937462_1542,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1545,application_1765289937462_1531,pyspark,idle,Link,Link,,
1555,application_1765289937462_1541,pyspark,idle,Link,Link,,
1556,application_1765289937462_1542,pyspark,idle,Link,Link,,✔


In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, instr, expr, udf
from pyspark.sql import functions as F
import time

# Explicit schemas, so Spark does not have to infer column types.
# Crime records: one (column name, Spark type) pair per CSV column, in file order.
# NOTE(review): the public LAPD export has a "Status Desc" column right after
# "Status"; confirm that this column list matches the header of the files used here.
crimes_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
        ("Vict Sex", StringType),
        ("Vict Descent", StringType),
        ("Premis Cd", IntegerType),
        ("Premis Desc", StringType),
        ("Weapon Used Cd", IntegerType),
        ("Weapon Desc", StringType),
        ("Status", StringType),
        ("Crm Cd 1", IntegerType),
        ("Crm Cd 2", IntegerType),
        ("Crm Cd 3", IntegerType),
        ("Crm Cd 4", IntegerType),
        ("LOCATION", StringType),
        ("Cross Street", StringType),
        ("LAT", DoubleType),
        ("LON", DoubleType),
    )
])

# MO lookup file: each record is kept as one raw string and split later.
mo_schema = StructType([StructField("MO_data", StringType())])

# Helpers that split one "<code> <description>" entry of the MO lookup file.
def split_mo_1(x):
    """Return the MO code, i.e. the text before the first space of ``x``.

    Returns None when ``x`` is None (a null column value), so the UDF built
    on top of this function does not fail on null rows.
    """
    if x is None:
        return None
    # Only the first token is needed, so stop splitting after one separator.
    return x.split(" ", 1)[0]

def split_mo_2(x):
    """Return the MO description, i.e. everything after the first space of ``x``.

    Returns "" when ``x`` contains no space and None when ``x`` is None.
    """
    if x is None:
        return None
    # partition() keeps the remainder untouched, including repeated spaces.
    return x.partition(" ")[2]

# Expose the two splitters as Spark UDFs that return strings.
get_mo_code = udf(f=split_mo_1, returnType=StringType())
get_mo_desc = udf(f=split_mo_2, returnType=StringType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Drop anything cached in this session so it cannot influence the timing.
spark.catalog.clearCache()
start = time.time()

# Load data.
# The crime CSVs start with a header row; with header=False that line would be
# parsed as a data record (nulls in the numeric columns, the column name as an MO code).
crime_csv_options = dict(header=True, schema=crimes_schema, quote='"', escape='"')

df1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    **crime_csv_options
)

df2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    **crime_csv_options
)

data = df1.union(df2)

# MO_codes.txt is free text ("<code> <description>" per line), not CSV. Parsed as
# CSV with a one-column schema, a description would be cut at its first comma.
# Read whole lines instead and skip blank ones, as the CSV reader did.
mo_df = (
    spark.read.text(
        "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"
    )
    .withColumnRenamed("value", "MO_data")
    .where(F.trim(F.col("MO_data")) != "")
)

# Properly split code from description
mo_df = (
    mo_df
    .withColumn("MO_code", get_mo_code(col("MO_data")))
    .withColumn("MO_desc", get_mo_desc(col("MO_data")))
    .drop("MO_data")
)

# One output row per MO code of every crime; rows with a null Mocodes produce none.
split_codes = data.withColumn("code", F.explode(F.split(F.col("Mocodes"), " ")))

# Group by and count codes
group_codes = split_codes.groupBy(col("code")).count()

# Join each code with its description, asking the planner for a shuffle hash join
join_desc = group_codes.hint("shuffle_hash").join(mo_df, group_codes.code==mo_df.MO_code).drop("MO_code")

# Finally, sort the data
result = join_desc.orderBy(col("count"),ascending=False)

# Only the top row is displayed here; the full listing is printed by the un-hinted run.
result.show(1, truncate=False)
end = time.time()
print("Elapsed time: ",end-start)
# Uncomment to inspect the physical plan and confirm the join strategy:
# result.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-------+---------------------+
|code|count  |MO_desc              |
+----+-------+---------------------+
|0344|1002900|Removes vict property|
+----+-------+---------------------+
only showing top 1 row

Elapsed time:  17.50543236732483

# Execute with sort-merge join

In [17]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1557,application_1765289937462_1543,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1545,application_1765289937462_1531,pyspark,idle,Link,Link,,
1555,application_1765289937462_1541,pyspark,idle,Link,Link,,
1557,application_1765289937462_1543,pyspark,idle,Link,Link,,✔


In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, instr, expr, udf
from pyspark.sql import functions as F
import time

# Explicit schemas, so Spark does not have to infer column types.
# Crime records: one (column name, Spark type) pair per CSV column, in file order.
# NOTE(review): the public LAPD export has a "Status Desc" column right after
# "Status"; confirm that this column list matches the header of the files used here.
crimes_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
        ("Vict Sex", StringType),
        ("Vict Descent", StringType),
        ("Premis Cd", IntegerType),
        ("Premis Desc", StringType),
        ("Weapon Used Cd", IntegerType),
        ("Weapon Desc", StringType),
        ("Status", StringType),
        ("Crm Cd 1", IntegerType),
        ("Crm Cd 2", IntegerType),
        ("Crm Cd 3", IntegerType),
        ("Crm Cd 4", IntegerType),
        ("LOCATION", StringType),
        ("Cross Street", StringType),
        ("LAT", DoubleType),
        ("LON", DoubleType),
    )
])

# MO lookup file: each record is kept as one raw string and split later.
mo_schema = StructType([StructField("MO_data", StringType())])

# Helpers that split one "<code> <description>" entry of the MO lookup file.
def split_mo_1(x):
    """Return the MO code, i.e. the text before the first space of ``x``.

    Returns None when ``x`` is None (a null column value), so the UDF built
    on top of this function does not fail on null rows.
    """
    if x is None:
        return None
    # Only the first token is needed, so stop splitting after one separator.
    return x.split(" ", 1)[0]

def split_mo_2(x):
    """Return the MO description, i.e. everything after the first space of ``x``.

    Returns "" when ``x`` contains no space and None when ``x`` is None.
    """
    if x is None:
        return None
    # partition() keeps the remainder untouched, including repeated spaces.
    return x.partition(" ")[2]

# Expose the two splitters as Spark UDFs that return strings.
get_mo_code = udf(f=split_mo_1, returnType=StringType())
get_mo_desc = udf(f=split_mo_2, returnType=StringType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# Drop anything cached in this session so it cannot influence the timing.
spark.catalog.clearCache()
start = time.time()

# Load data.
# The crime CSVs start with a header row; with header=False that line would be
# parsed as a data record (nulls in the numeric columns, the column name as an MO code).
crime_csv_options = dict(header=True, schema=crimes_schema, quote='"', escape='"')

df1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    **crime_csv_options
)

df2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    **crime_csv_options
)

data = df1.union(df2)

# MO_codes.txt is free text ("<code> <description>" per line), not CSV. Parsed as
# CSV with a one-column schema, a description would be cut at its first comma.
# Read whole lines instead and skip blank ones, as the CSV reader did.
mo_df = (
    spark.read.text(
        "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"
    )
    .withColumnRenamed("value", "MO_data")
    .where(F.trim(F.col("MO_data")) != "")
)

# Properly split code from description
mo_df = (
    mo_df
    .withColumn("MO_code", get_mo_code(col("MO_data")))
    .withColumn("MO_desc", get_mo_desc(col("MO_data")))
    .drop("MO_data")
)

# One output row per MO code of every crime; rows with a null Mocodes produce none.
split_codes = data.withColumn("code", F.explode(F.split(F.col("Mocodes"), " ")))

# Group by and count codes
group_codes = split_codes.groupBy(col("code")).count()

# Join each code with its description, asking the planner for a sort-merge join
join_desc = group_codes.hint("merge").join(mo_df, group_codes.code==mo_df.MO_code).drop("MO_code")

# Finally, sort the data
result = join_desc.orderBy(col("count"),ascending=False)

# Only the top row is displayed here; the full listing is printed by the un-hinted run.
result.show(1, truncate=False)
end = time.time()
print("Elapsed time: ",end-start)
# Uncomment to inspect the physical plan and confirm the join strategy:
# result.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-------+---------------------+
|code|count  |MO_desc              |
+----+-------+---------------------+
|0344|1002900|Removes vict property|
+----+-------+---------------------+
only showing top 1 row

Elapsed time:  18.069719552993774

# Execute with cartesian product

In [20]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1558,application_1765289937462_1544,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1545,application_1765289937462_1531,pyspark,idle,Link,Link,,
1555,application_1765289937462_1541,pyspark,idle,Link,Link,,
1558,application_1765289937462_1544,pyspark,idle,Link,Link,,✔


In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, instr, expr, udf
from pyspark.sql import functions as F
import time

# Explicit schemas, so Spark does not have to infer column types.
# Crime records: one (column name, Spark type) pair per CSV column, in file order.
# NOTE(review): the public LAPD export has a "Status Desc" column right after
# "Status"; confirm that this column list matches the header of the files used here.
crimes_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
        ("Vict Sex", StringType),
        ("Vict Descent", StringType),
        ("Premis Cd", IntegerType),
        ("Premis Desc", StringType),
        ("Weapon Used Cd", IntegerType),
        ("Weapon Desc", StringType),
        ("Status", StringType),
        ("Crm Cd 1", IntegerType),
        ("Crm Cd 2", IntegerType),
        ("Crm Cd 3", IntegerType),
        ("Crm Cd 4", IntegerType),
        ("LOCATION", StringType),
        ("Cross Street", StringType),
        ("LAT", DoubleType),
        ("LON", DoubleType),
    )
])

# MO lookup file: each record is kept as one raw string and split later.
mo_schema = StructType([StructField("MO_data", StringType())])

# Helpers that split one "<code> <description>" entry of the MO lookup file.
def split_mo_1(x):
    """Return the MO code, i.e. the text before the first space of ``x``.

    Returns None when ``x`` is None (a null column value), so the UDF built
    on top of this function does not fail on null rows.
    """
    if x is None:
        return None
    # Only the first token is needed, so stop splitting after one separator.
    return x.split(" ", 1)[0]

def split_mo_2(x):
    """Return the MO description, i.e. everything after the first space of ``x``.

    Returns "" when ``x`` contains no space and None when ``x`` is None.
    """
    if x is None:
        return None
    # partition() keeps the remainder untouched, including repeated spaces.
    return x.partition(" ")[2]

# Expose the two splitters as Spark UDFs that return strings.
get_mo_code = udf(f=split_mo_1, returnType=StringType())
get_mo_desc = udf(f=split_mo_2, returnType=StringType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Drop anything cached in this session so it cannot influence the timing.
spark.catalog.clearCache()

start = time.time()

# Load data.
# The crime CSVs start with a header row; with header=False that line would be
# parsed as a data record (nulls in the numeric columns, the column name as an MO code).
crime_csv_options = dict(header=True, schema=crimes_schema, quote='"', escape='"')

df1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    **crime_csv_options
)

df2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    **crime_csv_options
)

data = df1.union(df2)

# MO_codes.txt is free text ("<code> <description>" per line), not CSV. Parsed as
# CSV with a one-column schema, a description would be cut at its first comma.
# Read whole lines instead and skip blank ones, as the CSV reader did.
mo_df = (
    spark.read.text(
        "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"
    )
    .withColumnRenamed("value", "MO_data")
    .where(F.trim(F.col("MO_data")) != "")
)

# Properly split code from description
mo_df = (
    mo_df
    .withColumn("MO_code", get_mo_code(col("MO_data")))
    .withColumn("MO_desc", get_mo_desc(col("MO_data")))
    .drop("MO_data")
)

# One output row per MO code of every crime; rows with a null Mocodes produce none.
split_codes = data.withColumn("code", F.explode(F.split(F.col("Mocodes"), " ")))

# Group by and count codes
group_codes = split_codes.groupBy(col("code")).count()

# Join each code with its description, asking the planner for a
# shuffle-and-replicate nested loop join (cartesian product followed by the filter)
join_desc = group_codes.hint("shuffle_replicate_nl").join(mo_df, group_codes.code==mo_df.MO_code).drop("MO_code")

# Finally, sort the data
result = join_desc.orderBy(col("count"),ascending=False)

# Only the top row is displayed here; the full listing is printed by the un-hinted run.
result.show(1, truncate=False)
end = time.time()
print("Elapsed time: ",end-start)
# Uncomment to inspect the physical plan and confirm the join strategy:
# result.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-------+---------------------+
|code|count  |MO_desc              |
+----+-------+---------------------+
|0344|1002900|Removes vict property|
+----+-------+---------------------+
only showing top 1 row

Elapsed time:  20.218238353729248