In [None]:
# RDD implementation

In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType

# Create the Spark session. Both handles are kept: `spark` drives the CSV
# reader below and `sc` is its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("Query 3 with RDD")
    .getOrCreate()
)
sc = spark.sparkContext

# header=False keeps every line of the files as a data row; the literal
# "Mocodes" token is filtered out again when the codes are split below.
df1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=False,
    quote='"',
    escape='"'
)

df2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=False,
    quote='"',
    escape='"'
)

# Drop down to the RDD API and stack the two extracts into one RDD of rows.
data1 = df1.rdd
data2 = df2.rdd
data = data1.union(data2)

mo_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt",
    header=False,
    quote='"',
    escape='"'
)

# Cut the first field of every MO line at its first space, giving
# (code, description) pairs that can be joined on the code.
# NOTE(review): the file is parsed as CSV, so a description containing a comma
# would be truncated at that comma - confirm the file has none.
mo_codes = mo_df.rdd
mo_codes = mo_codes.map(lambda x: x[0].split(" ")).map(lambda x: (x[0], ' '.join(x[1:])))

# Field 10 is the space-separated list of MO codes of a crime record.
# Rows without codes are skipped; every remaining code becomes its own element.
split_mos = data.filter(lambda x: x[10] is not None).flatMap(lambda x: x[10].split(" ")).filter(lambda x: x != "Mocodes")
# Count the occurrences of each code: (code, count).
reduce_mos = split_mos.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

# Join the counts (not an undefined leftover name) with the descriptions:
# (code, (count, description)).
joined_data = reduce_mos.join(mo_codes)
# Re-key by count so sortByKey can order the codes from most to least frequent.
sorted_data = joined_data.map(lambda x: (x[1][0], (x[0], x[1][1]))).sortByKey(ascending=False)
# collect() brings every partition to the driver, so no coalesce(1) is needed first.
for item in sorted_data.collect():
    print(item)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(1002900, ('0344', 'Removes vict property'))
(548422, ('1822', 'Stranger'))
(404773, ('0416', 'Hit-Hit w/ weapon'))
(377536, ('0329', 'Vandalized'))
(278618, ('0913', 'Victim knew Suspect'))
(256188, ('2000', 'Domestic violence'))
(219082, ('1300', 'Vehicle involved'))
(213165, ('0400', 'Force used'))
(177470, ('1402', 'Evidence Booked (any crime)'))
(131229, ('1609', 'Smashed'))
(122108, ('1309', 'Susp uses vehicle'))
(120238, ('1202', 'Victim was aged (60 & over) or blind/physically disabled/unable to care for self'))
(120159, ('0325', 'Took merchandise'))
(118073, ('1814', 'Susp is/was current/former boyfriend/girlfriend'))
(116763, ('0444', 'Pushed'))
(115589, ('1501', 'Other MO (see rpt)'))
(113609, ('1307', 'Breaks window'))
(105665, ('0334', 'Brandishes weapon'))
(93426, ('2004', 'Suspect is homeless/transient'))
(83562, ('0432', 'Intimidation'))
(81230, ('0342', 'Multi-susps overwhelm'))
(81036, ('0421', 'Threaten to kill'))
(78910, ('0906', 'Gangs'))
(77442, ('0429', 'Vict kno

In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import col, instr, expr, udf
from pyspark.sql import functions as F

# Session for the DataFrame version of the query.
spark = (
    SparkSession.builder
    .appName("Query 3 with DataFrame")
    .getOrCreate()
)

# Explicit schema of the crime CSVs: (column name, Spark type) in file order.
crimes_schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in [
        ("DR_NO", IntegerType),
        ("Date Rptd", StringType),
        ("DATE OCC", StringType),
        ("TIME OCC", IntegerType),
        ("AREA", IntegerType),
        ("AREA NAME", StringType),
        ("Rpt Dist No", IntegerType),
        ("Part 1-2", IntegerType),
        ("Crm Cd", IntegerType),
        ("Crm Cd Desc", StringType),
        ("Mocodes", StringType),
        ("Vict Age", IntegerType),
        ("Vict Sex", StringType),
        ("Vict Descent", StringType),
        ("Premis Cd", IntegerType),
        ("Premis Desc", StringType),
        ("Weapon Used Cd", IntegerType),
        ("Weapon Desc", StringType),
        ("Status", StringType),
        ("Crm Cd 1", IntegerType),
        ("Crm Cd 2", IntegerType),
        ("Crm Cd 3", IntegerType),
        ("Crm Cd 4", IntegerType),
        ("LOCATION", StringType),
        ("Cross Street", StringType),
        ("LAT", DoubleType),
        ("LON", DoubleType),
    ]
])

# NOTE(review): header=False means a header line, if the files have one, is
# read as an ordinary data row - confirm this is intended.
df1 = (
    spark.read
    .schema(crimes_schema)
    .options(header=False, quote='"', escape='"')
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv")
)

df2 = (
    spark.read
    .schema(crimes_schema)
    .options(header=False, quote='"', escape='"')
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv")
)

# Both extracts share crimes_schema, so they can be stacked into one frame.
data = df1.union(df2)

# The MO code file is loaded as one raw string column per line.
mo_schema = StructType([StructField("MO_data", StringType())])

mo_df = (
    spark.read
    .schema(mo_schema)
    .options(header=False, quote='"', escape='"')
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt")
)

def split_mo_1(x):
    """Return the code part of a raw "<code> <description>" line.

    Args:
        x: One raw line as a string, or None (a null cell when run as a UDF).

    Returns:
        The text before the first space (the whole string when it contains
        no space), or None when ``x`` is None.
    """
    # Without this guard a None input would raise AttributeError on .partition.
    if x is None:
        return None
    code, _, _ = x.partition(" ")
    return code

def split_mo_2(x):
    """Return the description part of a raw "<code> <description>" line.

    Args:
        x: One raw line as a string, or None (a null cell when run as a UDF).

    Returns:
        Everything after the first space, unchanged (an empty string when the
        line contains no space), or None when ``x`` is None.
    """
    # Without this guard a None input would raise AttributeError on .partition.
    if x is None:
        return None
    # partition keeps the remainder verbatim, the same result as splitting on
    # single spaces and re-joining everything after the first piece.
    _, _, description = x.partition(" ")
    return description

# Expose the two plain-Python splitters as Spark UDFs that return strings.
get_mo_code = udf(split_mo_1, returnType=StringType())
get_mo_desc = udf(split_mo_2, returnType=StringType())

# Derive the code and description columns from the raw line, then drop the raw line.
mo_df = (
    mo_df
    .withColumn("MO_code", get_mo_code(F.col("MO_data")))
    .withColumn("MO_desc", get_mo_desc(F.col("MO_data")))
    .drop("MO_data")
)

# Alternative kept for reference (disabled): join on substring containment
# instead of exploding the codes. Possibly slower.
#joined_df = data.join(mo_df, expr("instr(Mocodes, MO_code) > 0"),"inner")
#grouped_df = joined_df.groupBy(col("MO_code"),col("MO_desc")).count()
#sorted_df = grouped_df.orderBy(col("count"),ascending=False)
#sorted_df.show(1000, truncate=False)

# One output row per (crime record, code): split the space-separated code
# list into an array and explode it.
split_codes = (
    data
    .withColumn("codes_array", F.split(F.col("Mocodes"), " "))
    .withColumn("code", F.explode(F.col("codes_array")))
    .drop("codes_array")
)

# Occurrences per code.
group_codes = split_codes.groupBy("code").count()

# Attach the description; the join is an inner join, so codes without a
# matching MO_code row are dropped.
join_desc = (
    group_codes
    .join(mo_df, group_codes["code"] == mo_df["MO_code"])
    .drop("MO_code")
)

# Most frequent codes first.
result = join_desc.orderBy(F.col("count").desc())
result.show(1000, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-------+---------------------------------------------------------------------------------------+
|code|count  |MO_desc                                                                                |
+----+-------+---------------------------------------------------------------------------------------+
|0344|1002900|Removes vict property                                                                  |
|1822|548422 |Stranger                                                                               |
|0416|404773 |Hit-Hit w/ weapon                                                                      |
|0329|377536 |Vandalized                                                                             |
|0913|278618 |Victim knew Suspect                                                                    |
|2000|256188 |Domestic violence                                                                      |
|1300|219082 |Vehicle involved                                           