In [17]:
# QUERY 1 - DataFrame API
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, TimestampType, DoubleType

# Obtain the SparkSession for the DataFrame run, asking for 4 executors
# with 2 cores and "4g" of memory each.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists — confirm these executor settings actually take effect here.
spark = (
    SparkSession.builder
    .appName("DF query 1 execution")
    .config("spark.executor.instances", "4")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Explicit schema for the crime CSV files, written as (column name, Spark type)
# pairs in declaration order. Nullability is not specified for any field, so
# StructField's default applies to all of them.
_crime_columns = [
    ("DR_NO", StringType),
    ("Date Rptd", TimestampType),
    ("DATE OCC", TimestampType),
    ("TIME OCC", StringType),
    ("AREA", StringType),
    ("AREA NAME", StringType),
    ("Rpt Dist No", StringType),
    ("Part 1-2", DoubleType),
    ("Crm Cd", StringType),
    ("Crm Cd Desc", StringType),
    ("Mocodes", StringType),
    ("Vict Age", StringType),
    ("Vict Sex", StringType),
    ("Vict Descent", StringType),
    ("Premis Cd", DoubleType),
    ("Premis Desc", StringType),
    ("Weapon Used Cd", StringType),
    ("Weapon Desc", StringType),
    ("Status", StringType),
    ("Status Desc", StringType),
    ("Crm Cd 1", StringType),
    ("Crm Cd 2", StringType),
    ("Crm Cd 3", StringType),
    ("Crm Cd 4", StringType),
    ("LOCATION", StringType),
    ("Cross Street", StringType),
    ("LAT", DoubleType),
    ("LON", DoubleType),
]
crime_schema = StructType(
    [StructField(column_name, spark_type()) for column_name, spark_type in _crime_columns]
)
# Wall-clock timer for the whole query; it is started before the reads.
start_time = time.time()

# Both exports are parsed with the same reader options and the explicit schema.
# NOTE(review): header=False means no line is skipped as a header — confirm the
# files really start with data rather than a column-name row.
csv_options = dict(header=False, schema=crime_schema)
crime1_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721//CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    **csv_options,
)
crime2_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721//CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    **csv_options,
)

# Stack the 2010-2019 and 2020-present records into one DataFrame.
crime_df = crime1_df.union(crime2_df)

# Keep only the rows whose crime description mentions "AGGRAVATED ASSAULT".
df_filtered = crime_df.filter(
    F.col("Crm Cd Desc").contains("AGGRAVATED ASSAULT")
)

# Define a function to classify age groups
def age_group(age):
    """Classify a victim age into a coarse age bracket.

    Args:
        age: Age in years as a number, or None when the age is missing
            (for example when the cast of "Vict Age" to int produced null).

    Returns:
        "Child" for ages below 18, "Young Adult" for 18-24, "Adult" for
        25-64 and "Elderly" for anything above 64. None when ``age`` is None.
    """
    # A null column value reaches a Python UDF as None; comparing None with an
    # int raises TypeError and would fail the whole job, so bail out early.
    if age is None:
        return None
    # The brackets are checked in ascending order, so each test only needs
    # the upper bound of its own bracket.
    if age < 18:
        return "Child"
    if age <= 24:
        return "Young Adult"
    if age <= 64:
        return "Adult"
    return "Elderly"

# Expose age_group to Spark as a UDF that returns strings.
age_group_udf = F.udf(age_group, StringType())

# Add the "Age Group" column, computed from "Vict Age" cast to int
# ("Vict Age" is declared as a string column in the schema).
df_with_age_group = df_filtered.withColumn(
    "Age Group",
    age_group_udf(F.col("Vict Age").cast("int")),
)

# Narrow the result down to the four columns of interest.
df_selected = df_with_age_group.select(
    "DR_NO",
    "Crm Cd Desc",
    "Vict Age",
    "Age Group",
)

# Oldest victims first; the sort key is the int cast so that ages are ordered
# numerically rather than as text.
sorted_by_age = df_selected.sort(F.col("Vict Age").cast("int").desc())

# Print the leading rows of the sorted result.
sorted_by_age.show()

# Stop the timer and report the elapsed wall-clock time since start_time.
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------------------+--------+---------+
|    DR_NO|         Crm Cd Desc|Vict Age|Age Group|
+---------+--------------------+--------+---------+
|200808494|ASSAULT WITH DEAD...|     120|  Elderly|
|132005705|ASSAULT WITH DEAD...|      99|  Elderly|
|211305883|ASSAULT WITH DEAD...|      99|  Elderly|
|230611363|ASSAULT WITH DEAD...|      99|  Elderly|
|201614222|ASSAULT WITH DEAD...|      99|  Elderly|
|180514402|INTIMATE PARTNER ...|      99|  Elderly|
|201218590|ASSAULT WITH DEAD...|      99|  Elderly|
|232008242|ASSAULT WITH DEAD...|      99|  Elderly|
|201211729|INTIMATE PARTNER ...|      99|  Elderly|
|151209236|ASSAULT WITH DEAD...|      99|  Elderly|
|201108948|ASSAULT WITH DEAD...|      99|  Elderly|
|221309255|ASSAULT WITH DEAD...|      99|  Elderly|
|201506790|ASSAULT WITH DEAD...|      99|  Elderly|
|180412016|ASSAULT WITH DEAD...|      99|  Elderly|
|201807416|ASSAULT WITH DEAD...|      99|  Elderly|
|220114827|ASSAULT WITH DEAD...|      99|  Elderly|
|201508635|A

In [19]:
# QUERY 1 - RDD API

import time
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
from pyspark import RDD

# Obtain the SparkSession for the RDD run, asking for 4 executors
# with 2 cores and "4g" of memory each.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists — confirm the app name and executor settings actually take effect here.
spark = (
    SparkSession.builder
    .appName("RDD query 1 execution")
    .config("spark.executor.instances", "4")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Explicit schema for the crime CSV files, written as (column name, Spark type)
# pairs in declaration order. Nullability is not specified for any field, so
# StructField's default applies to all of them.
_crime_columns = [
    ("DR_NO", StringType),
    ("Date Rptd", TimestampType),
    ("DATE OCC", TimestampType),
    ("TIME OCC", StringType),
    ("AREA", StringType),
    ("AREA NAME", StringType),
    ("Rpt Dist No", StringType),
    ("Part 1-2", DoubleType),
    ("Crm Cd", StringType),
    ("Crm Cd Desc", StringType),
    ("Mocodes", StringType),
    ("Vict Age", StringType),
    ("Vict Sex", StringType),
    ("Vict Descent", StringType),
    ("Premis Cd", DoubleType),
    ("Premis Desc", StringType),
    ("Weapon Used Cd", StringType),
    ("Weapon Desc", StringType),
    ("Status", StringType),
    ("Status Desc", StringType),
    ("Crm Cd 1", StringType),
    ("Crm Cd 2", StringType),
    ("Crm Cd 3", StringType),
    ("Crm Cd 4", StringType),
    ("LOCATION", StringType),
    ("Cross Street", StringType),
    ("LAT", DoubleType),
    ("LON", DoubleType),
]
crime_schema = StructType(
    [StructField(column_name, spark_type()) for column_name, spark_type in _crime_columns]
)

# Wall-clock timer for the whole query; it is started before the reads.
start_time = time.time()

# Read the crime data from CSV files into DataFrames using the explicit schema.
# NOTE(review): header=False means no line is skipped as a header — confirm the
# files really start with data rather than a column-name row.
crime1_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721//CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=False, schema=crime_schema)
crime2_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721//CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", header=False, schema=crime_schema)

# Convert DataFrames to RDDs of Row objects
crime1_rdd = crime1_df.rdd
crime2_rdd = crime2_df.rdd

# Union the RDDs
crime_rdd = crime1_rdd.union(crime2_rdd)

# Filter for crimes related to "AGGRAVATED ASSAULT".
# A null description shows up as None in the Row, and `"..." in None` raises
# TypeError, so rows without a description are excluded explicitly. This also
# matches the DataFrame version, where contains() on a null never passes the
# filter.
filtered_rdd = crime_rdd.filter(
    lambda row: row["Crm Cd Desc"] is not None
    and "AGGRAVATED ASSAULT" in row["Crm Cd Desc"]
)

# Define a function to classify age groups
def age_group(age):
    """Classify a victim age into a coarse age bracket.

    Args:
        age: Age in years as a number, or None when the age is missing or
            could not be parsed.

    Returns:
        "Child" for ages below 18, "Young Adult" for 18-24, "Adult" for
        25-64 and "Elderly" for anything above 64. None when ``age`` is None
        or is not comparable with a number.
    """
    try:
        # The brackets are checked in ascending order, so each test only
        # needs the upper bound of its own bracket.
        if age < 18:
            return "Child"
        if age <= 24:
            return "Young Adult"
        if age <= 64:
            return "Adult"
        return "Elderly"
    except TypeError:
        # Comparing None (or any non-numeric value) with an int raises
        # TypeError, not ValueError; report such ages as "no group".
        return None

# Function to process each row and add the age group column
def process_row(row):
    """Turn one crime record into a (DR_NO, Crm Cd Desc, Vict Age, age group) tuple.

    "Vict Age" is converted to an int only when it consists solely of digits;
    otherwise None is passed to age_group. The third tuple element is the
    original, unconverted "Vict Age" value.

    Args:
        row: A record supporting lookup by column name for "DR_NO",
            "Crm Cd Desc" and "Vict Age".

    Returns:
        The four-element tuple, or None when any step raises (for example a
        missing "Vict Age", which has no .isdigit()).
    """
    try:
        raw_age = row["Vict Age"]
        parsed_age = int(raw_age) if raw_age.isdigit() else None
        label = age_group(parsed_age)
        return (row["DR_NO"], row["Crm Cd Desc"], raw_age, label)
    except Exception:
        # Deliberate best-effort: a record that cannot be processed is
        # signalled with None rather than aborting the job.
        return None

# Build the output tuples and discard the records process_row rejected (None).
processed_rdd = (
    filtered_rdd
    .map(process_row)
    .filter(lambda record: record is not None)
)

# Order by the numeric value of "Vict Age" (tuple index 2), highest first;
# ages that are not purely digits are given the key -1.
sorted_rdd = processed_rdd.sortBy(
    lambda record: int(record[2]) if record[2].isdigit() else -1,
    ascending=False,
)

# Fetch the first 20 records of the sorted RDD.
top_20_rows = sorted_rdd.take(20)

# Print them one per line.
for row in top_20_rows:
    print(row)

# Stop the timer and report the elapsed wall-clock time since start_time.
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

('200808494', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '120', 'Elderly')
('131209628', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('131012876', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('132005705', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('141607930', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('141226352', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('140511426', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('101408873', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('111916851', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('111221144', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('110210817', 'INTIMATE PARTNER - AGGRAVATED ASSAULT', '99', 'Elderly')
('122117683', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', '99', 'Elderly')
('151209236', 'ASSAULT WITH DEADLY W