In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "1g",
        "spark.executor.cores": "1",
        "spark.driver.memory": "2g"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2933,application_1732639283265_2892,pyspark,idle,Link,Link,,
2944,application_1732639283265_2903,pyspark,idle,Link,Link,,
2947,application_1732639283265_2906,pyspark,idle,Link,Link,,
2966,application_1732639283265_2924,pyspark,idle,Link,Link,,
2971,application_1732639283265_2929,pyspark,idle,Link,Link,,
2975,application_1732639283265_2933,pyspark,idle,Link,Link,,
2976,application_1732639283265_2934,pyspark,idle,Link,Link,,
2991,application_1732639283265_2949,pyspark,idle,Link,Link,,
3000,application_1732639283265_2958,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col
from pyspark.sql import functions as F
import time

spark = SparkSession \
    .builder \
    .appName("DF query 1 execution") \
    .getOrCreate()

# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))



Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3002,application_1732639283265_2960,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 4
Executor Memory: 1g
Executor Cores: 1

In [3]:

# Spark DataFrame code

# Define the schema
# victim_schema = StructType([
#     StructField("Vict Age", StringType(), True),
#     StructField("Crm Cd", IntegerType(), True)  # IntegerType for numeric crm_cd
# ])

victim19 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header = True
)

victim20 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",
    header = True
)

victim_df19 = victim19.select("Vict Age", "Crm Cd")

victim_df20 = victim20.select("Vict Age", "Crm Cd")


# Combine the two datasets
victim_df = victim_df19.union(victim_df20)

# print("victim_df:")
# victim_df.show(10)


victim_df_cleaned = victim_df.filter(
    (victim_df["Crm Cd"] == 230) |
    (victim_df["Crm Cd"] == 235) |
    (victim_df["Crm Cd"] == 236)
)

# print("victim_df_cleaned:")
# victim_df_cleaned.show(10)

victim_df_categorized = victim_df_cleaned.withColumn(
    "Vict Age",
    F.when((victim_df_cleaned["Vict Age"] >= "0") & (victim_df_cleaned["Vict Age"] < "18"), "children")
    .when((victim_df_cleaned["Vict Age"] >= "18") & (victim_df_cleaned["Vict Age"] <= "24"), "young_adults")
    .when((victim_df_cleaned["Vict Age"] >= "25") & (victim_df_cleaned["Vict Age"] <= "64"), "adults")
    .when(victim_df_cleaned["Vict Age"] > "64", "elders")
    .otherwise("unknown")
)
victim_df_categorized = victim_df_categorized.filter(victim_df_categorized["Vict Age"] != "unknown")
# Show the result
# print("victim_df_categorized:")
# victim_df_categorized.show(10)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
#map-reduce

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
start_time = time.time()

victim_df_reduced = victim_df_categorized.select("Vict Age")

victim_df_partitioned = victim_df_categorized.repartition(4)

victim_df_reduced = victim_df_partitioned.groupby("Vict Age").count()

victim_df_reduced = victim_df_reduced.withColumn("count", victim_df_reduced["count"].cast("int"))

victim_df_reduced = victim_df_reduced.orderBy('count', ascending=False)


victim_df_reduced.show()


end_time = time.time()
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")
# victim_df_reduced.explain(True)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|    Vict Age| count|
+------------+------+
|      adults|121883|
|young_adults| 33825|
|    children| 14194|
|      elders|  6704|
+------------+------+

Time taken: 21.43 seconds

In [6]:
# import time

# sc = SparkSession \
#     .builder \
#     .appName("result") \
#     .getOrCreate() \
#     .sparkContext


# #we have a preprocessed victim_df_categorized
# victim_rdd = victim_df_categorized.rdd

# start_time = time.time()
# victim_rdd = victim_rdd.repartition(4)
# result = victim_rdd \
#     .map(lambda x: (x[0], 1)) \
#     .reduceByKey(lambda x,y: x+y) \
#     .sortBy(lambda x: x[1], ascending=False)

# end_time = time.time()


# print(result.collect())

# elapsed_time = end_time - start_time
# print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
#rdd

In [4]:
import time

sc = SparkSession \
    .builder \
    .appName("result") \
    .getOrCreate() \
    .sparkContext





FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
#we have a preprocessed victim_df_categorized
start_time = time.time()
victim_df_reduced = victim_df_categorized.select("Vict Age")
victim_df_reduced=victim_df_reduced.collect()
#print(type(victim_df_reduced))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
victim_rdd = sc.parallelize(victim_df_reduced, 4) #4 executors --> 4 partitions (make it RDD)
#!!?
# victim_rdd = victim_rdd.repartition(4)
result = victim_rdd \
    .map(lambda x: (x[0], 1)) \
    .reduceByKey(lambda x,y: x+y) \
    .sortBy(lambda x: x[1], ascending=False)


print(result.collect())

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('adults', 121883), ('young_adults', 33825), ('children', 14194), ('elders', 6704)]
Time taken: 26.84 seconds

In [7]:
#IF WE RE-RUN THIS, IT CACHES SOME DATA SO WE HAVE RESULTS AT 1/4 OF THE TIME -->RDD Persistence
start_time = time.time()

victim_rdd = sc.parallelize(victim_df_reduced, 4)

# victim_rdd = victim_rdd.repartition(4)
result = victim_rdd \
    .map(lambda x: (x[0], 1)) \
    .reduceByKey(lambda x,y: x+y) \
    .sortBy(lambda x: x[1], ascending=False)


print(result.collect())

end_time = time.time()

elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('adults', 121883), ('young_adults', 33825), ('children', 14194), ('elders', 6704)]
Time taken: 4.64 seconds