# First Configuration

In [32]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1790,application_1765289937462_1774,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1773,application_1765289937462_1757,pyspark,idle,Link,Link,,
1777,application_1765289937462_1761,pyspark,idle,Link,Link,,
1780,application_1765289937462_1764,pyspark,idle,Link,Link,,
1781,application_1765289937462_1765,pyspark,idle,Link,Link,,
1784,application_1765289937462_1768,pyspark,idle,Link,Link,,
1789,application_1765289937462_1773,pyspark,idle,Link,Link,,
1790,application_1765289937462_1774,pyspark,idle,Link,Link,,✔


In [33]:
from sedona.spark import *

# Create sedona context
sedona = SedonaContext.create(spark)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import udf

from pyspark.sql.functions import col, sum, round, count
from pyspark.sql import functions as F
import time

income_schema =  StructType([
    StructField("Zip_code", IntegerType()),
    StructField("Community", StringType()),
    StructField("Est_med_income", StringType())
])

crime_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Descent", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType()),
])

# Convert dollar value to float
def money_to_int(x):
    if (x=="---"): return 0
    vals = x.split('$')[1].split(',')
    vals[1] = vals[1].split(' ')[0]
    res = ''.join(vals)
    return int(res)

money_to_int_udf = udf(money_to_int)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [35]:
start = time.time()

# Read census data for 2020
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Census_Blocks_2020.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# Read income for 2021
income_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_income_2021.csv",
    header=True,
    schema=income_schema,
    sep=";"
)

# Convert dollar values to integers
income_df = income_df.withColumn("median_income", money_to_int_udf(col("Est_med_income"))).drop("Est_med_income")

# Read crime data for period 2020-
crime_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
    header=True, \
    schema=crime_schema)

# Exclude data that don't belong to years 2020 and 2021
# Also exclude data in the null island
crime_df.withColumn("year",F.year(F.to_timestamp(col("DATE OCC"), "yyyy MMM dd hh:mm:ss a")))\
    .filter((col("year")==2020) | (col("year")==2021))\
    .drop("year")\
    .filter((col("LON")==0) & (col("LAT")==0))

# Data loading and preprocessing is over
# Perform main computatiom
# Join census blocks with income per zip code
census_income = flattened_df.join(income_df, flattened_df.ZCTA20==income_df.Zip_code)\
    .select("COMM","HOUSING20","POP20","median_income")\
    .withColumn("total_income", col("median_income")*col("HOUSING20"))\
    .filter(col("POP20")!=0)\
    .drop("HOUSING20", "median_income")

# Calculate income per capita per area
income_per_area = census_income.groupBy("COMM")\
    .agg(round(sum("total_income")/sum("POP20"),2).alias("Per Capita Income"))\
    .na.replace("", "Los Angeles County", subset=["COMM"])\
    .withColumnRenamed("COMM", "COMM_name")

# Aggregate areas over the geometry value
LA_areas = flattened_df.filter(col("CITY") == "Los Angeles") \
                .groupBy("COMM") \
                .agg(ST_Union_Aggr("geometry").alias("geometry"),sum("POP20").alias("Population"))

# Find the area for each crime
join_crime_area = crime_df.join(LA_areas, ST_Within(ST_Point(crime_df.LON, crime_df.LAT), LA_areas.geometry), "inner")\
    .select("COMM","DR_NO","Population")

# Aggregate crimes over areas to get average crime per person and area
crimes_per_person_x_area = join_crime_area.groupBy("COMM")\
    .agg((1000*count("DR_NO")/sum("Population")).alias("Crimes per area per 1K people"))\
    .na.replace("", "Los Angeles County", subset=["COMM"])

# Combine average crimes per person with income per person for each area
result = crimes_per_person_x_area.join(income_per_area, crimes_per_person_x_area.COMM==income_per_area.COMM_name).drop("COMM_name")
result.show(1000,truncate=False)

results_top10 = result.orderBy("Per Capita Income",ascending=False)
results_bottom10 = result.orderBy("Per Capita Income",ascending=True)
results_top10.show(10)
results_bottom10.show(10)

end = time.time()
print("Elapsed time: ",end-start)
result.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------+-----------------------------+-----------------+
|COMM                   |Crimes per area per 1K people|Per Capita Income|
+-----------------------+-----------------------------+-----------------+
|Toluca Terrace         |0.7363770250368189           |25786.81         |
|Elysian Park           |0.21235931195582927          |26141.35         |
|Longwood               |0.26062027625749284          |20725.21         |
|Green Meadows          |0.04729921483303377          |11848.86         |
|Cadillac-Corning       |0.1643115346697338           |33043.29         |
|Lincoln Heights        |0.03406690740614567          |18288.88         |
|Van Nuys               |0.01106941630967799          |20630.62         |
|Gramercy Place         |0.09433072351664937          |23247.01         |
|Faircrest Heights      |0.2745744096650192           |31042.24         |
|Boyle Heights          |0.01272070421818552          |14566.39         |
|Lafayette Square       |0.24254183846

# Second configuration

In the following 2 configurations we just want to measure execution times. Therefore, we don't present the execution plan (since its the same) and only print one row of each relation, since we have already printed the results above

In [26]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1785,application_1765289937462_1769,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1720,application_1765289937462_1704,pyspark,idle,Link,Link,,
1773,application_1765289937462_1757,pyspark,idle,Link,Link,,
1777,application_1765289937462_1761,pyspark,idle,Link,Link,,
1780,application_1765289937462_1764,pyspark,idle,Link,Link,,
1781,application_1765289937462_1765,pyspark,idle,Link,Link,,
1784,application_1765289937462_1768,pyspark,idle,Link,Link,,
1785,application_1765289937462_1769,pyspark,idle,Link,Link,,✔


In [27]:
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import udf

from pyspark.sql.functions import col, sum, round, count
from pyspark.sql import functions as F
import time

from sedona.spark import *

# Create sedona context
sedona = SedonaContext.create(spark)

income_schema =  StructType([
    StructField("Zip_code", IntegerType()),
    StructField("Community", StringType()),
    StructField("Est_med_income", StringType())
])

crime_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Descent", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType()),
])

# Convert dollar value to float
def money_to_int(x):
    if (x=="---"): return 0
    vals = x.split('$')[1].split(',')
    vals[1] = vals[1].split(' ')[0]
    res = ''.join(vals)
    return int(res)

money_to_int_udf = udf(money_to_int)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Create sedona context
sedona = SedonaContext.create(spark)

start = time.time()

# Read census data for 2020
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Census_Blocks_2020.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# Read income for 2021
income_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_income_2021.csv",
    header=True,
    schema=income_schema,
    sep=";"
)

# Convert dollar values to integers
income_df = income_df.withColumn("median_income",money_to_int_udf(col("Est_med_income"))).drop("Est_med_income")

# Read crime data for period 2020-
crime_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
    header=True, \
    schema=crime_schema)

# Exclude data that don't belong to years 2020 and 2021
# Also exclude data in the null island
crime_df.withColumn("year",F.year(F.to_timestamp(col("DATE OCC"), "yyyy MMM dd hh:mm:ss a")))\
    .filter((col("year")==2020) | (col("year")==2021))\
    .drop("year")\
    .filter((col("LON")==0) & (col("LAT")==0))

# Data loading and preprocessing is over
# Perform main computatiom
# Join census blocks with income per zip code
census_income = flattened_df.join(income_df, flattened_df.ZCTA20==income_df.Zip_code)\
    .select("COMM","HOUSING20","POP20","median_income")\
    .withColumn("total_income",col("median_income")*col("HOUSING20"))\
    .filter(col("POP20")!=0)\
    .drop("HOUSING20","median_income")

# Calculate income per capita per area
income_per_area = census_income.groupBy("COMM")\
    .agg(round(sum("total_income")/sum("POP20"),2).alias("Per Capita Income"))\
    .na.replace("", "Los Angeles County", subset=["COMM"])\
    .withColumnRenamed("COMM", "COMM_name")

# Aggregate areas over the geometry value
LA_areas = flattened_df.filter(col("CITY") == "Los Angeles") \
                .groupBy("COMM") \
                .agg(ST_Union_Aggr("geometry").alias("geometry"),sum("POP20").alias("Population"))

# Find the area for each crime
join_crime_area = crime_df.join(LA_areas, ST_Within(ST_Point(crime_df.LON, crime_df.LAT), LA_areas.geometry), "inner")\
    .select("COMM","DR_NO","Population")

# Aggregate crimes over areas to get average crime per person and area
crimes_per_person_x_area = join_crime_area.groupBy("COMM")\
    .agg((1000*count("DR_NO")/sum("Population")).alias("Crimes per area per 1K people"))\
    .na.replace("", "Los Angeles County", subset=["COMM"])

# Combine average crimes per person with income per person for each area
result = crimes_per_person_x_area.join(income_per_area, crimes_per_person_x_area.COMM==income_per_area.COMM_name).drop("COMM_name")
result.show(10,truncate=False)

results_top10 = result.orderBy("Per Capita Income",ascending=False)
results_bottom10 = result.orderBy("Per Capita Income",ascending=True)
results_top10.show(10)
results_bottom10.show(10)

end = time.time()
print("Elapsed time: ",end-start)
#result.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+-----------------------------+-----------------+
|COMM             |Crimes per area per 1K people|Per Capita Income|
+-----------------+-----------------------------+-----------------+
|Toluca Terrace   |0.7363770250368189           |25786.81         |
|Elysian Park     |0.21235931195582927          |26141.35         |
|Longwood         |0.26062027625749284          |20725.21         |
|Green Meadows    |0.04729921483303377          |11848.86         |
|Cadillac-Corning |0.1643115346697338           |33043.29         |
|Lincoln Heights  |0.03406690740614567          |18288.88         |
|Van Nuys         |0.01106941630967799          |20630.62         |
|Gramercy Place   |0.09433072351664937          |23247.01         |
|Faircrest Heights|0.2745744096650192           |31042.24         |
|Boyle Heights    |0.01272070421818552          |14566.39         |
+-----------------+-----------------------------+-----------------+
only showing top 10 rows

+-------------------+-

# Third configuration

In [29]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "8",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1787,application_1765289937462_1771,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1720,application_1765289937462_1704,pyspark,idle,Link,Link,,
1773,application_1765289937462_1757,pyspark,idle,Link,Link,,
1777,application_1765289937462_1761,pyspark,idle,Link,Link,,
1780,application_1765289937462_1764,pyspark,idle,Link,Link,,
1781,application_1765289937462_1765,pyspark,idle,Link,Link,,
1784,application_1765289937462_1768,pyspark,idle,Link,Link,,
1786,application_1765289937462_1770,pyspark,idle,Link,Link,,
1787,application_1765289937462_1771,pyspark,idle,Link,Link,,✔


In [30]:
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType
from pyspark.sql.functions import udf

from pyspark.sql.functions import col, sum, round, count
from pyspark.sql import functions as F
import time

from sedona.spark import *

# Create sedona context
sedona = SedonaContext.create(spark)

income_schema =  StructType([
    StructField("Zip_code", IntegerType()),
    StructField("Community", StringType()),
    StructField("Est_med_income", StringType())
])

crime_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Descent", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType()),
])

# Convert dollar value to float
def money_to_int(x):
    if (x=="---"): return 0
    vals = x.split('$')[1].split(',')
    vals[1] = vals[1].split(' ')[0]
    res = ''.join(vals)
    return int(res)

money_to_int_udf = udf(money_to_int)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [31]:
# Create sedona context
sedona = SedonaContext.create(spark)

start = time.time()

# Read census data for 2020
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Census_Blocks_2020.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")

# Read income for 2021
income_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_income_2021.csv",
    header=True,
    schema=income_schema,
    sep=";"
)

# Convert dollar values to integers
income_df = income_df.withColumn("median_income",money_to_int_udf(col("Est_med_income"))).drop("Est_med_income")

# Read crime data for period 2020-
crime_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
    header=True, \
    schema=crime_schema)

# Exclude data that don't belong to years 2020 and 2021
# Also exclude data in the null island
crime_df.withColumn("year",F.year(F.to_timestamp(col("DATE OCC"), "yyyy MMM dd hh:mm:ss a")))\
    .filter((col("year")==2020) | (col("year")==2021))\
    .drop("year")\
    .filter((col("LON")==0) & (col("LAT")==0))

# Data loading and preprocessing is over
# Perform main computatiom
# Join census blocks with income per zip code
census_income = flattened_df.join(income_df, flattened_df.ZCTA20==income_df.Zip_code)\
    .select("COMM","HOUSING20","POP20","median_income")\
    .withColumn("total_income",col("median_income")*col("HOUSING20"))\
    .filter(col("POP20")!=0)\
    .drop("HOUSING20","median_income")

# Calculate income per capita per area
income_per_area = census_income.groupBy("COMM")\
    .agg(round(sum("total_income")/sum("POP20"),2).alias("Per Capita Income"))\
    .na.replace("", "Los Angeles County", subset=["COMM"])\
    .withColumnRenamed("COMM", "COMM_name")

# Aggregate areas over the geometry value
LA_areas = flattened_df.filter(col("CITY") == "Los Angeles") \
                .groupBy("COMM") \
                .agg(ST_Union_Aggr("geometry").alias("geometry"),sum("POP20").alias("Population"))

# Find the area for each crime
join_crime_area = crime_df.join(LA_areas, ST_Within(ST_Point(crime_df.LON, crime_df.LAT), LA_areas.geometry), "inner")\
    .select("COMM","DR_NO","Population")

# Aggregate crimes over areas to get average crime per person and area
crimes_per_person_x_area = join_crime_area.groupBy("COMM")\
    .agg((1000*count("DR_NO")/sum("Population")).alias("Crimes per area per 1K people"))\
    .na.replace("", "Los Angeles County", subset=["COMM"])

# Combine average crimes per person with income per person for each area
result = crimes_per_person_x_area.join(income_per_area, crimes_per_person_x_area.COMM==income_per_area.COMM_name).drop("COMM_name")
result.show(10,truncate=False)

results_top10 = result.orderBy("Per Capita Income",ascending=False)
results_bottom10 = result.orderBy("Per Capita Income",ascending=True)
results_top10.show(10)
results_bottom10.show(10)

end = time.time()
print("Elapsed time: ",end-start)
#result.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+-----------------------------+-----------------+
|COMM             |Crimes per area per 1K people|Per Capita Income|
+-----------------+-----------------------------+-----------------+
|Toluca Terrace   |0.7363770250368189           |25786.81         |
|Elysian Park     |0.21235931195582927          |26141.35         |
|Longwood         |0.26062027625749284          |20725.21         |
|Green Meadows    |0.04729921483303377          |11848.86         |
|Cadillac-Corning |0.1643115346697338           |33043.29         |
|Lincoln Heights  |0.03406690740614567          |18288.88         |
|Van Nuys         |0.01106941630967799          |20630.62         |
|Gramercy Place   |0.09433072351664937          |23247.01         |
|Faircrest Heights|0.2745744096650192           |31042.24         |
|Boyle Heights    |0.01272070421818552          |14566.39         |
+-----------------+-----------------------------+-----------------+
only showing top 10 rows

+-------------------+-