In [1]:
from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

# Create spark Session
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Census_Blocks_2020.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")
# Print schema
flattened_df.printSchema()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1268,application_1765289937462_1258,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG20: string (nullable = true)
 |-- BG20FIP_CURRENT: string (nullable = true)
 |-- BGFIP20: string (nullable = true)
 |-- CB20: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOMM: string (nullable = true)
 |-- CITYCOMM_CURRENT: string (nullable = true)
 |-- CITY_CURRENT: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- COMM_CURRENT: string (nullable = true)
 |-- COUNTY: string (nullable = true)
 |-- CT20: string (nullable = true)
 |-- CTCB20: string (nullable = true)
 |-- FEAT_TYPE: string (nullable = true)
 |-- FIP20: string (nullable = true)
 |-- FIP_CURRENT: string (nullable = true)
 |-- HD22: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING20: long (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP20: long (nullable = true)
 |-- SPA22: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP21: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: 

In [2]:
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, DoubleType, StringType

income_schema =  StructType([
    StructField("Zip_code", IntegerType()),
    StructField("Community", StringType()),
    StructField("Est_med_income", StringType())
])

crime_schema = StructType([
    StructField("DR_NO", IntegerType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", IntegerType()),
    StructField("AREA", IntegerType()),
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", IntegerType()),
    StructField("Part 1-2", IntegerType()),
    StructField("Crm Cd", IntegerType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", IntegerType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", IntegerType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Descent", StringType()),
    StructField("Crm Cd 1", IntegerType()),
    StructField("Crm Cd 2", IntegerType()),
    StructField("Crm Cd 3", IntegerType()),
    StructField("Crm Cd 4", IntegerType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    StructField("LAT", DoubleType()),
    StructField("LON", DoubleType()),
])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [27]:
from pyspark.sql.functions import udf, sum, round, count
from pyspark.sql import functions as F
import time

start = time.time()

income_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_income_2021.csv",
    header=True,
    schema=income_schema,
    sep=";"
)

# Load data from the 2 buckets and combine them
crime_df1 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv", \
    header=True, \
    schema=crime_schema)

crime_df2 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", \
    header=True, \
    schema=crime_schema)

crime_df = crime_df1.union(crime_df2)

# Convert dollar value to float
def money_to_int(x):
    if (x=="---"): return 0
    vals = x.split('$')[1].split(',')
    vals[1] = vals[1].split(' ')[0]
    res = ''.join(vals)
    return int(res)

money_to_int_udf = udf(money_to_int)
income_df = income_df.withColumn("median_income",money_to_int_udf(col("Est_med_income"))).drop("Est_med_income")

# Join census blocks with income per zip code
census_income = flattened_df.join(income_df, flattened_df.ZCTA20==income_df.Zip_code)\
    .select("COMM","HOUSING20","median_income")\
    .filter(col("HOUSING20")!=0)

# Calculate income per capita per area
income_per_area = census_income.groupBy("COMM")\
    .agg(round(sum("median_income")/sum("HOUSING20"),2).alias("AEP"))\
    .na.replace("", "Los Angeles County", subset=["COMM"])\
    .withColumnRenamed("COMM", "COMM_name")

# Aggregate areas over the geometry value
LA_areas = flattened_df.filter(col("CITY") == "Los Angeles") \
                .groupBy("COMM") \
                .agg(ST_Union_Aggr("geometry").alias("geometry"),sum("HOUSING20").alias("Population"))

# Find the area for each crime
join_crime_area = crime_df.join(LA_areas, ST_Within(ST_Point(crime_df.LON, crime_df.LAT), LA_areas.geometry), "inner")\
    .select("COMM","DR_NO","Population")

# Aggregate crimes over areas to get average crime per person and area
crimes_per_person_x_area = join_crime_area.groupBy("COMM")\
    .agg((1000*count("DR_NO")/sum("Population")).alias("Crimes per area per 1K people"))\
    .na.replace("", "Los Angeles County", subset=["COMM"])

result = crimes_per_person_x_area.join(income_per_area, crimes_per_person_x_area.COMM==income_per_area.COMM_name).drop("COMM_name")
result.show(1000,truncate=False)

end = time.time()
print("Elapsed time: ",end-start)
result.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------+-----------------------------+--------+
|COMM                   |Crimes per area per 1K people|AEP     |
+-----------------------+-----------------------------+--------+
|Toluca Terrace         |1.763668430335097            |544.63  |
|Elysian Park           |0.468384074941452            |779.94  |
|Longwood               |0.6747638326585695           |760.35  |
|Green Meadows          |0.1869857890800299           |1446.63 |
|Cadillac-Corning       |0.4434589800443459           |976.6   |
|Lincoln Heights        |0.10263779123473263          |956.22  |
|Van Nuys               |0.0316646084671163           |586.88  |
|Gramercy Place         |0.25131942699170645          |2127.25 |
|Faircrest Heights      |0.6939625260235948           |1898.5  |
|Boyle Heights          |0.044027649363800464         |921.66  |
|Lafayette Square       |0.6369426751592356           |897.59  |
|Granada Hills          |0.0524714030853185           |2446.12 |
|North Hills            |