In [21]:
from sedona.spark import *
from pyspark.sql.functions import col, trim, sum as Fsum
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

# Create Spark Session
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .getOrCreate()

# Create Sedona context
sedona = SedonaContext.create(spark)
start_time = time.time()

# Read GeoJSON data
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
    .option("multiLine", "true").load(geojson_path) \
    .selectExpr("explode(features) as features") \
    .select("features.*")

blocks_census = blocks_df.select(
    [col(f"properties.{col_name}").alias(col_name) for col_name in 
     blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]
).drop("properties").drop("type").filter(col("CITY") == "Los Angeles")

# Cast numeric columns
blocks_census = blocks_census.withColumn("POP_2010", col("POP_2010").cast("int")) \
                             .withColumn("HOUSING10", col("HOUSING10").cast("int")) \
                             .withColumn("ZCTA10", col("ZCTA10").cast("int"))

# Filter and aggregate
census = blocks_census.select("ZCTA10", "COMM", "POP_2010", "HOUSING10") \
    .filter((col("ZCTA10") > 0) & (col("POP_2010") > 0) & (col("HOUSING10") > 0)) \
    .filter(trim(col("COMM")).isNotNull() & (trim(col("COMM")) != "")) \
    .groupBy("ZCTA10", "COMM") \
    .agg(Fsum("POP_2010").alias("Total_Population_Zip_COMM"), 
         Fsum("HOUSING10").alias("Housing"))

# Read income data
income = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True)
income = income.withColumn("Estimated Median Income", 
                           F.regexp_replace(col("Estimated Median Income"), "[$,]", "").cast("float")) \
               .filter(col("Estimated Median Income").isNotNull())

# Join income and census
res3a = income.join(census, census["ZCTA10"] == income["Zip Code"]) \
    .withColumn("total_income", col("Estimated Median Income") * col("Housing")) \
    .groupBy("COMM").agg(
        F.sum("Total_Population_Zip_COMM").alias("total_population"),
        F.sum("total_income").alias("total_income")
    ).withColumn("average_income", col("total_income") / col("total_population"))

# Read crime data
dataframe1 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
dataframe2 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", header=True)
dataframe = dataframe1.union(dataframe2)

crime_dataset = dataframe.withColumn("point", ST_Point(col("LON"), col("LAT")))

# Spatial join and aggregation
res2 = crime_dataset.join(blocks_census, ST_Within(crime_dataset.point, blocks_census.geometry)) \
    .select("COMM", "POP_2010") \
    .groupBy("COMM") \
    .agg(F.sum("POP_2010").alias("TotalPopulation"), F.count("*").alias("NumberOfCrimes")) \
    .withColumn("crimes_per_person", col("NumberOfCrimes") / col("TotalPopulation"))

# Final join and result
res = res3a.join(res2, res3a["COMM"] == res2["COMM"], "inner") \
    .select(
        res3a["COMM"].alias("COMM"),
        "average_income",
        "crimes_per_person",
        "total_population",
        "total_income"
    ).orderBy("average_income")

# End timing
end_time = time.time()

# Show results
print(f"Time taken: {end_time - start_time:.2f} seconds")
res.show(n=res.count(), truncate=False)
print(f"Number of rows in the table: {res.count()}")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken: 5.08 seconds
+-----------------------+------------------+---------------------+----------------+-------------+
|COMM                   |average_income    |crimes_per_person    |total_population|total_income |
+-----------------------+------------------+---------------------+----------------+-------------+
|Vernon Central         |6624.606832349082 |0.0037792246412336095|48768           |3.23068826E8 |
|University Park        |6877.686715882044 |0.0035691453589901836|24789           |1.70490976E8 |
|Central                |6973.305663786775 |0.0037903478362198303|35418           |2.4698054E8  |
|South Park             |6975.786575847878 |0.0034128330484230088|35235           |2.4579184E8  |
|Watts                  |7755.220014122869 |0.0038045267128661666|39652           |3.07509984E8 |
|Green Meadows          |8029.932320581407 |0.008199889186064806 |19814           |1.59105079E8 |
|Florence-Firestone     |8079.274737614006 |0.003784982222596155 |43638           |3.52563391