In [1]:
%run "nb_spatial_analysis_functions"

StatementMeta(, 96bdbf0a-a4b5-46a1-98de-30916349ac1a, 9, Finished, Available, Finished)

In [2]:
# Step 1: Read the force-boundary KML file from the lakehouse into memory
kml_path = "/lakehouse/default/Files/force kmls/cumbria.kml"
with open(kml_path, mode='r', encoding='utf-8') as kml_file:
    kml_content = kml_file.read()

# Step 2: Collect every (lon, lat) pair found in the KML placemarks.
# NOTE(review): the coordinates of all placemarks are appended to a single list
# (only the first <coordinates> element of each placemark is read), so this
# assumes the file describes one simple boundary ring — confirm for other KMLs.
soup = BeautifulSoup(kml_content, 'xml')
placemarks = soup.find_all('Placemark')

coords = []
for placemark in placemarks:
    coords_tag = placemark.find('coordinates')
    if not coords_tag:
        continue
    # KML tuples are whitespace-separated "lon,lat[,alt]" strings
    for coord in coords_tag.text.strip().split():
        lon_lat_parts = coord.split(',')[:2]
        try:
            lon, lat = (float(part) for part in lon_lat_parts)
        except ValueError:
            # Skip tuples that are non-numeric or have fewer than two components
            continue
        coords.append((lon, lat))

# Step 3: Build polygon and simplify
# Fail early with a clear message when the KML produced too few points to form
# a ring, instead of sending an empty/invalid `poly` value to the API later on.
if len(coords) < 3:
    raise ValueError(
        f"Expected at least 3 coordinates in {kml_path!r}, found {len(coords)}"
    )

# Simplification tolerance, expressed in the units of the coordinates (degrees).
# Presumably chosen to keep the `poly` request parameter short — TODO confirm.
SIMPLIFY_TOLERANCE_DEG = 0.04

polygon = Polygon(coords)
simplified = polygon.simplify(SIMPLIFY_TOLERANCE_DEG, preserve_topology=True)

# Step 4: Serialise the simplified boundary as "lat,lng:lat,lng:..."
# (the polygon stores (lon, lat), so each pair is swapped on output)
lat_lng_pairs = [f"{lat},{lon}" for lon, lat in simplified.exterior.coords]
polygon_str = ":".join(lat_lng_pairs)

# Step 5: Build the street-level crime request for the chosen month (YYYY-MM)
date = "2023-04"
url = f"https://data.police.uk/api/crimes-street/all-crime?poly={polygon_str}&date={date}"

StatementMeta(, 96bdbf0a-a4b5-46a1-98de-30916349ac1a, 10, Finished, Available, Finished)

In [3]:
# Fetch data
# requests applies no timeout by default, so an unresponsive server would block
# this cell forever; bound both the connect (10 s) and read (120 s) phases.
response = requests.get(url, timeout=(10, 120))
# Surface HTTP errors immediately rather than trying to parse an error body
response.raise_for_status()
data = response.json()

# Define nested schema
# Every leaf value is declared as a nullable string; the nested structs are
# built first and then composed into the top-level record schema.
def _string_field(field_name):
    """Return a nullable string StructField with the given name."""
    return StructField(field_name, StringType(), True)


street_schema = StructType([
    _string_field("id"),
    _string_field("name"),
])

location_schema = StructType([
    _string_field("latitude"),
    _string_field("longitude"),
    StructField("street", street_schema),
])

outcome_status_schema = StructType([
    _string_field("category"),
    _string_field("date"),
])

top_level_string_fields = [
    "category",
    "persistent_id",
    "location_subtype",
    "id",
    "location_type",
    "context",
    "month",
]

schema = StructType(
    [_string_field(field_name) for field_name in top_level_string_fields]
    + [
        StructField("location", location_schema),
        StructField("outcome_status", outcome_status_schema),
    ]
)

# Create DataFrame from the parsed JSON records using the explicit schema
df = spark.createDataFrame(data, schema=schema)

# Flatten: pull the nested location / outcome fields up to top-level columns.
# The two outcome fields are aliased so they do not clash with `category`.
flat_columns = [
    "category",
    "month",
    "location.latitude",
    "location.longitude",
    "location.street.name",
    col("outcome_status.category").alias("outcome_category"),
    col("outcome_status.date").alias("outcome_date"),
]
df_flat = df.select(*flat_columns)

# Expose the flattened data to Spark SQL and show a per-category row count
df_flat.createOrReplaceTempView("crimes")
category_counts = spark.sql("SELECT category, COUNT(*) AS count FROM crimes GROUP BY category")
category_counts.show()

StatementMeta(, 96bdbf0a-a4b5-46a1-98de-30916349ac1a, 11, Finished, Available, Finished)

+--------------------+-----+
|            category|count|
+--------------------+-----+
|anti-social-behav...|  374|
|            burglary|  102|
|       bicycle-theft|   21|
|criminal-damage-a...|  478|
|         other-theft|  202|
|               drugs|  106|
|possession-of-wea...|   45|
|        public-order|  360|
|             robbery|   14|
|         shoplifting|  127|
|       vehicle-crime|   87|
|theft-from-the-pe...|    8|
|       violent-crime| 1434|
|         other-crime|   46|
+--------------------+-----+



In [4]:
# Project the flattened `crimes` temp view, casting the string latitude and
# longitude columns to a numeric type; all other columns pass through unchanged.
# NOTE(review): Spark SQL `float` is a 4-byte single-precision type (~7
# significant digits). Consider `double` if full coordinate precision matters —
# confirm downstream consumers before changing the column type.
query = """
            Select 
                category
                ,month
                ,cast(latitude as float) as latitude
                ,cast(longitude as float) as longitude
                ,name
                ,outcome_category
                ,outcome_date
            from crimes
        """

# Run the projection; the result is the base frame enriched by the cells below
df_crimes = spark.sql(query)

StatementMeta(, 96bdbf0a-a4b5-46a1-98de-30916349ac1a, 12, Finished, Available, Finished)

In [5]:
# Add the H3 cell index for each crime location.
# `lat_long_to_h3_index` is not defined in this notebook (presumably loaded by
# the %run cell); the literal 9 is presumably the H3 resolution — TODO confirm.
h3_index_column = lat_long_to_h3_index(
    df_crimes["latitude"],
    df_crimes["longitude"],
    lit(9),
)
df_crimes = df_crimes.withColumn("h3_index_9", h3_index_column)

StatementMeta(, 96bdbf0a-a4b5-46a1-98de-30916349ac1a, 13, Finished, Available, Finished)

In [6]:
# Wrap the plain-Python `lat_long_to_wkt` helper as a Spark UDF returning a string
lat_long_to_wkt_udf = udf(lat_long_to_wkt, StringType())

# Add a `wkt_point` column computed from each row's latitude / longitude
wkt_point_column = lat_long_to_wkt_udf(
    df_crimes["latitude"],
    df_crimes["longitude"],
)
df_crimes = df_crimes.withColumn("wkt_point", wkt_point_column)

StatementMeta(, 96bdbf0a-a4b5-46a1-98de-30916349ac1a, 14, Finished, Available, Finished)

In [7]:
# Assign a sequential surrogate key (1..N) to every crime row.
# Many rows share the same `h3_index_9`, and row_number() numbers tied rows in
# an arbitrary order, so ordering by that column alone makes the keys differ
# between runs. The remaining columns are added as tie-breakers so that only
# fully identical rows can still swap positions.
# NOTE: the window has no partitionBy, so Spark processes all rows in a single
# partition; acceptable for a monthly extract of this size.
tie_breaker_columns = [
    column_name
    for column_name in df_crimes.columns
    if column_name not in ("h3_index_9", "crime_sk")
]
windowSpec = Window.orderBy("h3_index_9", *tie_breaker_columns)
df_crimes = df_crimes.withColumn("crime_sk", F.row_number().over(windowSpec))

StatementMeta(, 96bdbf0a-a4b5-46a1-98de-30916349ac1a, 15, Finished, Available, Finished)

In [8]:
# Render the enriched crimes DataFrame in the notebook for visual inspection
display(df_crimes)

StatementMeta(, 96bdbf0a-a4b5-46a1-98de-30916349ac1a, 16, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 01daa154-1d04-4dcf-b101-b0bb36ff6207)

In [None]:
# Persist the enriched crimes as the `fact_crimes` Delta table, replacing any
# existing data and allowing the stored schema to be replaced as well.
(
    df_crimes.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("fact_crimes")
)