# KDE Sampling

police.uk data has high (though possibly perturbed) spatial resolution, but very poor temporal resolution (month of occurrence only) and only broad categorical resolution:

- Other theft
- Violence and sexual offences
- Bicycle theft
- Criminal damage and arson
- Public order
- Robbery
- Shoplifting
- Vehicle crime
- Drugs
- Possession of weapons
- Theft from the person
- Burglary
- Other crime

This notebook creates a Kernel Density Estimate (KDE) over an entire force area, for one of the above crime categories, using three years of data.

Synthetic crimes can then be sampled from this spatial distribution.


In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [107]:
import geopandas as gpd
import numpy as np
from scipy.stats import gaussian_kde

from utils import extract_crime_data

In [108]:
# One seeded generator drives the notebook's random draws.
SEED = 19937
rng = np.random.default_rng(SEED)

CATEGORY = "Public order"

# Load every recorded crime from the archive, then keep only the rows whose
# "Crime type" matches CATEGORY. No other filtering is applied in this cell.
all_crimes = extract_crime_data("./data/wy202204-202503.zip")
is_selected_category = all_crimes["Crime type"].eq(CATEGORY)
crime_data = all_crimes.loc[is_selected_category]

In [109]:
# Display the crimes that remain after filtering to CATEGORY.
crime_data

Unnamed: 0_level_0,Month,Reported by,Falls within,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context,Outcome type,geometry
Crime ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cde2b5f3ded5c85add5aafdb536bdfe31ecf3dae4399b4d38948092ea8a8a301,2022-04,West Yorkshire Police,West Yorkshire Police,On or near Mount Pleasant,E01010692,Bradford 001D,Public order,Investigation complete; no suspect identified,,Investigation complete; no suspect identified,POINT (412145.992 447319.956)
95aaf13495d017cde7ad486e4e57b2899c1b1c16b28b953d752c76d70d5b1a7c,2022-04,West Yorkshire Police,West Yorkshire Police,On or near Bus/Coach Station,E01010692,Bradford 001D,Public order,Unable to prosecute suspect,,Unable to prosecute suspect,POINT (411854.969 447613.948)
bbe61d934f89b85f42e2e6a812706779a9ed8a0106d3dd8173de1ab288022b17,2022-04,West Yorkshire Police,West Yorkshire Police,On or near Back Middleton Road,E01010694,Bradford 002C,Public order,Unable to prosecute suspect,,Unable to prosecute suspect,POINT (411289.979 447872.023)
259057f8d09ba2423f3f1cee456240754eb2cb27b9ace24064320777155cafed,2022-04,West Yorkshire Police,West Yorkshire Police,On or near Supermarket,E01010695,Bradford 002D,Public order,Investigation complete; no suspect identified,,Investigation complete; no suspect identified,POINT (411717 447717.967)
10b856f1e9d9b4ece555be9ad38198ec7745ca421a383f658eda8ce75c6c0a30,2022-04,West Yorkshire Police,West Yorkshire Police,On or near Back Grove Road,E01010695,Bradford 002D,Public order,Investigation complete; no suspect identified,,Investigation complete; no suspect identified,POINT (411587.999 447648.998)
...,...,...,...,...,...,...,...,...,...,...,...
a13de3868acbdb6b12499c6225a035dc8f1bcd8b82bdc6b3931336f8bd009aac,2025-03,West Yorkshire Police,West Yorkshire Police,On or near Edna Street,E01011859,Wakefield 043A,Public order,Awaiting court outcome,,Suspect charged,POINT (447099.03 411218.977)
6d2686d2eb6df343e68490dbf6cb11721a8a764c2baad2fc545e71190e91a457,2025-03,West Yorkshire Police,West Yorkshire Police,On or near Burton Street,E01011869,Wakefield 043B,Public order,Unable to prosecute suspect,,Unable to prosecute suspect,POINT (446517.024 410962.975)
507aef1dcc104d4c02d6dc1fccc305eea596477b665722986f955e0a574e4343,2025-03,West Yorkshire Police,West Yorkshire Police,On or near Parking Area,E01011869,Wakefield 043B,Public order,Awaiting court outcome,,Suspect charged,POINT (446779.006 411205.004)
ad2a3d4463dcecd01a77ed9cb0a4b131fd921efaa7974e25d37e67b55dc1bbef,2025-03,West Yorkshire Police,West Yorkshire Police,On or near Holmsley Avenue,E01011864,Wakefield 045A,Public order,Unable to prosecute suspect,,Unable to prosecute suspect,POINT (444280.028 410671.995)


In [110]:
# Ensure you download at least one of these from ONS
# e.g. https://geoportal.statistics.gov.uk/datasets/ons::lower-layer-super-output-areas-december-2021-boundaries-ew-bsc-v4-2/about
# Keys match the B?? code embedded in each file name (BFE / BGC / BSC).
lsoa_boundary_files = {
    "FE": "Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BFE_V10_-3435351624505741073.zip",
    "GC": "Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BGC_V5_4492169359079898015.zip",
    "SC": "Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BSC_V4_-5236167991066794441.zip",
}

# Index by LSOA code so each boundary row is keyed by "LSOA21CD".
lsoa_boundaries = gpd.read_file(f"./data/{lsoa_boundary_files['GC']}").set_index(
    "LSOA21CD"
)

# throw away any not in the bounding box defined by the crimes
# `union_all()` replaces the deprecated `unary_union` property, which emitted
# a warning each time this cell ran.
bbox = crime_data.geometry.union_all().envelope
lsoa_boundaries = lsoa_boundaries[lsoa_boundaries.geometry.intersects(bbox)]
lsoa_boundaries

  bbox = crime_data.geometry.unary_union.envelope


Unnamed: 0_level_0,LSOA21NM,LSOA21NMW,BNG_E,BNG_N,LAT,LONG,GlobalID,geometry
LSOA21CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
E01005327,Oldham 017A,,390193,405056,53.5421,-2.14946,1513f820-ce1c-4b13-be4c-69dea8325140,"POLYGON ((390535.153 405469.056, 390539.371 40..."
E01005328,Oldham 017B,,390798,405015,53.5417,-2.14033,5ab640be-3dc1-475c-a36a-58701b77be95,"POLYGON ((391018.814 405283.19, 391198.975 405..."
E01005330,Oldham 015B,,389308,404650,53.5384,-2.16280,d686e513-28b2-4a5e-956d-3bef27e572c2,"POLYGON ((389312.316 405904.622, 389557.81 405..."
E01005334,Oldham 017D,,390085,405867,53.5494,-2.15111,20d9b57e-b781-4b60-b471-30b8326b8e0a,"POLYGON ((389771.442 406206.073, 389773.407 40..."
E01005336,Oldham 015D,,389820,406376,53.5539,-2.15513,42d587ae-3fbe-4044-9239-5be6601bfd55,"POLYGON ((390018.848 406631.078, 390044.068 40..."
...,...,...,...,...,...,...,...,...
E01035050,Leeds 105G,,430214,426790,53.7366,-1.54345,4f0cba78-bfe1-4843-ab4b-bb44e691f99e,"POLYGON ((430737.415 427501.249, 430760.513 42..."
E01035051,Leeds 105H,,431335,427174,53.7400,-1.52641,2de24ca1-ac16-44b0-a316-24ba49dba14c,"POLYGON ((431229.959 427511.255, 431264.502 42..."
E01035052,Leeds 105I,,431014,426532,53.7343,-1.53135,1d808faa-55df-40ac-8dbd-50144f26e717,"POLYGON ((431312.508 427066, 431268.346 426814..."
E01035053,Leeds 105J,,432010,426253,53.7317,-1.51628,68610023-f8dc-40af-95ef-d069556a4516,"POLYGON ((432214.264 427106.239, 432235.065 42..."


In [126]:
# create the KDE
# Scalar bandwidth passed to gaussian_kde as `bw_method`; values greater than
# this produce samples that are too "spread out".
KDE_BANDWIDTH = 0.01

# gaussian_kde takes an array of shape (n_dims, n_points). Row 0 holds the y
# coordinates and row 1 the x coordinates, so anything resampled from the
# kernel comes back in the same (y, x) order.
crime_points = crime_data.geometry
points = np.vstack([crime_points.y.to_numpy(), crime_points.x.to_numpy()])
kernel = gaussian_kde(points, bw_method=KDE_BANDWIDTH)

In [127]:
# Create synthetic data
n_crimes = 5000

# Draw coordinates from the KDE. The kernel was fitted on (y, x) rows, so the
# resampled array unpacks in that order. The draw is seeded from the
# notebook-level generator.
kde_seed = rng.integers(2**32)
y, x = kernel.resample(n_crimes, seed=kde_seed)

# Attribute values shared by every synthetic record; each scalar is broadcast
# across the index.
synth_attributes = {
    "Month": None,
    "Reported by": "Officer Alex James Murphy",
    "Falls within": "West Yorkshire Police",
    "Location": None,
    "Crime type": CATEGORY,
}
synth_crime = gpd.GeoDataFrame(
    data=synth_attributes,
    index=range(n_crimes),
    geometry=gpd.points_from_xy(x, y),
    crs="epsg:27700",
)

# spatial join to get LSOA from point
# NOTE(review): renaming "LSOA21CD" only takes effect if sjoin emits the
# right-hand index under that name - confirm with the installed geopandas.
lsoa_lookup = lsoa_boundaries[["LSOA21NM", "geometry"]]
lsoa_column_names = {"LSOA21CD": "LSOA code", "LSOA21NM": "LSOA name"}
synth_crime = gpd.sjoin(synth_crime, lsoa_lookup, how="left").rename(
    columns=lsoa_column_names
)

In [128]:
# Overlay real crimes on the sampled crimes: as many real crimes as synthetic
# ones, or every real crime when fewer than n_crimes are available (sampling
# without replacement cannot return more rows than exist).
n_real = min(n_crimes, len(crime_data))

# Seed the sample from the notebook-level generator so a full re-run shows the
# same real crimes.
real_crime_sample = crime_data.sample(n_real, replace=False, random_state=rng)

# Named `crime_map` rather than `map` to avoid shadowing the Python builtin.
# Synthetic crimes are drawn in red; real crimes use the default colour.
crime_map = synth_crime.explore(color="red")
crime_map = real_crime_sample.explore(m=crime_map)

crime_map