# KDE Sampling

police.uk data has high (though possibly perturbed?) spatial resolution, but very poor temporal resolution (month of occurrence only) and only broad categorical resolution:

- Other theft
- Violence and sexual offences
- Bicycle theft
- Criminal damage and arson
- Public order
- Robbery
- Shoplifting
- Vehicle crime
- Drugs
- Possession of weapons
- Theft from the person
- Burglary
- Other crime

This notebook creates a Kernel Density Estimate (KDE) over an entire force area, for one of the above crime categories, using three years of data.

Synthetic crimes can then be sampled from this spatial distribution.


In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from functools import partial

import geopandas as gpd
import numpy as np
from scipy.stats import gaussian_kde

from utils import CATEGORIES, extract_crime_data, get_lsoa_boundaries

In [None]:
# Seeded generator shared by the stochastic steps further down the notebook.
rng = np.random.default_rng(19937)

# Crime category to model, picked by position from the project's category list.
CATEGORY = CATEGORIES[2]

# Load the crime extract and keep only the rows belonging to CATEGORY.
crime_data = extract_crime_data("./data/wy202204-202503.zip").loc[
    lambda frame: frame["Crime type"] == CATEGORY
]
crime_data

Unnamed: 0_level_0,Month,Reported by,Falls within,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context,Outcome type,geometry
Crime ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
298e807de56cd30acd92aeaae0530b24252a2ba0fb5041a03e2828d1b8e2c841,2022-04,West Yorkshire Police,West Yorkshire Police,On or near Wheatley Road,E01010691,Bradford 002A,Possession of weapons,Formal action is not in the public interest,,Formal action is not in the public interest,POINT (412405.002 447323.97)
b4cdb65818f3c72c99e878bc1cf9dd12105ac362c08f20a8f49cf43b700494e5,2022-04,West Yorkshire Police,West Yorkshire Police,On or near Wheatley Road,E01010691,Bradford 002A,Possession of weapons,Offender given a caution,,Offender given a caution,POINT (412405.002 447323.97)
c1b07eecc81cce4f3d874cd085bf5a18fb9ad82abb066743b3401e1facab4a58,2022-04,West Yorkshire Police,West Yorkshire Police,On or near Pattie Street,E01010728,Bradford 007C,Possession of weapons,Unable to prosecute suspect,,Unable to prosecute suspect,POINT (405925.016 442351.98)
8fb8e069000240559159390651104a297b75d6234a6c120b94ac046cef00ecc2,2022-04,West Yorkshire Police,West Yorkshire Police,On or near Greenthwaite Close,E01010729,Bradford 007D,Possession of weapons,Offender given a caution,,Offender given a caution,POINT (405385.017 442041.03)
3138d50e8e27c26500c72ba5d1700d09a091746ac4f5a9d132f3b55338298387,2022-04,West Yorkshire Police,West Yorkshire Police,On or near East Parade,E01010710,Bradford 008G,Possession of weapons,Court result unavailable,,Suspect charged,POINT (406401.971 441195.033)
...,...,...,...,...,...,...,...,...,...,...,...
e64c0683ab4d06814a96805a4cb4681874aeb9d912d89ba3ac63db399c5d6038,2025-03,West Yorkshire Police,West Yorkshire Police,On or near Nelson Street,E01011819,Wakefield 016E,Possession of weapons,Offender given a caution,,Offender given a caution,POINT (439161.033 423080.032)
6837eb184f8b046caadee5e771638f6b948972063ec2b94b59896382d5e4d710,2025-03,West Yorkshire Police,West Yorkshire Police,On or near Jacob'S Well Lane,E01011913,Wakefield 017C,Possession of weapons,Investigation complete; no suspect identified,,Investigation complete; no suspect identified,POINT (433603.018 421240.971)
678e9eac568c6bd03e48cc61cea462184332d04d7fc726ce95b1bb3a4b2e4f29,2025-03,West Yorkshire Police,West Yorkshire Police,On or near Frain Close,E01011840,Wakefield 018A,Possession of weapons,Awaiting court outcome,,Suspect charged,POINT (446014.03 421497.982)
b1696b6815931851c0f41568cd98c6b93b8f50bbb9c4e8b30b64e01b06e39fc5,2025-03,West Yorkshire Police,West Yorkshire Police,On or near Park Lodge Crescent,E01011915,Wakefield 020E,Possession of weapons,Local resolution,,Local resolution,POINT (434196.977 420839.026)


In [9]:
# Fetch the LSOA boundary polygons, passing the filtered crimes as `overlapping`
# (presumably this restricts the result to LSOAs overlapping those points —
# confirm against utils.get_lsoa_boundaries).
# NOTE(review): "FE" looks like a boundary-resolution code — confirm in utils.
lsoa_boundaries = get_lsoa_boundaries("FE", overlapping=crime_data)
lsoa_boundaries

Unnamed: 0_level_0,LSOA21NM,LSOA21NMW,BNG_E,BNG_N,LAT,LONG,GlobalID,geometry
LSOA21CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
E01010568,Bradford 016A,,416258,438952,53.8466,-1.75438,137e4959-a6d1-40d7-b2a5-563c9ff4af20,"POLYGON ((416422.17 439366.289, 416417.848 439..."
E01010569,Bradford 016B,,414693,438453,53.8421,-1.77819,1f4e6cbc-75a9-4ffa-b851-0792627c632e,"POLYGON ((415369.043 439212.461, 415368.435 43..."
E01010571,Bradford 016C,,415223,438398,53.8416,-1.77013,1894828c-0aae-4ded-8e36-77a62ab8c4b4,"POLYGON ((415768.621 438654.729, 415766.728 43..."
E01010572,Bradford 016D,,415428,438857,53.8457,-1.76700,9a46128a-3965-45d7-b06f-e6f2af85bbb1,"POLYGON ((415616.807 439194.596, 415622.818 43..."
E01010573,Bradford 013A,,414123,440131,53.8572,-1.78677,25d2a550-da16-4b04-99ed-16404376537c,"POLYGON ((414068.435 441336.988, 414070 441336..."
...,...,...,...,...,...,...,...,...
E01035048,Leeds 099G,,430979,428149,53.7488,-1.53171,3402834e-dade-45d4-b600-835a4a7d8bea,"POLYGON ((431362 428526, 431366 428520, 431369..."
E01035050,Leeds 105G,,430214,426790,53.7366,-1.54345,607700e7-4a88-4736-9f01-e26e4347dc5d,"POLYGON ((430986.896 427756.226, 430982.739 42..."
E01035051,Leeds 105H,,431335,427174,53.7400,-1.52641,96961ba7-331b-4410-ac81-6d58fc3cbe31,"POLYGON ((431044.305 427760.241, 431050.502 42..."
E01035052,Leeds 105I,,431014,426532,53.7343,-1.53135,9cd4244a-8d53-4af2-8362-1f9898de0a65,"POLYGON ((431312.508 427066, 431277.221 426958..."


In [33]:
# create the KDE
def kde_bandwidth(obj, *, fac: float = 0.2) -> float:
    """Return a scaled Scott's Rule bandwidth factor.

    Scott's Rule gives ``n ** (-1 / (d + 4))``; the result is multiplied by
    ``fac`` to narrow (``fac < 1``) or widen (``fac > 1``) the kernel.

    Args:
        obj: Any object exposing ``n`` (number of data points) and ``d``
            (number of dimensions), e.g. the estimator that invokes this
            function as its bandwidth callable.
        fac: Constant multiplier applied to the Scott's Rule factor.

    Returns:
        The scaled bandwidth factor.
    """
    scott_exponent = -1.0 / (obj.d + 4)
    return np.power(obj.n, scott_exponent) * fac


# Build the (2, n) coordinate array the KDE expects, with y in the first row
# and x in the second; the resampling step unpacks its result in that same
# (y, x) order. The vectorised GeoSeries accessors avoid a Python-level call
# per point.
points = np.vstack([crime_data.geometry.y, crime_data.geometry.x])

# Bandwidth: Scott's Rule scaled down by a constant factor (see kde_bandwidth).
# A fixed scalar bandwidth of 0.01 was tried previously; values greater than
# that produced samples that were too "spread out".
kde = gaussian_kde(points, bw_method=partial(kde_bandwidth, fac=0.2))

In [41]:
# Create synthetic data: draw as many synthetic crimes as there are real ones,
# capped at 10,000.
n_crimes = min(len(crime_data), 10000)

# The KDE was fitted on (y, x) rows, so the samples come back in that order.
# The integer seed is drawn from the notebook's seeded generator, which keeps
# the draw repeatable on a fresh run.
y, x = kde.resample(n_crimes, seed=rng.integers(2**32))
synth_crime = gpd.GeoDataFrame(
    index=range(n_crimes),
    data={
        "Month": None,
        "Reported by": "Officer Alex James Murphy",
        "Falls within": "West Yorkshire Police",
        "Location": None,
        "Crime type": CATEGORY,
    },
    geometry=gpd.points_from_xy(x, y),
    crs="epsg:27700",
)
# Spatial join to get the LSOA for each point. The LSOA code is the index of
# lsoa_boundaries; depending on the geopandas version, sjoin returns it either
# as "index_right" or under the index's own name ("LSOA21CD"), so both
# spellings are mapped to "LSOA code" (rename ignores keys that are absent).
synth_crime = gpd.sjoin(
    synth_crime, lsoa_boundaries[["LSOA21NM", "geometry"]], how="left"
).rename(
    columns={
        "index_right": "LSOA code",
        "LSOA21CD": "LSOA code",
        "LSOA21NM": "LSOA name",
    }
)

In [42]:
# Overlay the same number of real crimes on the sampled crimes: synthetic
# points are drawn in red, real points use the default styling.
# The map is bound to `crime_map` rather than `map` so the Python builtin is
# not shadowed for the rest of the kernel session, and the real-crime subsample
# uses the notebook's seeded generator so the overlay is repeatable.
crime_map = synth_crime.explore(color="red")
crime_map = crime_data.sample(n_crimes, replace=False, random_state=rng).explore(
    m=crime_map
)

crime_map

In [None]:
#