In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import box
from pathlib import Path

In [None]:
# Load the cleaned crime data from Parquet into a GeoDataFrame.
# The bare `gdf.crs` on the last line displays the coordinate reference system,
# which determines the units used for the grid cells built later.
gdf = gpd.read_parquet("cleaned_crime_data.parquet")
gdf.crs

<Projected CRS: EPSG:27700>
Name: OSGB36 / British National Grid
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: United Kingdom (UK) - offshore to boundary of UKCS within 49°45'N to 61°N and 9°W to 2°E; onshore Great Britain (England, Wales and Scotland). Isle of Man onshore.
- bounds: (-9.01, 49.75, 2.01, 61.01)
Coordinate Operation:
- name: British National Grid
- method: Transverse Mercator
Datum: Ordnance Survey of Great Britain 1936
- Ellipsoid: Airy 1830
- Prime Meridian: Greenwich

In [49]:
# Preview the first rows, then report (n_rows, n_columns).
# Only the final expression of a cell is rendered automatically, so the preview
# must be passed to display() explicitly; otherwise its result is discarded.
display(gdf.head())
gdf.shape

(915850, 5)

In [48]:
grid_size = 200  # edge length of one square grid cell, in metres (the data's CRS, EPSG:27700, uses metre axes)

In [50]:
# Bounding box of every geometry in `gdf`, in CRS units.
extent = gdf.total_bounds
minx, miny, maxx, maxy = extent
(minx, miny, maxx, maxy)

(np.float64(90689.0351068792),
 np.float64(52643.034682033),
 np.float64(652943.9749586208),
 np.float64(649146.9619947528))

In [52]:
# Lower-left corner coordinates of the cells: one value every `grid_size`
# units along each axis, starting from the minimum of the data's bounding box.
x_cord = np.arange(minx, maxx, grid_size)
y_cord = np.arange(miny, maxy, grid_size)

# One square per (x, y) corner, generated column by column (x outer, y inner).
polygons = [
    box(west, south, west + grid_size, south + grid_size)
    for west in x_cord
    for south in y_cord
]

# The grid shares the CRS of the crime data so the two can be joined spatially.
grid = gpd.GeoDataFrame({'geometry': polygons}, crs=gdf.crs)
grid.head()


Unnamed: 0,geometry
0,"POLYGON ((90889.035 52643.035, 90889.035 52843..."
1,"POLYGON ((90889.035 52843.035, 90889.035 53043..."
2,"POLYGON ((90889.035 53043.035, 90889.035 53243..."
3,"POLYGON ((90889.035 53243.035, 90889.035 53443..."
4,"POLYGON ((90889.035 53443.035, 90889.035 53643..."


In [56]:
# Tag each crime point with the index of the grid cell it lies within
# (column `index_right`); the left join keeps points that match no cell.
joined = gpd.sjoin(
    left_df=gdf,
    right_df=grid,
    predicate='within',
    how='left',
)

In [57]:
# Per-cell totals: the sum of the crime weights and the number of non-null
# weights, keyed by the grid cell index produced by the spatial join.
weight_by_cell = joined.groupby("index_right")["Weight"]
risk_scores = weight_by_cell.agg(Risk_Score="sum", Crime_Count="count").reset_index()

In [58]:
# Attach the per-cell scores to the grid, keeping the grid's own index.
#
# The scores are re-indexed by cell id and joined index-to-index. A merge with
# `left_index=True, right_on="index_right"` takes the result index from
# `risk_scores` instead (NaN for cells without any crime), so afterwards the
# grid rows can no longer be addressed by their cell id.
# The key is cast to the grid's index dtype because the left spatial join can
# leave `index_right` as float; groupby has already dropped the NaN keys.
cell_scores = (
    risk_scores
    .astype({"index_right": grid.index.dtype})
    .set_index("index_right")
)
grid = grid.join(cell_scores, how="left")

# Cells that contain no crimes have no row in `risk_scores`; score them as 0.
grid["Risk_Score"] = grid["Risk_Score"].fillna(0)
grid["Crime_Count"] = grid["Crime_Count"].fillna(0)


In [None]:
# Summary statistics for the two score columns.
# The Metropolitan Police data extends beyond the Greater London boundary,
# so the grid will be clipped to the Greater London bounds.
grid.loc[:, ["Risk_Score", "Crime_Count"]].describe()

Unnamed: 0,Risk_Score,Crime_Count
count,8388196.0,8388196.0
mean,0.692868,0.109183
std,30.77656,4.827439
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,21850.0,3240.0


In [64]:
# Discard the first, full-extent grid; the name `grid` is reassigned later
# to a grid covering only the London bounds.
del grid

In [65]:
# Rectangular extent used for the London grid, as eastings (x) and northings (y).
# NOTE(review): the source of these coordinates is not recorded here; confirm
# they are the intended Greater London bounds in EPSG:27700.
ldn_bounds = dict(zip(
    ("minx", "miny", "maxx", "maxy"),  # min easting, min northing, max easting, max northing
    (503571.5, 155854.3, 561957.4962, 200933.6227),
))


In [67]:
# Lower-left corner coordinates of every cell inside the London extent.
x_lo, x_hi = ldn_bounds["minx"], ldn_bounds["maxx"]
y_lo, y_hi = ldn_bounds["miny"], ldn_bounds["maxy"]
newx_coords = np.arange(x_lo, x_hi, grid_size)
newy_coords = np.arange(y_lo, y_hi, grid_size)

# Build the squares column by column (x outer, y inner) so cell order is stable.
cells = []
for west in newx_coords:
    cells.extend(
        box(west, south, west + grid_size, south + grid_size)
        for south in newy_coords
    )

grid = gpd.GeoDataFrame(geometry=cells, crs="EPSG:27700")

In [68]:
grid.shape[0]  # number of cells in the London grid

65992

In [69]:
# Keep only the crime data inside the London extent. `.cx` selects by a
# rectangular coordinate window, i.e. the bounding box in `ldn_bounds`.
easting_window = slice(ldn_bounds["minx"], ldn_bounds["maxx"])
northing_window = slice(ldn_bounds["miny"], ldn_bounds["maxy"])
gdf = gdf.cx[easting_window, northing_window]

In [70]:
# Spatial join: tag every crime point with the grid cell it falls in
# (column `index_right`); the left join keeps points that match no cell.
crime_points = gdf.assign(_point_id=np.arange(len(gdf)))
joined = gpd.sjoin(crime_points, grid, how='left', predicate='intersects')

# 'intersects' also matches a point lying exactly on a shared cell edge or
# corner to every touching cell, yielding one output row per cell. Keep a
# single row per point so that no crime is counted in more than one cell.
joined = joined.drop_duplicates(subset="_point_id").drop(columns="_point_id")

# Number of crime points that were assigned to a cell.
joined["index_right"].notna().sum()

np.int64(913878)

In [71]:
# Aggregate risk scores: per grid cell, the sum of the crime weights and the
# number of non-null weights.
per_cell = joined.groupby("index_right")
risk_scores = per_cell.agg(
    Risk_Score=pd.NamedAgg(column="Weight", aggfunc="sum"),
    Crime_Count=pd.NamedAgg(column="Weight", aggfunc="count"),
)
risk_scores = risk_scores.reset_index()

In [72]:
# Sanity check: each grid cell id must appear at most once in `risk_scores`,
# otherwise attaching the scores to the grid would duplicate grid rows.
risk_scores["index_right"].is_unique

True

In [73]:
# Attach the per-cell scores to the grid, keeping the grid's own index.
#
# The scores are re-indexed by cell id and joined index-to-index. A merge with
# `left_index=True, right_on="index_right"` takes the result index from
# `risk_scores` instead (NaN for cells without any crime), so the grid would
# lose its cell-id index and carry the wrong index from here on.
# The key is cast to the grid's index dtype because the left spatial join can
# leave `index_right` as float; groupby has already dropped the NaN keys.
cell_scores = (
    risk_scores
    .astype({"index_right": grid.index.dtype})
    .set_index("index_right")
)
grid = grid.join(cell_scores, how="left")

# Cells that contain no crimes have no row in `risk_scores`; score them as 0.
grid["Risk_Score"] = grid["Risk_Score"].fillna(0)
grid["Crime_Count"] = grid["Crime_Count"].fillna(0)


In [82]:
# Summary statistics of the per-cell scores for the London grid.
grid.loc[:, ["Risk_Score", "Crime_Count"]].describe()


Unnamed: 0,Risk_Score,Crime_Count
count,65992.0,65992.0
mean,87.793505,13.848315
std,337.761503,52.773668
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,58.0,9.0
max,22907.0,3221.0


In [75]:
# Log-transform the risk scores to reduce skewness: crime risk is highly
# skewed because most grid cells have zero or low scores while a few have very
# high ones. log(1 + x) compresses the extreme values, makes the distribution
# more normal-like, and maps a score of 0 to exactly 0.
# np.log1p is vectorised, so it is applied to the whole column at once rather
# than through a per-row Python lambda.
grid["Log_Risk_Score"] = np.log1p(grid["Risk_Score"])

In [80]:
# Write the scored grid (geometry plus the score columns) to Parquet.
grid.to_parquet("risk_grid.parquet")