# Makeover Monday: Bike crashes in London

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_latimes as lat
import datetime as dt



In [3]:
# Widen pandas' display limits and turn off Altair's max-rows check.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", None)
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
# Date stamp (YYYY-MM-DD) for the day the notebook is run.
today = dt.date.today().isoformat()

---

In [5]:
# Workbook page:
# https://data.world/makeovermonday/2021w31/workspace/file?filename=Bike+Collisions.xlsx
workbook_url = "https://query.data.world/s/tfaspfre5yetduocupcyfbhdawcv6y"
src = pd.read_excel(workbook_url)

In [6]:
# Normalise column names: lower-case, then strip literal spaces.
lowercased = src.columns.str.lower()
src.columns = lowercased.str.replace(" ", "", regex=False)

### Dates

In [7]:
# Derive calendar fields from the collision timestamp.
stamp = src["date"].dt
src["year"] = stamp.year.astype(str)  # stored as text, not a number
src["weekday"] = stamp.day_name()
src["monthname"] = stamp.month_name()
src["monthyear"] = stamp.strftime("%m-%Y")

### Times

In [8]:
# Calendar date (no time component) of each collision. The vectorised
# accessor replaces a Python-level loop over every timestamp.
src["date_clean"] = src["date"].dt.date

In [9]:
# Hour of day taken from the collision timestamp; the column is already
# datetime-typed (the `.dt` accessor is used on it above).
src["hour"] = src["date"].dt.hour

### Categorize the time of the incident

In [10]:
def categorize_hours(h):
    """Return the label of the time-of-day band that contains hour ``h``.

    Bands are closed on the right. Anything up to and including 4 (hour 0
    included) is "1-4am"; then come "5-8am", "9-noon", "1-4pm", "5-8pm" and
    "9-midnight", the last one ending at 24. A value that fits no band
    (above 24, or NaN) yields ``None``.
    """
    # (inclusive upper bound, label), checked in ascending order.
    bands = (
        (4, "1-4am"),
        (8, "5-8am"),
        (12, "9-noon"),
        (16, "1-4pm"),
        (20, "5-8pm"),
        (24, "9-midnight"),
    )
    for upper, label in bands:
        if h <= upper:
            return label
    return None

In [11]:
# Label every row with its time-of-day band.
src["time_period"] = src["hour"].map(categorize_hours)

### Just solo cyclist incidents

In [12]:
# Keep only the rows whose casualty type is "Cyclist". The explicit copy makes
# `src` an independent frame, so the in-place clean-up that follows does not
# operate on a slice of the unfiltered data.
is_cyclist = src["casualties"] == "Cyclist"
src = src.loc[is_cyclist].copy()

### Clean up

In [13]:
# Discard the link columns, the raw timestamp and the now-constant casualty type.
src = src.drop(columns=["url", "apiurl", "date", "casualties"])

In [14]:
# The cleaned date takes over the name of the dropped timestamp column.
src = src.rename(columns={"date_clean": "date"})

---

In [15]:
# Work on an independent (deep) copy of the cleaned source frame.
df = src.copy(deep=True)

In [16]:
# Number of collisions per severity level.
df["severity"].value_counts()

slight     51707
serious     7396
fatal        188
Name: severity, dtype: int64

### Convert to geodataframe with lat/lon

In [17]:
# Point geometry from the longitude/latitude columns, declared as EPSG:4326.
collision_points = gpd.points_from_xy(df["longitude"], df["latitude"])
df_geo = gpd.GeoDataFrame(df, geometry=collision_points, crs=4326)

In [18]:
# Reproject the collision points to EPSG:27700.
df_geo = df_geo.to_crs(epsg=27700)

---

### Years

In [19]:
# Total casualties per year.
years = df.groupby("year", as_index=False)["numberofcasualties"].sum()

### Wards

In [20]:
# Total casualties per ward and year.
ward_year = ["ward", "year"]
wards = df.groupby(ward_year, as_index=False)["numberofcasualties"].sum()

### Boroughs

In [21]:
# Total casualties per borough and year.
borough_year = ["borough", "year"]
boroughs = df.groupby(borough_year, as_index=False)["numberofcasualties"].sum()

---

## Geography

In [22]:
# Boundary files via https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london
# and https://data.london.gov.uk/dataset/inner-and-outer-london-boundaries-london-plan-consultation-2009

### Wards

In [23]:
# Ward boundaries. No `crs` keyword is passed: it is not a `read_file`
# parameter (extra keywords are forwarded to the I/O engine), and 4326 would
# contradict the EPSG:27700 that this frame is declared as right after loading.
wards_geo = gpd.read_file(
    "data/raw/London-wards-2018/London-wards-2018_ESRI/London_Ward.shp"
)

In [24]:
# Declare (not reproject) the ward boundaries as EPSG:27700. `set_crs` with
# `allow_override=True` states explicitly that any CRS read from the file is
# being replaced, instead of silently assigning over it.
wards_geo = wards_geo.set_crs(epsg=27700, allow_override=True)

In [25]:
# Lower-case every column name.
wards_geo.columns = [name.lower() for name in wards_geo.columns]

### Boroughs

In [26]:
# Borough boundaries shapefile.
boroughs_shp = (
    "data/raw/statistical-gis-boundaries-london/ESRI/London_Borough_Excluding_MHW.shp"
)
boroughs_geo = gpd.read_file(boroughs_shp)

In [27]:
# Declare (not reproject) the borough boundaries as EPSG:27700. `set_crs` with
# `allow_override=True` states explicitly that any CRS read from the file is
# being replaced, instead of silently assigning over it.
boroughs_geo = boroughs_geo.set_crs(epsg=27700, allow_override=True)

In [28]:
# Lower-case every column name.
boroughs_geo.columns = [name.lower() for name in boroughs_geo.columns]

### London city boundary

In [29]:
# Inner/outer London boundary shapefile.
inner_outer_shp = (
    "data/raw/inner-outer-london-shp/lp-consultation-oct-2009-inner-outer-london.shp"
)
london_boundaries_geo = gpd.read_file(inner_outer_shp)

In [30]:
# Lower-case every column name.
london_boundaries_geo.columns = [
    name.lower() for name in london_boundaries_geo.columns
]

In [31]:
# Declare (not reproject) the London boundaries as EPSG:27700. `set_crs` with
# `allow_override=True` states explicitly that any CRS read from the file is
# being replaced, instead of silently assigning over it.
london_boundaries_geo = london_boundaries_geo.set_crs(
    epsg=27700, allow_override=True
)

### Dissolve the inner and outer boundaries

In [32]:
# Merge all features that share a `source` value into one geometry each.
dissolve_field = "source"
london_geo = london_boundaries_geo.dissolve(by=dissolve_field)

### Inner vs. Outer

In [33]:
# Rows of the boundary file labelled "Inner London".
is_inner = london_boundaries_geo["boundary"] == "Inner London"
inner = london_boundaries_geo[is_inner]

In [34]:
# Rows of the boundary file labelled "Outer London".
is_outer = london_boundaries_geo["boundary"] == "Outer London"
outer = london_boundaries_geo[is_outer]

### Half-mile hex grid

In [35]:
# Pre-built hex grid used for binning the collisions.
hex_grid_path = "data/processed/london-halfmile.geojson"
hex_halfmile = gpd.read_file(hex_grid_path)

---

### Aggregate the collisions by hex bin

In [36]:
# Spatial join: pair each collision point with the hex cell it falls within,
# bringing the hex grid's columns onto the collision rows.
# NOTE(review): newer geopandas releases spell this keyword `predicate`
# rather than `op` — confirm against the installed version before upgrading.
# NOTE(review): assumes hex_halfmile is in the same CRS as df_geo
# (EPSG:27700) — TODO confirm.
merged = gpd.sjoin(df_geo, hex_halfmile, op="within")

In [37]:
# Count the non-null accident indexes per hex `id`; the result has the two
# columns `id` and `count`.
per_hex = merged.groupby("id")["accidentindex"].count()
merged_hex_geo = per_hex.reset_index(name="count")

In [38]:
# Five hex cells with the most collisions.
merged_hex_geo.sort_values(by="count", ascending=False).head(5)

Unnamed: 0,id,count
1297,2362,414
1247,2305,408
1250,2308,373
1343,2417,333
1201,2252,329


In [39]:
# Re-attach the hex grid's columns (including geometry) to the per-hex counts.
counts_with_shapes = merged_hex_geo.merge(hex_halfmile, on="id")
hex_map_counts = gpd.GeoDataFrame(counts_with_shapes)

---

## Exports

### London boundaries

In [40]:
# Write the dissolved London outline as GeoJSON.
london_geo.to_file(
    filename="data/processed/london_boundary_dissolved.geojson",
    driver="GeoJSON",
)

### Hex bins with counts

In [41]:
# Write the hex cells with their collision counts as GeoJSON.
hex_map_counts.to_file(
    filename="data/processed/hex_map_counts.geojson",
    driver="GeoJSON",
)

### Inner London boundary file

In [42]:
# Write the Inner London boundary as GeoJSON.
inner.to_file(
    filename="data/processed/inner_london.geojson",
    driver="GeoJSON",
)

### All the collisions

In [43]:
# Write the cleaned collision table as CSV, without the index column.
collisions_csv = "data/processed/bike-collisions.csv"
df.to_csv(collisions_csv, index=False)