In [1]:
import polars as pl
import json
import geopandas as gpd

In [2]:
# Read both parquet inputs in one pass: the processed records first, then the
# rows stored in the invalid-coordinate file.
data, invalid = (
    pl.read_parquet(path)
    for path in ("data/processed.parquet", "data/invalid_coordinate.parquet")
)

In [3]:
# Preview the first five rows (five is also what head() returns by default).
data.head(n=5)

borough,zip_code,latitude,longitude,on_street_name,cross_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,vehicle_type_code_1,vehicle_type_code_2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5,date,year,month,time,hour,number_of_casualty
str,i64,f64,f64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,date,i32,i8,time,i8,i64
,,,,"""WHITESTONE EXPRESSWAY""","""20 AVENUE""",,2,0,0,0,0,0,2,0,"""Aggressive Driving/Road Rage""","""Unspecified""",,,,"""Sedan""","""Sedan""",,,,2021-09-11,2021,9,02:39:00,2,2
,,,,"""QUEENSBORO BRIDGE UPPER""",,,1,0,0,0,0,0,1,0,"""Pavement Slippery""",,,,,"""Sedan""",,,,,2022-03-26,2022,3,11:45:00,11,1
,,,,"""THROGS NECK BRIDGE""",,,0,0,0,0,0,0,0,0,"""Following Too Closely""","""Unspecified""",,,,"""Sedan""","""Pick-up Truck""",,,,2022-06-29,2022,6,06:55:00,6,0
"""BROOKLYN""",11208.0,40.667202,-73.8665,,,"""1211 LORING AVENUE""",0,0,0,0,0,0,0,0,"""Unspecified""",,,,,"""Sedan""",,,,,2021-09-11,2021,9,09:35:00,9,0
"""BROOKLYN""",11233.0,40.683304,-73.917274,"""SARATOGA AVENUE""","""DECATUR STREET""",,0,0,0,0,0,0,0,0,,,,,,,,,,,2021-12-14,2021,12,08:13:00,8,0


In [4]:
# Keep only the rows where both coordinates are present: a row is dropped if
# either latitude or longitude is null.
marked = data.drop_nulls(subset=["latitude", "longitude"])

In [5]:
# Grouping keys: one metrics table is produced (and saved) per key.
by = ["borough", "year"]
# Count columns that are summed within each group.
on = [
    "number_of_persons_injured",
    "number_of_persons_killed",
    "number_of_casualty",
]
for col in by:
    # Rows with a null grouping key are excluded before aggregating, then the
    # per-group sums are ordered by the key itself.
    metrics = (
        data.drop_nulls(subset=[col])
        .group_by(col)
        .agg(pl.col(on).sum())
        .sort(col)
    )
    print(metrics.head())
    metrics.write_parquet(f"data/metrics_{col}.parquet")

shape: (5, 4)
┌───────────────┬───────────────────────────┬──────────────────────────┬────────────────────┐
│ borough       ┆ number_of_persons_injured ┆ number_of_persons_killed ┆ number_of_casualty │
│ ---           ┆ ---                       ┆ ---                      ┆ ---                │
│ str           ┆ i64                       ┆ i64                      ┆ i64                │
╞═══════════════╪═══════════════════════════╪══════════════════════════╪════════════════════╡
│ BRONX         ┆ 72466                     ┆ 291                      ┆ 72757              │
│ BROOKLYN      ┆ 157539                    ┆ 648                      ┆ 158187             │
│ MANHATTAN     ┆ 70563                     ┆ 351                      ┆ 70914              │
│ QUEENS        ┆ 120793                    ┆ 545                      ┆ 121338             │
│ STATEN ISLAND ┆ 17809                     ┆ 97                       ┆ 17906              │
└───────────────┴───────────────────────────┴─

In [6]:
# Row counts reused by several entries of the summary below.
total_rows = data.height
marked_rows = marked.height
invalid_rows = invalid.height

# Number of rows in which every one of these columns is non-null.
complete_rows = data.drop_nulls(
    subset=["date", "borough", "time", "latitude", "longitude"]
).height

# Headline figures for the dataset; key order is preserved because it is the
# order in which the entries are later serialised.
row = {
    "Locations marked": marked_rows,
    "Locations unmarked": total_rows - marked_rows,
    "Valid markings": marked_rows - invalid_rows,
    "Invalid markings": invalid_rows,
    "Total number of crashes": total_rows,
    "Crashes with time, location": complete_rows,
    "Total killed": data.get_column("number_of_persons_killed").sum(),
    "Total injured": data.get_column("number_of_persons_injured").sum(),
}

In [7]:
# Show the summary dictionary as the cell output.
row

{'Locations marked': 1873166,
 'Locations unmarked': 252597,
 'Valid markings': 1869560,
 'Invalid markings': 3606,
 'Total number of crashes': 2125763,
 'Crashes with time, location': 1414687,
 'Total killed': 3249,
 'Total injured': 673440}

In [8]:
# Serialise the summary dictionary to a JSON string and write it out in one go.
with open("data/metrics.json", mode="w") as fp:
    fp.write(json.dumps(row))

In [9]:
# Load the heatmap table with geopandas and display it as the cell output.
heatmap_path = "data/heatmap.parquet"
nyc = gpd.read_parquet(heatmap_path)
nyc

Unnamed: 0,code,length,area,geometry,borough,number_of_persons_injured,number_of_persons_killed,number_of_casualty,number_of_crash
0,5,330470.010332,1623820000.0,"MULTIPOLYGON (((-74.05051 40.56642, -74.05047 ...",staten island,17809,97,17906,61409
1,4,896344.047763,3045213000.0,"MULTIPOLYGON (((-73.83668 40.59495, -73.83678 ...",queens,120793,545,121338,392867
2,3,741080.523166,1937479000.0,"MULTIPOLYGON (((-73.86706 40.58209, -73.86769 ...",brooklyn,157539,648,158187,466928
3,1,359299.096471,636471500.0,"MULTIPOLYGON (((-74.01093 40.68449, -74.01193 ...",manhattan,70563,351,70914,326658
4,2,464392.991824,1186925000.0,"MULTIPOLYGON (((-73.89681 40.79581, -73.89694 ...",bronx,72466,291,72757,216786


In [11]:
# Drop the geometry and code columns, keeping the remaining columns
# (length, area, borough and the crash counts shown below).
per_unit = nyc.drop(columns=["geometry", "code"])
per_unit

Unnamed: 0,length,area,borough,number_of_persons_injured,number_of_persons_killed,number_of_casualty,number_of_crash
0,330470.010332,1623820000.0,staten island,17809,97,17906,61409
1,896344.047763,3045213000.0,queens,120793,545,121338,392867
2,741080.523166,1937479000.0,brooklyn,157539,648,158187,466928
3,359299.096471,636471500.0,manhattan,70563,351,70914,326658
4,464392.991824,1186925000.0,bronx,72466,291,72757,216786
