# New York Motor Vehicle collisions

## Setup

In [47]:
# import required libraries
import requests
import pandas as pd
import numpy as np
import re
import seaborn as sns

## Data acquisition

### define functions to read data

In [48]:
# def read_api_chunk(api, limit=1000, offset=0):
#     source = f"{api}?${limit=}&${offset=}"
#     r = requests.get(source)
#     data = pd.DataFrame.from_dict(r.json()) # read directly from json instead? faster?
#     return data

# use https://docs.python.org/3/reference/expressions.html#yield-expressions


def read_api_chunk(api, limit=1000, offset=0):
    """Read a single chunk (page) of records from the api.

    The request URL is ``api`` followed by the query string
    ``?$limit=<limit>&$offset=<offset>``.

    Parameters
    ----------
    api : str
        Base URL of the JSON resource, without a query string.
    limit : int, default 1000
        Maximum number of records to request.
    offset : int, default 0
        Number of records to skip before the first returned record.

    Returns
    -------
    pandas.DataFrame
        The frame ``pd.read_json`` builds from the response.

    Raises
    ------
    TypeError, ValueError
        If ``limit`` or ``offset`` cannot be converted to ``int``.
    """
    # The former ``{limit=}`` f-string form renders the value with repr(), so a
    # str argument ("5" -> limit='5') or a NumPy 2 integer (-> limit=np.int64(5))
    # produced a malformed query. Coerce to plain ints and spell the keys out.
    limit, offset = int(limit), int(offset)
    return pd.read_json(f"{api}?$limit={limit}&$offset={offset}")


def read_api(api, size=1000, chunk_size=1000):
    """Read up to ``size`` records from the api, ``chunk_size`` records per request.

    Chunks are fetched one after another with ``read_api_chunk`` and
    concatenated. Fetching stops early as soon as a request returns fewer
    records than were asked for, because that means the source has no more
    records to offer.

    Parameters
    ----------
    api : str
        Base URL passed through to ``read_api_chunk``.
    size : int, default 1000
        Total number of records wanted.
    chunk_size : int, default 1000
        Maximum number of records per request; must be positive.

    Returns
    -------
    pandas.DataFrame
        All fetched chunks stacked with ``pd.concat``. Each chunk keeps the row
        labels it was read with (they are not renumbered). An empty DataFrame
        is returned when nothing was fetched (e.g. ``size <= 0``).

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # NOTE(review): consecutive pages are requested without an explicit sort
    # order; confirm that the api returns rows in a stable order across requests.
    chunks = []
    for offset in range(0, size, chunk_size):
        # the last chunk might be smaller than chunk_size
        requested = min(chunk_size, size - offset)
        chunk = read_api_chunk(api, limit=requested, offset=offset)
        if len(chunk):
            chunks.append(chunk)
        if len(chunk) < requested:
            # a short (or empty) page means there is nothing left to read
            break

    if not chunks:
        # pd.concat raises on an empty collection, so return an empty frame instead
        return pd.DataFrame()
    return pd.concat(chunks)

### inputs

In [49]:
# input parameters
# endpoint handed to the reader functions
api = "https://data.cityofnewyork.us/resource/h9gi-nx95.json"
# presumably the total number of rows and the rows per request - TODO confirm usage
size, limit = 12_000, 1_000

### read data

In [50]:
# fetch the raw records with one single request of 15000 rows
data_raw = read_api_chunk(api=api, limit=15000)

In [51]:
# initial cleaning of data
# rename the first two vehicle type columns so that they start with the
# "vehicle_type_code_" stub, then index by collision_id (kept as a column too)
data = data_raw.rename(
    columns={
        "vehicle_type_code1": "vehicle_type_code_1",
        "vehicle_type_code2": "vehicle_type_code_2",
    }
).set_index(keys="collision_id", drop=False)

# every street / contributing factor column is stored with the pandas "string" dtype
text_cols = [
    name for name in data.columns if re.search("(street|contributing_factor)", name)
]
data[text_cols] = data[text_cols].astype("string")

In [52]:
# reshape so that the numbered vehicle columns collapse into one column each:
# the result has one row per (collision_id, vehicle_no)
data_wide = pd.wide_to_long(
    data,
    stubnames=["vehicle_type_code_", "contributing_factor_vehicle_"],
    i="collision_id",
    j="vehicle_no",
).rename(
    # the reshaped columns are named after the stubs; drop the trailing underscore
    columns={
        "vehicle_type_code_": "vehicle_type_code",
        "contributing_factor_vehicle_": "contributing_factor_vehicle",
    }
)

# keep rows for vehicle no. > 1 only if relevant information pertaining to the
# vehicle is present; the row is redundant otherwise
_cnd1 = (
    data_wide[["vehicle_type_code", "contributing_factor_vehicle"]]
    .notna()
    .any(axis=1)
)
_cnd2 = data_wide.index.get_level_values(level=1) == 1
_cnd = _cnd1 | _cnd2
# take an explicit copy: later cells add columns to data_wide, which raises
# SettingWithCopyWarning when the frame is only a selection of another frame
data_wide = data_wide.loc[_cnd, :].copy()
data_wide.to_csv("data_wide.csv")

#### Contributing factors

In [53]:
# lookup table for the contributing factor codes: one row per distinct,
# non-missing contributing factor, in order of first appearance
confac = (
    data_wide["contributing_factor_vehicle"]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
    .rename("contributing_factor")
    .to_frame()
)


def get_first_chars(input):
    """Build an upper-case acronym from the first character of each word.

    Words are the maximal runs of regex word characters in ``input``;
    everything else (spaces, slashes, hyphens, ...) acts as a separator.

    Parameters
    ----------
    input : str
        Text to abbreviate.

    Returns
    -------
    str
        The upper-cased first characters of all words, joined together;
        the empty string if ``input`` contains no word.
    """
    # raw string: a backslash-w inside a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions
    return "".join(word[0].upper() for word in re.findall(r"\w+", input))


# acronym of every contributing factor
confac["cf"] = confac["contributing_factor"].apply(get_first_chars)
# running number (starting at 0) of each factor within its acronym
confac["n"] = confac.groupby(["cf"]).cumcount()
# repeated acronyms get their running number appended to make the codes distinct
k = confac["n"].gt(0)
confac.loc[k, "cf"] = confac.loc[k, "cf"] + confac.loc[k, "n"].astype("string")
confac = confac.set_index("contributing_factor")
# dummy column names ("cf.<code>") and the factor -> code dictionary
confac_cols = "cf." + confac["cf"]
mapping_cf = confac["cf"].to_dict()

In [54]:
# determine dummies grouped by collision_id
# translate every factor to its code; mapping_cf is built from the distinct
# non-missing factors of this very column, so only missing factors stay missing
data_wide["cf"] = data_wide["contributing_factor_vehicle"].map(mapping_cf)
dummies_long = pd.get_dummies(data_wide, columns=["cf"], prefix_sep=".")
# a factor flag is set for a collision if any of its vehicles carries the factor
dummies = dummies_long[confac_cols].groupby(level=0).max()
# number of vehicles = number of vehicle rows per collision in data_wide.
# Summing the flags instead would count *distinct factors*: two vehicles that
# share the same factor would be reported as a single vehicle.
dummies["n_vehicles"] = data_wide.groupby(level=0).size()
dummies.head()

Unnamed: 0_level_0,cf.ADRR,cf.PS,cf.FTC,cf.U,cf.PTC,cf.DI,cf.POLUI,cf.TI,cf.ULC,cf.US,...,cf.CPHF,cf.LMII,cf.CPHH,cf.UOBND,cf.OED,cf.TCDINW,cf.THD,cf.WI,cf.VV,n_vehicles
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4136992,False,False,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,2
4277087,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2
4345591,False,False,True,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,2
4388940,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
4395664,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1


#### vehicle types

In [110]:
# vehicle type codes with missing entries removed, stored with the "string" dtype
vehicle_type_codes = data_wide["vehicle_type_code"].dropna().astype("string")

In [111]:
# limit number of vehicle categories
# https://stackoverflow.com/questions/56440580/how-to-get-dummies-of-only-those-values-that-occur-more-than-x-time-in-pandas
# the 30 most frequent raw vehicle type codes
v_cats = data_wide["vehicle_type_code"].value_counts().head(30)
# normalised labels: each run of non-word characters becomes "_", all lower case
data_wide["vehicle_cats"] = (
    data_wide["vehicle_type_code"]
    .str.replace(pat=r"\W+", repl="_", regex=True)
    .str.lower()
)
# number of distinct normalised labels
data_wide["vehicle_cats"].nunique()

149

In [117]:
# reduce to `top` categories: the (top - 1) most frequent vehicle labels are kept,
# every other non-missing label is collapsed into "other"
top = 50
vehicle_top_cats = data_wide["vehicle_cats"].value_counts().head(top - 1).index

data_wide["vehicle_cats_short"] = (
    data_wide["vehicle_cats"]
    .where(
        # keep frequent labels and leave missing values untouched
        data_wide["vehicle_cats"].isin(vehicle_top_cats)
        | data_wide["vehicle_cats"].isna(),
        other="other",
    )
    .astype("category")
)

data_wide["vehicle_cats_short"].value_counts()

vehicle_cats_short
sedan                                  12657
station_wagon_sport_utility_vehicle     9134
bike                                     747
taxi                                     572
box_truck                                566
pick_up_truck                            556
bus                                      483
e_bike                                   320
motorcycle                               248
tractor_truck_diesel                     213
ambulance                                172
van                                      165
e_scooter                                149
other                                    131
moped                                     97
dump                                      94
pk                                        61
convertible                               58
flat_bed                                  54
garbage_or_refuse                         51
carry_all                                 40
tow_truck_wrecker                   

## Roads

In [None]:
# data["on_street_name"].str.split(" ")

## Cyclists

### Injured Cyclists

In [None]:
# frequency of each cyclist-injured count (missing values are left out by default)
data["number_of_cyclist_injured"].value_counts()

number_of_cyclist_injured
0    14331
1      660
2        9
Name: count, dtype: int64

### Contributing factors