# New York Motor Vehicle collisions

## Setup

In [1]:
# import required libraries
import requests
import pandas as pd
import numpy as np
import re
import seaborn as sns

## Data acquision

### define functions to read data

In [2]:
# def read_api_chunk(api, limit=1000, offset=0):
#     source = f"{api}?${limit=}&${offset=}"
#     r = requests.get(source)
#     data = pd.DataFrame.from_dict(r.json()) # read directly from json instead? faster?
#     return data

# use https://docs.python.org/3/reference/expressions.html#yield-expressions


def read_api_chunk(api, limit=1000, offset=0):
    """read a single chunk from the api"""
    return pd.read_json(f"{api}?${limit=}&${offset=}")


def read_api(api, size=1000, chunk_size=1000):
    """read given number of lines from api, applying the chunk_size along the way"""
    chunk_generator = (
        # define chunks; the last chunk might be smaller than chunk_size
        read_api_chunk(api, limit=min(chunk_size, size - x), offset=x)
        for x in range(0, size, chunk_size)
    )
    # in the generator expressions, the chunks are not yet read and stored in memory
    # the outer paranthesis are synctactilly required for generator expressions; they
    # are not included simply in order to permit the multiline definition

    # pd.concat can handle generator expressions. According to the api reference, the objs argument
    # accepts a sequence of DataFrame objects. This indicates that any iterable that yields DataFrame
    # objects will be accepted, which is what chunk_generator provides.
    return pd.concat(chunk_generator)

### inputs

In [3]:
# set input parameters
api = "https://data.cityofnewyork.us/resource/h9gi-nx95.json"
n = 50e3
# limit = 1000

### read data

In [4]:
# read data
data_raw = read_api_chunk(api, limit=int(n))

In [5]:
# initial cleaning of data
data = data_raw.rename(
    columns={
        "vehicle_type_code1": "vehicle_type_code_1",
        "vehicle_type_code2": "vehicle_type_code_2",
    }
)
data.set_index(keys="collision_id", drop=False, inplace=True)
text_cols = [col for col in data if re.search("(street|contributing_factor)", col)]
data[text_cols] = data[text_cols].astype("string")

In [6]:
# format data to cover 5 vehicles in one column
data_wide = pd.wide_to_long(
    data,
    stubnames=["vehicle_type_code_", "contributing_factor_vehicle_"],
    i="collision_id",
    j="vehicle_no",
)
data_wide.rename(
    columns={
        "vehicle_type_code_": "vehicle_type_code",
        "contributing_factor_vehicle_": "contributing_factor_vehicle",
    },
    inplace=True,
)

# keep rows for vehicle no. > 1 only if relevant information pertaining to the vehicle is present; the row is redundant otherwise
_cnd1 = (
    data_wide[["vehicle_type_code", "contributing_factor_vehicle"]]
    .notnull()
    .any(axis=1)
)
_cnd2 = data_wide.index.get_level_values(level=1) == 1
_cnd = _cnd1 | _cnd2
data_wide = data_wide.loc[_cnd, :]
# data_wide.dtypes
data_wide.to_csv("data_wide.csv")

#### contributing Factors

In [7]:
# create a mapping for contributing factor to their code
confac = (
    data_wide["contributing_factor_vehicle"]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
    .to_frame(name="contributing_factor")
)


def get_first_chars(input):
    """retrieve first character of each word in a string of words"""
    return "".join(item[0].upper() for item in re.findall("\w+", input))


confac["cf"] = confac["contributing_factor"].apply(get_first_chars)

# if not unique, add counting index
confac["n"] = confac.groupby(["cf"]).cumcount()
_k = confac["n"] > 0
confac.loc[_k, "cf"] = confac.loc[_k, "cf"] + confac.loc[_k, "n"].astype("string")
confac.set_index("contributing_factor", inplace=True)

confac.to_csv("output/confac.csv")

confac_cols = "cf." + confac["cf"]
mapping_cf = pd.Series(confac["cf"]).to_dict()

In [8]:
# determine dummies grouped by collision_id
data_wide["cf"] = data_wide["contributing_factor_vehicle"].replace(mapping_cf)
dummies_cf_long = pd.get_dummies(data_wide, columns=["cf"], prefix_sep=".")
dummies_cf = dummies_cf_long[confac_cols].groupby(level=0).max()
dummies_cf["n_vehicles"] = dummies_cf.sum(axis=1)
dummies_cf.head()

Unnamed: 0_level_0,cf.ADRR,cf.PS,cf.FTC,cf.U,cf.PTC,cf.DI,cf.POLUI,cf.TI,cf.ULC,cf.US,...,cf.OED,cf.TCDINW,cf.THD,cf.WI,cf.VV,cf.PM,cf.SDI,cf.LUH,cf.T,n_vehicles
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3456194,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
3460534,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
3528065,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2
4136992,False,False,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,2
4277087,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2


#### vehicle types

In [9]:
vehicle_type_codes = data_wide["vehicle_type_code"].astype("string").dropna()

In [10]:
# limit number of vehicle categories
# https://stackoverflow.com/questions/56440580/how-to-get-dummies-of-only-those-values-that-occur-more-than-x-time-in-pandas
v_cats = data_wide["vehicle_type_code"].value_counts().head(30)
# data_wide["vehicle_type_code"].str.replace(pat=r'\W+', repl="_", regex=True).value_counts().head(30)
data_wide["vehicle_type"] = data_wide["vehicle_type_code"].str.replace(pat=r'\W+', repl="_", regex=True).str.lower()
data_wide["vehicle_type"].nunique()


274

In [11]:
# reduce to top 50 most frequent types of cars. last category is set to "other"
top = 50
vehicle_top_cats = data_wide["vehicle_type"].value_counts().head(top - 1).index
data_wide["vt"] = data_wide["vehicle_type"]

data_wide.loc[
    ~data_wide["vehicle_type"].isin(vehicle_top_cats)
    & ~data_wide["vehicle_type"].isna(),
    ["vt"],
] = "other"

data_wide["vt"] = data_wide["vt"].astype("category")
data_wide["vt"].value_counts()

vt
sedan                                  41994
station_wagon_sport_utility_vehicle    30460
bike                                    2483
box_truck                               1950
pick_up_truck                           1933
taxi                                    1796
bus                                     1550
e_bike                                  1110
motorcycle                               859
tractor_truck_diesel                     722
van                                      590
e_scooter                                567
ambulance                                541
other                                    430
moped                                    331
dump                                     329
pk                                       206
convertible                              179
flat_bed                                 168
garbage_or_refuse                        164
carry_all                                145
motorbike                                129
motorsc

In [12]:
dummies_long = pd.get_dummies(data_wide, prefix=["vt", "cf"], columns=["vt", "cf"], prefix_sep=".")
dummies = dummies_long.filter(regex=r"^(vt|cf)\.").groupby(level=0).max()

In [13]:
# split data
from sklearn.model_selection import train_test_split
data2 = data.join(dummies)
data2.to_csv("output/data2.csv")
X = data2.filter(regex=r"^(vt|cf)\.", axis=1)
y = data2["number_of_persons_injured"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
# test ridge classifier
from sklearn.linear_model import RidgeClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier
clf_ridge = RidgeClassifier()
clf_ridge.fit(X_train, y_train)
clf_ridge.score(X_test, y_test)

0.71528

In [15]:
# test random tree classifier
from sklearn.ensemble import RandomForestClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn-ensemble-randomforestclassifier
clf_rfc = RandomForestClassifier(n_estimators=10)
# clf_rfc = clf_rfc.fit(X_train, y_train)
clf_rfc.fit(X_train, y_train)
clf_rfc.score(X_test, y_test)


0.71464

In [16]:
from sklearn.tree import DecisionTreeClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
clf_dct = DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9, random_state=500)
clf_dct.fit(X_train, y_train)
clf_dct.score(X_test, y_test)

0.71696

In [17]:
from sklearn.neighbors import KNeighborsClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn-neighbors-kneighborsclassifier
clf_knn = KNeighborsClassifier(algorithm='ball_tree')
clf_knn = clf_knn.fit(X_train, y_train)
clf_knn.score(X_test, y_test)

0.69096

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
# # Prepare the list of tuples with the first-layer classifiers


clf_ridge = RidgeClassifier()
clf_rfc = RandomForestClassifier(n_estimators=10)
clf_dct = DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9, random_state=500)

classifiers = [
	clf_ridge,
    clf_rfc,
    clf_dct
]

estimators = [
    # ('ridge', RidgeClassifier()),
    ('random_forest', RandomForestClassifier(n_estimators=10)),
    ('decision_tree', DecisionTreeClassifier(min_samples_leaf=3, min_samples_split=9, random_state=500))
]

# Instantiate the second-layer meta estimator
clf_meta = LogisticRegression()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression

# Build the stacking classifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html#sklearn-ensemble-stackingclassifier
clf_stack = StackingClassifier(
   estimators=estimators,
   final_estimator=clf_meta,
   # stack_method='predict_proba',
   passthrough = False)



clf_stack.fit(X_train, y_train)
clf_stack.score(X_test, y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.71872

In [28]:
zip(*classifiers)

TypeError: 'RidgeClassifier' object is not iterable

In [None]:
clf_rfc.get_params()

## Roads

In [None]:
# data["on_street_name"].str.split(" ")

## Cyclists

### Injured Cyclists

In [None]:
data["number_of_cyclist_injured"].value_counts(dropna=True)
# data.head(10)

### Contributing factors