# New York Motor Vehicle collisions

## Setup

In [1]:
import requests
import pandas as pd
import numpy as np
import re

## Data acquisition

### define functions to read data

In [2]:
# def read_api_chunk(api, limit=1000, offset=0):
#     source = f"{api}?${limit=}&${offset=}"
#     r = requests.get(source)
#     data = pd.DataFrame.from_dict(r.json()) # read directly from json instead? faster?
#     return data

# use https://docs.python.org/3/reference/expressions.html#yield-expressions

def read_api_chunk(api, limit=1000, offset=0):
    """Read a single chunk (page) of records from the API.

    The request URL is ``<api>?$limit=<limit>&$offset=<offset>``; it is handed to
    ``pd.read_json``, which downloads and parses the response.

    Parameters
    ----------
    api : str
        Endpoint URL without a query string.
    limit : int
        Maximum number of records to request.
    offset : int
        Number of records to skip before the first requested record.

    Returns
    -------
    pd.DataFrame
        Whatever ``pd.read_json`` parses from the response.

    Raises
    ------
    TypeError, ValueError
        If ``limit`` or ``offset`` cannot be converted to ``int``.
    """
    # Format the numbers explicitly rather than with the f-string debug specifier
    # (``{limit=}``). That specifier inserts ``repr(value)``, which leaks quotes or
    # type wrappers into the URL for anything that is not a plain ``int``
    # (for example ``'500'`` or ``np.int64(500)``).
    return pd.read_json(f"{api}?$limit={int(limit)}&$offset={int(offset)}")

def read_api(api, size=1000, chunk_size=1000):
    """Read up to ``size`` records from the API in chunks of ``chunk_size``.

    Each chunk is fetched with ``read_api_chunk``; the last chunk is shortened so
    that no more than ``size`` records are requested in total.

    Parameters
    ----------
    api : str
        Endpoint URL, passed through to ``read_api_chunk``.
    size : int
        Total number of records to request.
    chunk_size : int
        Maximum number of records per request; must be positive.

    Returns
    -------
    pd.DataFrame
        All fetched chunks stacked vertically with a fresh ``0..n-1`` index.
        An empty DataFrame if ``size`` is not positive or nothing was returned.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # NOTE(review): paging by offset assumes the API returns records in the same
    # order for every request -- confirm against the API's paging documentation.
    chunks = []
    for offset in range(0, size, chunk_size):
        # The last chunk may be smaller than chunk_size.
        wanted = min(chunk_size, size - offset)
        chunk = read_api_chunk(api, limit=wanted, offset=offset)
        if len(chunk) > 0:
            chunks.append(chunk)
        if len(chunk) < wanted:
            # A short chunk means the source has no more records; further
            # requests would only return empty pages.
            break

    if not chunks:
        # pd.concat raises on an empty list, so handle "nothing to read" here.
        return pd.DataFrame()

    # Every chunk carries its own 0..n-1 index; ignore_index avoids duplicate
    # row labels in the combined frame.
    return pd.concat(chunks, ignore_index=True)




### inputs

In [26]:
# Endpoint of the collisions dataset; query parameters are appended by read_api_chunk.
api = "https://data.cityofnewyork.us/resource/h9gi-nx95.json"
# Number of records to download. The outputs shown further down were produced
# from 15000 records, so the request size is defined here instead of being
# hard-coded in the call below.
size = 15000
# Chunk size for a chunked download, i.e. read_api(api, size=size, chunk_size=limit).
limit = 1000
# Single request for all `size` records.
data_raw = read_api_chunk(api, limit=size)

In [58]:
# Rename on a copy: `data_raw` must stay exactly as downloaded, so neither an
# alias (`data = data_raw`) nor `inplace=True` is used here.
# The first two vehicle-type columns lack the underscore before the number that
# the stub name below expects, hence the rename.
data = data_raw.rename(
    columns={
        "vehicle_type_code1": "vehicle_type_code_1",
        "vehicle_type_code2": "vehicle_type_code_2",
    }
)

# One row per (collision_id, vehicle_no) instead of one numbered column per vehicle.
data = pd.wide_to_long(
    data,
    stubnames=["vehicle_type_code_", "contributing_factor_vehicle_"],
    i="collision_id",
    j="vehicle_no",
)

# Keep only vehicle slots that carry information, i.e. drop the rows in which both
# the vehicle type and the contributing factor are missing.
has_vehicle_info = (
    data["vehicle_type_code_"].notna() | data["contributing_factor_vehicle_"].notna()
)
# .copy() so that later column assignments on `data` do not act on a slice.
data = data.loc[has_vehicle_info, :].copy()

## EDA

### Cleaning

In [4]:
# Inspect the dtype of every column of `data`.
data.dtypes

crash_date                               object
crash_time                       datetime64[ns]
on_street_name                           object
off_street_name                          object
number_of_persons_injured                 int64
number_of_persons_killed                  int64
number_of_pedestrians_injured             int64
number_of_pedestrians_killed              int64
number_of_cyclist_injured                 int64
number_of_cyclist_killed                  int64
number_of_motorist_injured                int64
number_of_motorist_killed                 int64
contributing_factor_vehicle_1            object
contributing_factor_vehicle_2            object
collision_id                              int64
vehicle_type_code1                       object
vehicle_type_code2                       object
borough                                  object
zip_code                                float64
latitude                                float64
longitude                               

In [5]:
# Street-name and contributing-factor columns hold free text: store them with the
# nullable "string" dtype. The contributing-factor alternative is anchored to the
# start of the name so that the derived "number_of_contributing_factors" column
# (created below) is not picked up when this cell is executed again.
text_cols = [col for col in data if re.search(r"street|^contributing_factor", col)]
data[text_cols] = data[text_cols].astype("string")

# Column groups, selected by name prefix.
vehicle_type_cols = [col for col in data if re.match(r"vehicle", col)]
contributing_factor_cols = [col for col in data if re.match(r"contributing_factor", col)]
# Same selection under its second name, kept so that code using either name still works.
cols_contributing_factor = list(contributing_factor_cols)

# Per row: how many of the vehicle-type / contributing-factor fields are filled in.
data["number_of_vehicles"] = data[vehicle_type_cols].notna().sum(axis=1)
data["number_of_contributing_factors"] = data[contributing_factor_cols].notna().sum(axis=1)


## Roads

In [6]:
# Split every street name at single spaces into a list of words.
data["on_street_name"].str.split(" ")

0           [WHITESTONE, EXPRESSWAY]
1        [QUEENSBORO, BRIDGE, UPPER]
2             [THROGS, NECK, BRIDGE]
3                               <NA>
4                 [SARATOGA, AVENUE]
                    ...             
14995               [JEROME, AVENUE]
14996       [NORTH, CONDUIT, AVENUE]
14997                           <NA>
14998                    [4, AVENUE]
14999                   [89, STREET]
Name: on_street_name, Length: 15000, dtype: object

## Cyclists

### Injured Cyclists

In [7]:
# Frequency of each distinct number of injured cyclists (missing values excluded).
data["number_of_cyclist_injured"].value_counts(dropna=True)
# data.head(10)

number_of_cyclist_injured
0    14331
1      660
2        9
Name: count, dtype: int64

### Contributing factors

In [8]:
# determine contributing factors
confac_ = pd.concat((data[col] for col in data if re.match("contributing_factor_vehicle", col)))
confac = confac_.drop_duplicates().dropna().to_frame(name="ContributingFactor")
# get dummies
# filter by number of involved vehicles
# add rolling count
# use mapping
# extract leading characters
def get_first_chars(input):
    """Build an upper-case abbreviation from the first character of each word.

    A word is a maximal run of regex word characters (letters, digits and
    underscore); everything else acts as a separator.

    Parameters
    ----------
    input : str
        Text to abbreviate.

    Returns
    -------
    str
        The upper-cased first characters of all words, concatenated; an empty
        string if ``input`` contains no word.
    """
    # Raw string: a backslash followed by "w" is not a valid escape sequence in
    # an ordinary string literal.
    return "".join(word[0].upper() for word in re.findall(r"\w+", input))

# Abbreviate each factor, then number repeated abbreviations so they stay distinct:
# the first factor with a given abbreviation keeps it unchanged, later ones get
# their running count appended.
confac["CF"] = confac["ContributingFactor"].apply(get_first_chars)
confac["CFCount"] = confac.groupby(["CF"]).cumcount()
is_repeat = confac["CFCount"] > 0
repeat_suffix = confac.loc[is_repeat, "CFCount"].astype("string")
confac.loc[is_repeat, "CF"] = confac.loc[is_repeat, "CF"] + repeat_suffix

# Lookup table: full factor text -> abbreviation.
confac = confac.set_index("ContributingFactor")
confac_mapping = confac["CF"].to_dict()

# Add one abbreviated column per contributing-factor column, with the
# "contributing_factor_vehicle_" prefix shortened to "CFV".
cf_cols = [
    col.replace('contributing_factor_vehicle_', 'CFV')
    for col in contributing_factor_cols
]
data[cf_cols] = data[contributing_factor_cols].replace(confac_mapping)
# pd.get_dummies(data, columns=cf_cols)

In [22]:
# Show only the columns whose name starts with "CF".
data.filter(regex=r"^CF", axis=1)
# pd.get_dummies(data, columns=cf_cols).filter(regex=r"^CF", axis=1)
# use pd.wide_to_long


    


Unnamed: 0,CFV1,CFV2,CFV3,CFV4,CFV5
0,ADRR,U,,,
1,PS,,,,
2,FTC,U,,,
3,U,,,,
4,,,,,
...,...,...,...,...,...
14995,DID,PTC,,,
14996,ULC,U,,,
14997,FTYROW,,,,
14998,DI,DI,,,
