# New York Motor Vehicle collisions

## Setup

In [1]:
import requests
import pandas as pd
import numpy as np
import re

## Data acquisition

### define functions to read data

In [2]:
# def read_api_chunk(api, limit=1000, offset=0):
#     source = f"{api}?${limit=}&${offset=}"
#     r = requests.get(source)
#     data = pd.DataFrame.from_dict(r.json()) # read directly from json instead? faster?
#     return data

# use https://docs.python.org/3/reference/expressions.html#yield-expressions

def read_api_chunk(api, limit=1000, offset=0):
    """Read one page of rows from the API.

    Parameters
    ----------
    api : str
        Endpoint URL; the paging query string is appended after a ``?``.
    limit : int, default 1000
        Value sent as the ``$limit`` query parameter.
    offset : int, default 0
        Value sent as the ``$offset`` query parameter.

    Returns
    -------
    The object ``pd.read_json`` produces for the assembled URL.
    """
    # The f-string "=" specifier expands f"${limit=}" to "$limit=<repr(limit)>",
    # i.e. "$limit=1000" for integer arguments.
    paging = "&".join((f"${limit=}", f"${offset=}"))
    return pd.read_json(f"{api}?{paging}")

def read_api(api, size=1000, chunk_size=1000):
    """Read up to ``size`` rows from the API in pages of at most ``chunk_size`` rows.

    Parameters
    ----------
    api : str
        Endpoint URL, passed through to ``read_api_chunk``.
    size : int, default 1000
        Total number of rows to request.
    chunk_size : int, default 1000
        Maximum number of rows per request; the last request may be smaller.

    Returns
    -------
    pandas.DataFrame
        All chunks stacked in request order with a fresh 0..n-1 index.
        An empty frame when ``size`` is not positive.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if size <= 0:
        # pd.concat raises on an empty sequence; nothing was requested, so
        # hand back an empty frame instead of crashing.
        return pd.DataFrame()

    # Generator expression: a chunk is only downloaded when pd.concat pulls it.
    # The parentheses are required syntax for a generator expression, not just
    # a way to spread the definition over several lines.
    chunk_generator = (
        # the last chunk may be smaller than chunk_size
        read_api_chunk(api, limit=min(chunk_size, size - start), offset=start)
        for start in range(0, size, chunk_size)
    )

    # pd.concat accepts any iterable of DataFrames. Every chunk arrives with
    # its own index, so ignore_index=True is needed to avoid repeated labels
    # in the combined frame.
    return pd.concat(chunk_generator, ignore_index=True)




### inputs

In [3]:
# Endpoint the collision records are read from (JSON resource).
api = "https://data.cityofnewyork.us/resource/h9gi-nx95.json"
# NOTE(review): size and limit are not used by the download below, which
# hard-codes limit=15000 — confirm whether they are still needed.
size = 12000
limit = 1000
# Chunked alternative, currently disabled:
# data = read_api(api, size=12500)
# Single request with limit=15000; offset keeps its default of 0.
data_raw = read_api_chunk(api, limit=15000)

In [55]:
# Reshape from one row per collision to one row per (collision, vehicle).
#
# rename() without inplace returns a new frame, so data_raw is left exactly as
# it was downloaded and this cell can be re-run without side effects on it.
# The two columns are renamed so that they match the "vehicle_type_code_" stub
# that wide_to_long looks for below.
data = data_raw.rename(
    columns={
        "vehicle_type_code1": "vehicle_type_code_1",
        "vehicle_type_code2": "vehicle_type_code_2",
    }
)
# Every "<stub><number>" column becomes a row; the number goes into the new
# "vehicle_no" index level next to "collision_id".
data = pd.wide_to_long(
    data,
    stubnames=["vehicle_type_code_", "contributing_factor_vehicle_"],
    i="collision_id",
    j="vehicle_no",
)
# The stubs carried a trailing underscore only to match the wide column names.
data = data.rename(
    columns={
        "vehicle_type_code_": "vehicle_type_code",
        "contributing_factor_vehicle_": "contributing_factor_vehicle",
    }
)
# condition1: the row holds some information about this vehicle.
condition1 = data[["vehicle_type_code", "contributing_factor_vehicle"]].notnull().any(axis=1)
# condition2: the row belongs to vehicle no. 1 (index level 1 is vehicle_no).
condition2 = data.index.get_level_values(level=1) == 1
relevant_rows = condition1 | condition2
# Keep rows for vehicle no. > 1 only if information pertaining to that vehicle
# is present; such a row is redundant otherwise.
# .copy() makes the result an independent frame, so later column assignments
# do not act on a slice of the reshaped frame.
data = data.loc[relevant_rows, :].copy()
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_persons_injured,zip_code,crash_time,number_of_pedestrians_killed,on_street_name,location,number_of_persons_killed,cross_street_name,crash_date,number_of_motorist_killed,number_of_pedestrians_injured,number_of_cyclist_killed,number_of_cyclist_injured,off_street_name,longitude,borough,latitude,number_of_motorist_injured,vehicle_type_code,contributing_factor_vehicle
collision_id,vehicle_no,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,1,2,,2023-07-08 02:39:00,0,WHITESTONE EXPRESSWAY,,0,,2021-09-11T00:00:00.000,0,0,0,0,20 AVENUE,,,,2,Sedan,Aggressive Driving/Road Rage
4513547,1,1,,2023-07-08 11:45:00,0,QUEENSBORO BRIDGE UPPER,,0,,2022-03-26T00:00:00.000,0,0,0,0,,,,,1,Sedan,Pavement Slippery
4541903,1,0,,2023-07-08 06:55:00,0,THROGS NECK BRIDGE,,0,,2022-06-29T00:00:00.000,0,0,0,0,,,,,0,Sedan,Following Too Closely
4456314,1,0,11208.0,2023-07-08 09:35:00,0,,"{'latitude': '40.667202', 'longitude': '-73.86...",0,1211 LORING AVENUE,2021-09-11T00:00:00.000,0,0,0,0,,-73.8665,BROOKLYN,40.667202,0,Sedan,Unspecified
4486609,1,0,11233.0,2023-07-08 08:13:00,0,SARATOGA AVENUE,"{'latitude': '40.683304', 'longitude': '-73.91...",0,,2021-12-14T00:00:00.000,0,0,0,0,DECATUR STREET,-73.917274,BROOKLYN,40.683304,0,,


## EDA

### Cleaning

In [56]:
# Check the dtypes pandas inferred for the reshaped frame before cleaning them up.
data.dtypes

number_of_persons_injured                 int64
zip_code                                float64
crash_time                       datetime64[ns]
number_of_pedestrians_killed              int64
on_street_name                           object
location                                 object
number_of_persons_killed                  int64
cross_street_name                        object
crash_date                               object
number_of_motorist_killed                 int64
number_of_pedestrians_injured             int64
number_of_cyclist_killed                  int64
number_of_cyclist_injured                 int64
off_street_name                          object
longitude                               float64
borough                                  object
latitude                                float64
number_of_motorist_injured                int64
vehicle_type_code                        object
contributing_factor_vehicle              object
dtype: object

In [57]:
# Columns whose name mentions a street or a contributing factor hold free text;
# store them with the nullable "string" dtype instead of generic object.
text_pattern = re.compile("(street|contributing_factor)")
text_cols = [name for name in data.columns if text_pattern.search(name)]
data[text_cols] = data[text_cols].astype("string")

In [106]:
# One row per distinct, non-missing contributing factor, numbered from 0.
data_cf = (
    data["contributing_factor_vehicle"]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
    .to_frame(name="contributing_factor")
)
def get_first_chars(input):
    """Build an upper-case acronym from the first character of each word.

    Words are maximal runs of regex word characters, so separators such as
    spaces, "/", "-" and parentheses all start a new word.

    Parameters
    ----------
    input : str
        Text to abbreviate.

    Returns
    -------
    str
        The upper-cased first characters joined together; an empty string
        when ``input`` contains no word characters.
    """
    # Raw string: "\w" is not a valid escape sequence in an ordinary string
    # literal and triggers an invalid-escape warning at compile time.
    return "".join(word[0].upper() for word in re.findall(r"\w+", input))

# Acronym for every distinct contributing factor.
data_cf["cf"] = data_cf["contributing_factor"].apply(get_first_chars)
# Running number within each acronym: 0 for its first use, 1 for the next, ...
data_cf["cf_count"] = data_cf.groupby("cf").cumcount()
# Repeated acronyms get their running number appended to keep them distinct.
k = data_cf["cf_count"].gt(0)
suffix = data_cf.loc[k, "cf_count"].astype("string")
data_cf.loc[k, "cf"] = data_cf.loc[k, "cf"] + suffix
# Lookup table: full contributing-factor text -> acronym.
data_cf = data_cf.set_index("contributing_factor")
mapping_cf = data_cf["cf"].to_dict()

{'Aggressive Driving/Road Rage': 'ADRR',
 'Pavement Slippery': 'PS',
 'Following Too Closely': 'FTC',
 'Unspecified': 'U',
 'Passing Too Closely': 'PTC',
 'Driver Inexperience': 'DI',
 'Passing or Lane Usage Improper': 'POLUI',
 'Turning Improperly': 'TI',
 'Unsafe Lane Changing': 'ULC',
 'Unsafe Speed': 'US',
 'Reaction to Uninvolved Vehicle': 'RTUV',
 'Steering Failure': 'SF',
 'Traffic Control Disregarded': 'TCD',
 'Other Vehicular': 'OV',
 'Driver Inattention/Distraction': 'DID',
 'Oversized Vehicle': 'OV1',
 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion': 'PBOPEC',
 'Alcohol Involvement': 'AI',
 'View Obstructed/Limited': 'VOL',
 'Failure to Yield Right-of-Way': 'FTYROW',
 'Illnes': 'I',
 'Lost Consciousness': 'LC',
 'Brakes Defective': 'BD',
 'Backing Unsafely': 'BU',
 'Glare': 'G',
 'Passenger Distraction': 'PD',
 'Fell Asleep': 'FA',
 'Obstruction/Debris': 'OD',
 'Tinted Windows': 'TW',
 'Animals Action': 'AA',
 'Drugs (illegal)': 'DI1',
 'Pavement Defective': 'PD1',
 

In [5]:

# Names of the contributing-factor columns.
cols_contributing_factor = [
    name for name in data.columns if re.match(r"^contributing_factor", name)
]

# Per row: how many of the vehicle columns are filled in.
vehicle_type_cols = data.filter(regex=r"^vehicle", axis=1).columns.to_list()
data["number_of_vehicles"] = data[vehicle_type_cols].notna().sum(axis=1)

# Per row: how many of the contributing-factor columns are filled in.
contributing_factor_cols = data.filter(regex=r"^contributing_factor", axis=1).columns.to_list()
data["number_of_contributing_factors"] = data[contributing_factor_cols].notna().sum(axis=1)


## Roads

In [6]:
# Break each street name into its words, splitting on single spaces.
data["on_street_name"].str.split(" ")

0           [WHITESTONE, EXPRESSWAY]
1        [QUEENSBORO, BRIDGE, UPPER]
2             [THROGS, NECK, BRIDGE]
3                               <NA>
4                 [SARATOGA, AVENUE]
                    ...             
14995               [JEROME, AVENUE]
14996       [NORTH, CONDUIT, AVENUE]
14997                           <NA>
14998                    [4, AVENUE]
14999                   [89, STREET]
Name: on_street_name, Length: 15000, dtype: object

## Cyclists

### Injured Cyclists

In [7]:
# Frequency of each injured-cyclist count; missing values are excluded.
data["number_of_cyclist_injured"].value_counts(dropna=True)
# data.head(10)

number_of_cyclist_injured
0    14331
1      660
2        9
Name: count, dtype: int64

### Contributing factors

In [8]:
# determine contributing factors
confac_ = pd.concat((data[col] for col in data if re.match("contributing_factor_vehicle", col)))
confac = confac_.drop_duplicates().dropna().to_frame(name="ContributingFactor")
# get dummies
# filter by number of involved vehicles
# add rolling count
# use mapping
# extract leading characters
def get_first_chars(input):
    """Build an upper-case acronym from the first character of each word.

    Words are maximal runs of regex word characters, so separators such as
    spaces, "/", "-" and parentheses all start a new word.

    Parameters
    ----------
    input : str
        Text to abbreviate.

    Returns
    -------
    str
        The upper-cased first characters joined together; an empty string
        when ``input`` contains no word characters.
    """
    # Raw string: "\w" is not a valid escape sequence in an ordinary string
    # literal and triggers an invalid-escape warning at compile time.
    return "".join(word[0].upper() for word in re.findall(r"\w+", input))

# Acronym for every distinct contributing factor.
confac["CF"] = confac["ContributingFactor"].apply(get_first_chars)
# Running number within each acronym: 0 for its first use, 1 for the next, ...
confac["CFCount"] = confac.groupby(["CF"]).cumcount()
# Repeated acronyms get their running number appended to keep them distinct.
is_repeat = confac["CFCount"] > 0
confac.loc[is_repeat, "CF"] = (
    confac.loc[is_repeat, "CF"] + confac.loc[is_repeat, "CFCount"].astype("string")
)
# Lookup table: full contributing-factor text -> acronym.
confac = confac.set_index("ContributingFactor")
confac_mapping = confac["CF"].to_dict()

# Target names for the coded columns: "contributing_factor_vehicle_<n>" -> "CFV<n>".
# The trailing underscore is optional so that a single un-numbered
# "contributing_factor_vehicle" column maps to "CFV" rather than to itself,
# which would overwrite the original text column with the acronyms.
cf_cols = [
    re.sub(r"^contributing_factor_vehicle_?", "CFV", col)
    for col in contributing_factor_cols
]
data[cf_cols] = data[contributing_factor_cols].replace(confac_mapping)
# pd.get_dummies(data, columns=cf_cols)

In [22]:
# Show only the columns whose name starts with "CF" (the coded factor columns).
data.filter(regex=r"^CF", axis=1)
# Idea: one-hot encode the codes instead
# pd.get_dummies(data, columns=cf_cols).filter(regex=r"^CF", axis=1)
# Idea: reshape with pd.wide_to_long


    


Unnamed: 0,CFV1,CFV2,CFV3,CFV4,CFV5
0,ADRR,U,,,
1,PS,,,,
2,FTC,U,,,
3,U,,,,
4,,,,,
...,...,...,...,...,...
14995,DID,PTC,,,
14996,ULC,U,,,
14997,FTYROW,,,,
14998,DI,DI,,,
