# New York Motor Vehicle collisions

## Setup

In [1]:
import requests
import pandas as pd
import numpy as np
import re

## Data acquisition

### define functions to read data

In [2]:
# def read_api_chunk(api, limit=1000, offset=0):
#     source = f"{api}?${limit=}&${offset=}"
#     r = requests.get(source)
#     data = pd.DataFrame.from_dict(r.json()) # read directly from json instead? faster?
#     return data

# use https://docs.python.org/3/reference/expressions.html#yield-expressions

def read_api_chunk(api, limit=1000, offset=0):
    """Fetch a single page of records from the api as a DataFrame.

    The request URL is the endpoint followed by ``$limit`` and ``$offset``
    query parameters; the ``{name=}`` f-string form expands to
    ``name=<repr of value>``, so integer arguments yield e.g. ``$limit=1000``.

    Parameters
    ----------
    api : str
        Base URL of the JSON endpoint.
    limit : int
        Maximum number of rows to request.
    offset : int
        Number of rows to skip before the first returned row.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_json`` parses from the response.

    NOTE(review): no ``$order`` clause is sent, so consecutive pages
    presumably rely on the server's default row order -- confirm that this
    is stable enough for paging.
    """
    request_url = f"{api}?${limit=}&${offset=}"
    page = pd.read_json(request_url)
    return page

def read_api(api, size=1000, chunk_size=1000):
    """Read up to ``size`` rows from the api in requests of at most ``chunk_size`` rows.

    Parameters
    ----------
    api : str
        Base URL of the JSON endpoint, passed through to ``read_api_chunk``.
    size : int
        Total number of rows to request.
    chunk_size : int
        Maximum number of rows per request; the last request may be smaller.

    Returns
    -------
    pandas.DataFrame
        All chunks stacked in request order with a fresh ``0..n-1`` index.
        An empty DataFrame is returned when ``size`` is not positive.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if size <= 0:
        # pd.concat raises on an empty sequence, so short-circuit here.
        return pd.DataFrame()

    # Generator expression: the chunks are not fetched or held in memory until
    # pd.concat iterates over it. The outer parentheses are syntactically
    # required for a generator expression; they are not there merely to allow
    # the multi-line layout.
    chunk_generator = (
        # the last chunk may be smaller than chunk_size
        read_api_chunk(api, limit=min(chunk_size, size - start), offset=start)
        for start in range(0, size, chunk_size)
    )

    # pd.concat accepts any iterable of DataFrames. Every chunk arrives with
    # its own 0-based index, so ignore_index=True is needed to avoid duplicate
    # row labels in the combined frame.
    return pd.concat(chunk_generator, ignore_index=True)




### inputs

In [3]:
# JSON endpoint on data.cityofnewyork.us that the read functions query.
api = "https://data.cityofnewyork.us/resource/h9gi-nx95.json"
# NOTE(review): `size` and `limit` are not referenced by any visible cell;
# the download below hard-codes limit=15000 instead -- confirm which is intended.
size = 12000
limit = 1000
# Disabled alternative: chunked download through read_api.
# data = read_api(api, size=12500)
# Fetch up to 15000 rows in a single request (offset defaults to 0).
data = read_api_chunk(api, limit=15000)

## EDA

### Cleaning

In [4]:
# Inspect the dtypes pandas inferred while parsing the JSON. In the recorded
# output crash_time is datetime64[ns] while crash_date is still object.
data.dtypes

crash_date                               object
crash_time                       datetime64[ns]
on_street_name                           object
off_street_name                          object
number_of_persons_injured                 int64
number_of_persons_killed                  int64
number_of_pedestrians_injured             int64
number_of_pedestrians_killed              int64
number_of_cyclist_injured                 int64
number_of_cyclist_killed                  int64
number_of_motorist_injured                int64
number_of_motorist_killed                 int64
contributing_factor_vehicle_1            object
contributing_factor_vehicle_2            object
collision_id                              int64
vehicle_type_code1                       object
vehicle_type_code2                       object
borough                                  object
zip_code                                float64
latitude                                float64
longitude                               

In [5]:
# Street-name and contributing-factor columns hold free text: store them with
# pandas' nullable "string" dtype.
text_cols = [name for name in data.columns if re.search("(street|contributing_factor)", name)]
data[text_cols] = data[text_cols].astype("string")

cols_contributing_factor = [name for name in data.columns if re.match(r"^contributing_factor", name)]

# Per row: how many of the vehicle_* columns are filled in.
vehicle_type_cols = [name for name in data.columns if re.search(r"^vehicle", name)]
missing_vehicle_types = data[vehicle_type_cols].isnull().sum(axis=1)
data["number_of_vehicles"] = len(vehicle_type_cols) - missing_vehicle_types

# Per row: how many of the contributing_factor* columns are filled in.
contributing_factor_cols = [name for name in data.columns if re.search(r"^contributing_factor", name)]
missing_contributing_factors = data[contributing_factor_cols].isnull().sum(axis=1)
data["number_of_contributing_factors"] = len(contributing_factor_cols) - missing_contributing_factors


## Roads

In [6]:
# Split each street name into tokens on single spaces; rows without a street
# name stay <NA> (see the recorded output).
data["on_street_name"].str.split(" ")

0           [WHITESTONE, EXPRESSWAY]
1        [QUEENSBORO, BRIDGE, UPPER]
2             [THROGS, NECK, BRIDGE]
3                               <NA>
4                 [SARATOGA, AVENUE]
                    ...             
14995               [FULTON, STREET]
14996               [JEROME, AVENUE]
14997       [NORTH, CONDUIT, AVENUE]
14998                           <NA>
14999                    [4, AVENUE]
Name: on_street_name, Length: 15000, dtype: object

## Cyclists

### Injured Cyclists

In [7]:
# Frequency table: number of rows for each count of injured cyclists
# (dropna=True is the pandas default, spelled out here explicitly).
data["number_of_cyclist_injured"].value_counts(dropna=True)
# data.head(10)

number_of_cyclist_injured
0    14331
1      660
2        9
Name: count, dtype: int64

### Contributing factors

In [29]:
# Determine the distinct contributing factors: stack every
# contributing_factor_vehicle_* column into one long Series ...
confac0 = pd.concat((data[col] for col in data if re.match("contributing_factor_vehicle", col)))
# ... then drop missing values BEFORE de-duplicating. Series.unique() returns a
# plain NumPy array for object-dtype data, which has no .dropna(); this order
# works for any dtype and yields the same values in the same first-seen order.
confac = pd.Series(confac0.dropna().unique())
print(type(confac))
confac
# data["contributing_factor_vehicle_1"].value_counts()
# TODO: get dummies
# TODO: filter by number of involved vehicles
# TODO: add rolling count

<class 'pandas.core.series.Series'>


0                          Aggressive Driving/Road Rage
1                                     Pavement Slippery
2                                 Following Too Closely
3                                           Unspecified
4                                   Passing Too Closely
5                                   Driver Inexperience
6                        Passing or Lane Usage Improper
7                                    Turning Improperly
8                                  Unsafe Lane Changing
9                                          Unsafe Speed
10                       Reaction to Uninvolved Vehicle
11                                     Steering Failure
12                          Traffic Control Disregarded
13                                      Other Vehicular
14                       Driver Inattention/Distraction
15                                    Oversized Vehicle
16    Pedestrian/Bicyclist/Other Pedestrian Error/Co...
17                                  Alcohol Invo

In [25]:
# extract leading characters
def get_first_chars(input):
    """Return the upper-cased first character of every word in ``input``, joined.

    A "word" is a maximal run of ``\\w`` characters, so separators such as
    spaces and "/" split words: "Aggressive Driving/Road Rage" -> "ADRR".
    An input without any word characters yields an empty string.

    Parameters
    ----------
    input : str
        Text to abbreviate. (The name shadows the builtin ``input``; it is
        kept so that existing keyword callers continue to work.)

    Returns
    -------
    str
        The abbreviation.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return "".join(word[0].upper() for word in re.findall(r"\w+", input))

# Abbreviate every contributing factor to its initials.
# NOTE(review): the abbreviations are not unique -- in the recorded output
# "OV", "DI" and "PD" each occur more than once -- so they cannot serve as keys.
pd.Series(confac).apply(get_first_chars)


0       ADRR
1         PS
2        FTC
3          U
4        PTC
5         DI
6      POLUI
7         TI
8        ULC
9         US
10      RTUV
11        SF
12       TCD
13        OV
14       DID
15        OV
16    PBOPEC
17        AI
18       VOL
19    FTYROW
20         I
21        LC
22        BD
23        BU
24         G
25        PD
26        FA
27        OD
28        TW
29        AA
30        DI
31        PD
32       OLD
33       OCD
34       DRV
35       TFI
36        FD
37        HD
38        AD
39      FTKR
40        PD
41       EOD
42      CPHF
43      LMII
44      CPHH
45     UOBND
46       OED
47    TCDINW
48       THD
49        WI
50        VV
dtype: object