# Project 5: Identifying High Risk Areas from NYC Traffic Conditions
## *Template Notebook*

In this notebook:

* [Topic 1](#topic-1)
* [Topic 2](#topic-2)

#### Import Libraries & Read in Data

In [10]:
## standard imports 
import pandas as pd 
import numpy as np
import re
## visualizations
import matplotlib.pyplot as plt
import seaborn as sns
## preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.dummy import DummyClassifier
## modeling
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import MultinomialNB
## trees
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor
## NLP
from sklearn.feature_extraction.text import CountVectorizer
## analysis
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, make_scorer, f1_score, mean_squared_error

## options
import sklearn
pd.options.display.max_rows = 4000
pd.options.display.max_columns = 100
pd.set_option('max_colwidth', 100)

In [46]:
### read in data
data = pd.read_csv('./data/crash_features_2018.csv')

In [12]:
data.head()

Unnamed: 0,date,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,geometry,TIME HR,date_2
0,2018-01-01 00:00:00,2018-01-01,0:00,MANHATTAN,10016.0,40.743126,-73.981674,"(40.743126, -73.981674)",,,135 EAST 29 STREET,1.0,0.0,0,0,0,0,1,0,Unspecified,Unspecified,,,,3820090,Sedan,Taxi,,,,POINT (-73.981674 40.743126),0,2018-01-01 00:00:00
1,2018-01-01 00:00:00,2018-01-01,0:00,BRONX,10454.0,40.80321,-73.91892,"(40.80321, -73.91892)",BRUCKNER BOULEVARD,SAINT ANNS PLACE,,0.0,0.0,0,0,0,0,0,0,Unspecified,Unspecified,,,,3819077,Pick-up Truck,Sedan,,,,POINT (-73.91892 40.80321),0,2018-01-01 00:00:00
2,2018-01-01 00:00:00,2018-01-01,0:00,BROOKLYN,11230.0,40.62322,-73.96102,"(40.62322, -73.96102)",,,1095 EAST 15 STREET,0.0,0.0,0,0,0,0,0,0,Driver Inattention/Distraction,Unspecified,,,,3821055,Taxi,,,,,POINT (-73.96101999999999 40.62322),0,2018-01-01 00:00:00
3,2018-01-01 00:00:00,2018-01-01,0:00,QUEENS,11419.0,40.68297,-73.82824,"(40.68297, -73.82824)",107 AVENUE,113 STREET,,0.0,0.0,0,0,0,0,0,0,Passing Too Closely,Unspecified,,,,3819067,Sedan,,,,,POINT (-73.82824000000001 40.68297),0,2018-01-01 00:00:00
4,2018-01-01 00:00:00,2018-01-01,0:00,,,40.655743,-73.99927,"(40.655743, -73.99927)",31 STREET,,,0.0,0.0,0,0,0,0,0,0,Unspecified,Unspecified,,,,3819251,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,POINT (-73.99927 40.655743),0,2018-01-01 00:00:00


In [13]:
data.shape

(216106, 33)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216106 entries, 0 to 216105
Data columns (total 33 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   date                           216106 non-null  object        
 1   CRASH DATE                     216106 non-null  datetime64[ns]
 2   CRASH TIME                     216106 non-null  object        
 3   BOROUGH                        144564 non-null  object        
 4   ZIP CODE                       144527 non-null  float64       
 5   LATITUDE                       216106 non-null  float64       
 6   LONGITUDE                      216106 non-null  float64       
 7   LOCATION                       216106 non-null  object        
 8   ON STREET NAME                 162315 non-null  object        
 9   CROSS STREET NAME              105275 non-null  object        
 10  OFF STREET NAME                53393 non-null   object        
 11  

In [11]:
data.isna().sum()

date                                  0
CRASH DATE                            0
CRASH TIME                            0
BOROUGH                           71542
ZIP CODE                          71579
LATITUDE                              0
LONGITUDE                             0
LOCATION                              0
ON STREET NAME                    53791
CROSS STREET NAME                110831
OFF STREET NAME                  162713
NUMBER OF PERSONS INJURED             4
NUMBER OF PERSONS KILLED             12
NUMBER OF PEDESTRIANS INJURED         0
NUMBER OF PEDESTRIANS KILLED          0
NUMBER OF CYCLIST INJURED             0
NUMBER OF CYCLIST KILLED              0
NUMBER OF MOTORIST INJURED            0
NUMBER OF MOTORIST KILLED             0
CONTRIBUTING FACTOR VEHICLE 1       664
CONTRIBUTING FACTOR VEHICLE 2     32961
CONTRIBUTING FACTOR VEHICLE 3    202358
CONTRIBUTING FACTOR VEHICLE 4    213234
CONTRIBUTING FACTOR VEHICLE 5    215348
COLLISION_ID                          0


In [52]:
### convert crash date to datetime object
data['CRASH DATE'] = pd.to_datetime(data['CRASH DATE'])

KeyError: 'CRASH DATE'

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216106 entries, 0 to 216105
Data columns (total 33 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   date                           216106 non-null  object        
 1   CRASH DATE                     216106 non-null  datetime64[ns]
 2   CRASH TIME                     216106 non-null  object        
 3   BOROUGH                        144564 non-null  object        
 4   ZIP CODE                       144527 non-null  float64       
 5   LATITUDE                       216106 non-null  float64       
 6   LONGITUDE                      216106 non-null  float64       
 7   LOCATION                       216106 non-null  object        
 8   ON STREET NAME                 162315 non-null  object        
 9   CROSS STREET NAME              105275 non-null  object        
 10  OFF STREET NAME                53393 non-null   object        
 11  

In [48]:
data.set_index('CRASH DATE', inplace=True)

In [24]:
data.columns

Index(['date', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
       'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME',
       'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
       'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
       'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
       'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
       'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
       'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
       'CONTRIBUTING FACTOR VEHICLE 5', 'COLLISION_ID', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4',
       'VEHICLE TYPE CODE 5', 'geometry', 'TIME HR', 'date_2'],
      dtype='object')

In [53]:
### only need number of crashes each day, drop rest, using location as no null values here
data = data[['LOCATION']] 

In [51]:
data.head()

pandas.core.frame.DataFrame

In [54]:
crashes = data.groupby(data.index.date).count()

# data['num_crashes'] = 

In [55]:
crashes.columns

Index(['LOCATION'], dtype='object')

In [39]:
# crashes.drop(columns = ['BOROUGH', 'LONGITUDE', 'LOCATION'], inplace=True)

In [40]:
crashes.head()

Unnamed: 0,LATITUDE
2018-01-01,435
2018-01-02,610
2018-01-03,619
2018-01-04,439
2018-01-05,676


In [56]:
crashes.rename(columns={'LOCATION': 'CRASHES'}, inplace=True)

In [None]:
### select data
# X = 
# y = 
### TTS
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [57]:
crashes.head()

Unnamed: 0,CRASHES
2018-01-01,435
2018-01-02,610
2018-01-03,619
2018-01-04,439
2018-01-05,676


## Topic 1 <a class="anchor" id="topic-1"></a>
<hr/>

In [8]:
def create_year(df, year, location):
    df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE'])
    df.set_index('CRASH DATE', inplace=True)
    df = df[['LOCATION']]
    crash_nums = df.groupby(df.index.date).count()
    crash_nums.rename(columns={'LOCATION': 'CRASHES'}, inplace=True)
    crash_nums.to_csv(f'./data/crash_per_day_{year}-'+f'{location}.csv')
    pass

In [67]:
df = pd.read_csv('./data/crash_features_2018.csv')
create_year(df, 2018)

In [68]:
df = pd.read_csv('./data/crash_features_2019.csv')
create_year(df, 2019)

## Topic 2 <a class="anchor" id="topic-2"></a>
<hr/>

In [65]:
df.head()

Unnamed: 0_level_0,date,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,geometry,date_2
CRASH DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2019-01-01,2019-01-01,1:10,MANHATTAN,10011.0,40.73736,-73.99685,"(40.73736, -73.99685)",AVENUE OF THE AMERICAS,WEST 14 STREET,,0.0,0.0,0,0,0,0,0,0,Driver Inattention/Distraction,,,,,4060795,Taxi,,,,,POINT (-73.99684999999999 40.73736),2019-01-01
2019-01-01,2019-01-01,15:00,BRONX,10466.0,40.892624,-73.859924,"(40.892624, -73.859924)",,,645 EAST 231 STREET,0.0,0.0,0,0,0,0,0,0,Other Vehicular,Backing Unsafely,,,,4060650,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,POINT (-73.85992399999999 40.892624),2019-01-01
2019-01-01,2019-01-01,15:10,BROOKLYN,11219.0,40.632103,-73.99335,"(40.632103, -73.99335)",,,1350 53 STREET,1.0,0.0,0,0,0,0,1,0,Failure to Yield Right-of-Way,Unspecified,,,,4062940,Station Wagon/Sport Utility Vehicle,Sedan,,,,POINT (-73.99335000000001 40.632103),2019-01-01
2019-01-01,2019-01-01,16:06,QUEENS,11354.0,40.76465,-73.823494,"(40.76465, -73.823494)",PARSONS BOULEVARD,NORTHERN BOULEVARD,,0.0,0.0,0,0,0,0,0,0,Backing Unsafely,Unspecified,,,,4060603,Station Wagon/Sport Utility Vehicle,Sedan,,,,POINT (-73.823494 40.76465),2019-01-01
2019-01-01,2019-01-01,20:23,BROOKLYN,11218.0,40.640205,-73.98558,"(40.640205, -73.98558)",,,1325 39 STREET,0.0,0.0,0,0,0,0,0,0,Unspecified,Unspecified,,,,4061129,Van,Sedan,,,,POINT (-73.98558 40.640205),2019-01-01


### Create for Lincoln Tunnel Date

In [2]:
### read in data
df = pd.read_csv('./data/crashes_lincoln_tunnel.csv', index_col=0)

In [11]:
df.head()

Unnamed: 0_level_0,date,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,CONTRIBUTING FACTOR VEHICLE 1,geometry
CRASH DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-01-01,2019-01-01,4:50,MANHATTAN,10001.0,40.751026,-73.99662,"(40.751026, -73.99662)",Unspecified,POINT (-73.99661999999999 40.751026)
2019-01-01,2019-01-01,4:23,,,40.7615,-73.997826,"(40.7615, -73.997826)",Driver Inattention/Distraction,POINT (-73.997826 40.7615)
2019-01-01,2019-01-01,2:30,MANHATTAN,10001.0,40.749706,-73.99157,"(40.749706, -73.99157)",Passing or Lane Usage Improper,POINT (-73.99157 40.749706)
2019-01-01,2019-01-01,19:45,,,40.758533,-73.98885,"(40.758533, -73.98885)",Turning Improperly,POINT (-73.98885 40.758533)
2019-01-01,2019-01-01,19:03,MANHATTAN,10018.0,40.754055,-73.99583,"(40.754055, -73.99583)",Other Vehicular,POINT (-73.99583 40.754055)


In [12]:
def create_year_v2(df, year, location):
#     df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE'])
#     df.set_index('CRASH DATE', inplace=True)
    df = df[['LOCATION']]
    crash_nums = df.groupby(df.index.date).count()
    crash_nums.rename(columns={'LOCATION': 'CRASHES'}, inplace=True)
    crash_nums.to_csv(f'./data/crashes_per_day_{year}-'+f'{location}.csv')
    pass

create_year_v2(df, 2019, 'lincoln-tunnel')