# Project 5: Identifying High Risk Areas from NYC Traffic Conditions
## *Template Notebook*

In this notebook:

* [Topic 1](#topic-1)
* [Topic 2](#topic-2)

#### Import Libraries & Read in Data

In [1]:
## standard imports 
import pandas as pd 
import numpy as np
import re
## visualizations
import matplotlib.pyplot as plt
import seaborn as sns
## preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.dummy import DummyClassifier
## modeling
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import MultinomialNB
## trees
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor
## NLP
from sklearn.feature_extraction.text import CountVectorizer
## analysis
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, make_scorer, f1_score, mean_squared_error

## options
import sklearn
# Widen pandas' display limits so long/wide frames are not truncated in cell output.
# All three use the fully-qualified option; the short 'max_colwidth' form only works
# through pandas' partial-name matching, which breaks if the pattern ever becomes ambiguous.
pd.options.display.max_rows = 4000
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 100

In [2]:
### read in data
# Load the trimmed extract from the sibling data_large directory (path is relative
# to the notebook's working directory).
data = pd.read_csv('../data_large/selected_data.csv')

Trimmed data from https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Vehicles/bm4k-52h4

In [None]:
# Column dtypes and non-null counts for the raw extract.
data.info()

In [None]:
# (rows, columns) of the raw extract.
data.shape

In [None]:
# First five rows, to eyeball the raw values.
data.head()

Notes on data:

* ID and LINK_ID are the same (according to the data dictionary), so drop one of them. I suggest dropping ID so that LINK_ID stays easy to associate with LINK_POINTS.
* The data dictionary says both STATUS and TRANSCOM_ID are not useful (artifacts) -- drop these.
* TRAVEL_TIME -- units are unclear, so drop it.
* Unnamed: 0 -- leftover index column; exclude the index when exporting the data set to CSV so this column is not created.



In [None]:
# List the raw column names before choosing which ones to drop.
data.columns

In [None]:
# Columns to discard from the raw extract: the stray CSV index, the duplicate id,
# and the fields listed here as not needed for the analysis.
drop_cols = [
    'Unnamed: 0',
    'ID',
    'TRAVEL_TIME',
    'STATUS',
    'OWNER',
    'TRANSCOM_ID',
]
# `columns=` already names the axis; rebind the result rather than mutating in place.
data = data.drop(columns=drop_cols)

In [None]:
# Confirm the dropped columns are gone.
data.head()

In [None]:
# Re-check dtypes and non-null counts on the slimmer frame.
data.info()

In [None]:
# Remaining column names (still in their original case at this point).
data.columns

In [None]:
### make column names lowercase
# Use the Index string accessor so the result is a real Index rather than a one-shot iterator.
data.columns = data.columns.str.lower()

In [None]:
# Confirm the column names are now lowercase.
data.columns

In [None]:
### rename columns
# Mapping of current name -> new name; add further entries here as needed.
rename_cols = {'data_as_of': 'date_time'}

# Rebind the renamed frame, then preview the result.
data = data.rename(columns=rename_cols)
data.head()

In [None]:
### convert date to datetime object
# Parse the timestamp column in place so later cells can use datetime operations on it.
data['date_time'] = pd.to_datetime(data['date_time'])

In [None]:
# Verify that date_time now has a datetime dtype.
data.info()

In [None]:
# Preview the frame after the conversion.
data.head()

In [None]:
# Look at one raw cell: first row, fourth column by position
# (presumably link_points at this stage -- verify against data.columns).
data.iloc[0,3]

In [None]:
# Column names at this stage, before the link geometry fields are dropped.
data.columns

In [None]:
### To reduce size further for modeling, remove all the link geometry/description fields.
### They need cleaning anyway and can be joined back later via link_id.
drop_link_info = [
    'link_points',
    'encoded_poly_line',
    'encoded_poly_line_lvls',
    'link_name',
]
# `columns=` already names the axis; rebind the result rather than mutating in place.
data = data.drop(columns=drop_link_info)

In [None]:
# Preview the frame after the link fields were removed.
data.head()

In [None]:
### check size of this dataframe
# data.to_csv('./data_test/check-size-2.csv', index=False) ### still 453.2 MB ### note: wayyyy smaller after all GPS info was dropped!

In [None]:
### check if pickle helps?
# Writes to ./pickles relative to the notebook; this cell does not create that directory.
data.to_pickle('./pickles/check_size.pickle')  ### 59.9 MB!!

In [3]:
### load pickle
# NOTE(review): this reads whatever the export cell last wrote to this path. If that cell
# has not been re-run in the current session, `df` can hold different columns than `data`.
# Only unpickle files produced by this notebook -- unpickling untrusted files is unsafe.

df = pd.read_pickle('./pickles/check_size.pickle')
df.head()

Unnamed: 0,speed,date_time,link_id,link_points,encoded_poly_line,encoded_poly_line_lvls,borough,link_name
0,49.08,2019-05-20 19:29:13,4616364,"40.73744001,-73.85188001 40.737015,-73.85373001 40.73673,-73.85543001 40.736526,-73.85631001 ...",_pswFfewaMtApJv@rIh@nDlFbXhH`]bJla@|@`FbHz\fBvG,BBBBBBBBBB,Queens,LIE W 108TH ST - 84TH ST
1,34.79,2019-05-20 19:29:13,4616220,"40.6162405,-74.02612 40.61923,-74.02361 40.62362,-74.019831 40.6248406,-74.019061 40.62579,-74.0...",oz{vFffybMuQuNmZsVsFyC}D}AeLmDeO_E{JkCaEg@,BBBBBBBBB,Brooklyn,GOW N 92ND STREET - 7TH AVENUE
2,0.0,2019-05-20 19:29:13,4616216,"40.63089,-74.14569 40.6298,-74.14569 40.62883,-74.14556 40.62805,-74.145451 40.6272105,-74.14546...",av~vFpqpcMxE?`EYzCUfD@|CZtDdAnCd@`Cz@rCdAzCzAvDlCzNlNfEfEbChBhD~AjJnBnVzDtA]|@cBD_Bq@qAaBD,BBBBBBBBBBBBBBBBBBBBBBB,Staten Island,SIE E-MLK N RICHMOND AVENUE - WALKER STREET
3,52.81,2019-05-20 19:29:13,4616211,"40.6151706,-74.15738 40.61739,-74.16056 40.6205405,-74.16683 40.6209604,-74.16791 40.6210504,-74...",ys{vFrzrcM{LzRuRdf@sAvEQv@[lB,BBBBBB,Staten Island,SIE W RICHMOND AVENUE - SOUTH AVENUE
4,0.0,2019-05-20 19:29:13,4616210,"40.63092,-74.14592 40.62975,-74.14593 40.62877,-74.14579 40.6279506,-74.145671 40.62713,-74.1456...",gv~vF~rpcMhF@bE[bDWbD@vCb@|D~@dCf@`Cv@jCjA~CzAhDfCrDjDfJ~IhDhDxB~AlDlBdEdArDp@jC|Bt@zBCjDeB~DgAzA,BBBBBBBBBBBBBBBBBBBBBBBB,Staten Island,MLK S - SIE W WALKER STREET - RICHMOND AVENUE


In [None]:
# (rows, columns) of the frame reloaded from the pickle.
df.shape

In [None]:
# Readings per borough. Column names were lowercased earlier in the notebook, so the
# uppercase 'BOROUGH' key no longer exists on `data`.
data['borough'].value_counts()

In [None]:
# data['LINK_ID'].value_counts()

In [None]:
# data['ID'].value_counts()

In [None]:
# NOTE(review): 'OWNER' is one of the columns removed from `data` by the earlier drop cell
# (and the remaining names were lowercased), so this lookup raises KeyError on a
# top-to-bottom run. Move this check above the drop cell or delete it.
data['OWNER'].value_counts()

### Make a data set for Manhattan only

In [None]:
# Keep only the Manhattan rows. Column names were lowercased earlier in the notebook, so
# the filter uses 'borough'; .copy() detaches the subset from `data`.
data_man = data[data['borough'] == 'Manhattan'].copy()
# The previous version dropped from an undefined `data_sm`. The polyline column may already
# have been removed along with the other link fields, hence errors='ignore'.
data_man = data_man.drop(columns=['encoded_poly_line'], errors='ignore')

# data_man.to_csv('./data/speed_data-manhattan.csv', index=False)

## Multiindex_data <a class="anchor" id="topic-1"></a>
<hr/>

In [None]:
### read in data
# Second extract; its columns are used with their original uppercase names below.
mdata = pd.read_csv('../data_large/multiindex_data.csv')

In [None]:
# First five rows of the extract.
mdata.head()

In [None]:
# Column dtypes and non-null counts.
mdata.info()

In [None]:
# (rows, columns).
mdata.shape

In [None]:
# Parse the timestamp column so the .dt accessor can be used on it below.
mdata['DATA_AS_OF'] = pd.to_datetime(mdata['DATA_AS_OF'])

In [None]:
# Verify that DATA_AS_OF now has a datetime dtype.
mdata.info()

In [None]:
# Every reading for one link (LINK_ID 4616267), taken as an independent copy of mdata.
some_id = mdata.loc[mdata['LINK_ID'].eq(4616267)].copy()
some_id.head()

In [None]:
# Rows recorded on 2019-05-04. Comparing the raw timestamps with a date string only matches
# readings stamped exactly at midnight, so the time of day is normalized away first.
some_id[some_id['DATA_AS_OF'].dt.normalize() == '2019-05-04']

In [None]:
# Speed over time for the selected link; tiny markers keep the dense series readable.
fig, ax = plt.subplots()
ax.scatter(some_id['DATA_AS_OF'], some_id['SPEED'], s=0.1)
# Label the figure so it stands alone; the trailing semicolon hides the repr output.
ax.set(title='Speed readings for link 4616267', xlabel='DATA_AS_OF', ylabel='SPEED');

In [None]:
# All readings from the earliest calendar day present for this link. Indexing the frame
# with the Series of dates itself is treated as a column lookup and raises KeyError, so
# build a boolean mask against the earliest normalized date instead.
first_date = some_id['DATA_AS_OF'].dt.normalize().min()
first_day = some_id[some_id['DATA_AS_OF'].dt.normalize() == first_date]
first_day.head()

In [None]:
# Rows recorded on 2019-05-20. `.dt.date` yields datetime.date objects, which never compare
# equal to a str (the result was always empty); compare normalized timestamps instead.
some_id[some_id['DATA_AS_OF'].dt.normalize() == '2019-05-20']

## Topic 2 <a class="anchor" id="topic-2"></a>
<hr/>

In [None]:
### read in data
# Collision records; the column names contain spaces and are used as-is below.
cdata = pd.read_csv('../data_large/collisions.csv')

In [None]:
# (rows, columns).
cdata.shape

In [None]:
# First five rows.
cdata.head()

In [None]:
# Missing-value count per column, used to decide which columns to drop.
cdata.isnull().sum()

In [None]:
# Full list of column names.
cdata.columns

In [None]:
# Collision columns to discard: the off-street name and the 3rd-5th vehicle fields.
# Note this rebinds the `drop_cols` name used earlier for the speed data.
drop_cols = [
    'OFF STREET NAME',
    'CONTRIBUTING FACTOR VEHICLE 3',
    'CONTRIBUTING FACTOR VEHICLE 4',
    'CONTRIBUTING FACTOR VEHICLE 5',
    'VEHICLE TYPE CODE 3',
    'VEHICLE TYPE CODE 4',
    'VEHICLE TYPE CODE 5',
]
cdata = cdata.drop(columns=drop_cols)

In [None]:
# Preview after dropping the columns above.
cdata.head()

In [None]:
# Collisions per borough; dropna=False keeps rows with no borough as their own bucket.
cdata['BOROUGH'].value_counts(dropna=False)

In [None]:
# cdata['NUMBER OF PERSONS INJURED'].value_counts()

In [None]:
# Distribution of fatalities per collision.
cdata['NUMBER OF PERSONS KILLED'].value_counts()

In [None]:
# Dtypes and non-null counts after the column drop.
cdata.info()

In [None]:
# Manhattan-only collisions (this data set spells the borough in upper case);
# .copy() so the in-place edits below do not touch cdata.
cdata_man = cdata[cdata['BOROUGH']=='MANHATTAN'].copy()

In [None]:
# (rows, columns) of the Manhattan subset.
cdata_man.shape

In [None]:
# Column names carried into the subset.
cdata_man.columns

In [None]:
# Missing-value count per column within the subset.
cdata_man.isna().sum()

In [None]:
# Remove the street-name and second-vehicle fields from the Manhattan subset.
cdata_man = cdata_man.drop(
    columns=[
        'ON STREET NAME',
        'CROSS STREET NAME',
        'CONTRIBUTING FACTOR VEHICLE 2',
        'VEHICLE TYPE CODE 2',
    ]
)

In [None]:
# Drop every row that still has a missing value in any remaining column (mutates cdata_man).
cdata_man.dropna(axis=0, inplace=True)

In [None]:
# Rows left after removing incomplete records.
cdata_man.shape

In [None]:
# Preview the cleaned subset.
cdata_man.head()

In [None]:
# Should be all zeros after the dropna above.
cdata_man.isna().sum()

In [None]:
# Collision locations drawn as a map: longitude on x and latitude on y. The axes were
# previously swapped, which transposes the geography.
# NOTE(review): this plots the full `cdata`, not the cleaned `cdata_man` -- confirm which is intended.
fig, ax = plt.subplots()
ax.scatter(cdata['LONGITUDE'], cdata['LATITUDE'])
ax.set(title='Collision locations', xlabel='LONGITUDE', ylabel='LATITUDE');