# Project 5: Identifying High Risk Areas from NYC Traffic Conditions
## *Template Notebook*

In this notebook:

* [Data Cleaning & Reduction](#topic-1)
* [Topic 2](#topic-2)

#### Import Libraries & Read in Data

In [1]:
## standard imports 
import pandas as pd 
import numpy as np
import re
## visualizations
import matplotlib.pyplot as plt
import seaborn as sns
## preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.dummy import DummyClassifier
## modeling
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import MultinomialNB
## trees
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor
## NLP
from sklearn.feature_extraction.text import CountVectorizer
## analysis
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, make_scorer, f1_score, mean_squared_error

## options
import sklearn
# Raise pandas' display limits so long listings, wide frames and long cell
# values are shown in full. Fully-qualified option names are used because
# abbreviated keys such as 'max_colwidth' are resolved by pattern matching
# and can break if a similarly named option is added later.
pd.set_option('display.max_rows', 4000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)

In [2]:
### read in data
# The whole CSV is loaded into memory in a single call (no chunking).
filename = '../data_large/nyc_speed_data_full.csv'
data = pd.read_csv(filepath_or_buffer=filename)

[NYC Real Time Traffic Data](https://data.cityofnewyork.us/Transportation/Real-Time-Traffic-Speed-Data/qkm5-nuaq)

In [18]:
# Let's scan the csv to see how dates are ordered
# df = pd.DataFrame()

# chunksize=10**6

# for chunk in pd.read_csv(filename, chunksize=chunksize):
#     dx = pd.DataFrame(chunk)
#     dx = dx['DATA_AS_OF']
#     print(dx.head(2))

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,ID,SPEED,TRAVEL_TIME,STATUS,DATA_AS_OF,LINK_ID,LINK_POINTS,ENCODED_POLY_LINE,ENCODED_POLY_LINE_LVLS,OWNER,TRANSCOM_ID,BOROUGH,LINK_NAME
0,169,57.16,160,0,06/06/2020 08:33:04 AM,4616355,"40.66639,-73.76248 40.66579,-73.75706 40.66574,-73.7563 40.66571,-73.7494 40.66542,-73.74244 40....",}sewFnveaMvB{`@HwCDcj@x@oj@SmCWoAw@aCc@eAgBiC_BaB}E}D_XyOiHeFeIwGgTeS_TwMsFsCWE,BBBBBBBBBBBBBBBBBBB,NYC_DOT_LIC,4616355,Queens,Belt Pkwy E 182nd St - Laurelton Pkwy N @ SSP
1,315,57.16,87,0,06/06/2020 08:33:04 AM,4616364,"40.73744001,-73.85188001 40.737015,-73.85373001 40.73673,-73.85543001 40.736526,-73.85631001 ...",_pswFfewaMtApJv@rIh@nDlFbXhH`]bJla@\\\\\\\\\\\\\\\\|@`FbHz\fBvG,BBBBBBBBBB,NYC_DOT_LIC,4616364,Queens,LIE W 108TH ST - 84TH ST
2,445,0.0,0,-101,06/06/2020 08:33:05 AM,4456494,"40.7262006,-74.01112 40.7252805,-74.01128 40.7218105,-74.011981 40.7195505,-74.012471 40.71654,-...",wiqwFnhvbMvD^xb@lFxQvCrb@hHlLjChHnBPgBh@gA\\\\\\\\\\\\\\\\|@WzANnDv@\\\\\\\\\\\\\\\\|Cl@\\\\\\\\...,BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB,NYC_DOT_LIC,4456494,Manhattan,West St S Spring St - BBT Manhattan Portal outbound
3,379,52.19,44,0,06/06/2020 08:33:05 AM,4763650,"40.60408,-74.052241 40.6036605,-74.05356 40.6033506,-74.05459 40.6029905,-74.055641 40.60213,-74...",onyvFni~bMrAfG\\\\\\\\\\\\\\\\|@lEfApEjDlOt@nDV`EBdDM~CKlD_@hFa@`C}Il\,BBBBBBBBBBBBB,NYC_DOT_LIC,4763650,Staten Island,SIE E VNB E FINGERBOARD ROAD - SI GANTRY LOWER LEVEL
4,410,52.19,89,0,06/06/2020 08:33:05 AM,4763656,"40.6039704,-74.052281 40.604651,-74.05021 40.60737,-74.04182 40.60795,-74.039971 40.608211,-74.0...",ymyvFvi~bMgC}K_Pms@sBqJs@{C,BBBBB,Verrazano-Narrows-Bridge,4763656,Brooklyn,VNB E SI GANTRY LOWER LEVEL - BROOLKYN GANTRY LOWER LEVEL


In [4]:
# Column dtypes, row count and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42532970 entries, 0 to 42532969
Data columns (total 13 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   ID                      int64  
 1   SPEED                   float64
 2   TRAVEL_TIME             int64  
 3   STATUS                  int64  
 4   DATA_AS_OF              object 
 5   LINK_ID                 int64  
 6   LINK_POINTS             object 
 7   ENCODED_POLY_LINE       object 
 8   ENCODED_POLY_LINE_LVLS  object 
 9   OWNER                   object 
 10  TRANSCOM_ID             int64  
 11  BOROUGH                 object 
 12  LINK_NAME               object 
dtypes: float64(1), int64(5), object(7)
memory usage: 4.1+ GB


## Data Cleaning & Reduction<a class="anchor" id="topic-1"></a>
<hr/>

Notes on data:

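* ID and LINK_ID are described as the same identifier in the data dictionary, so drop one of them. (I say drop ID, so that LINK_ID stays easy to associate with LINK_POINTS.) Note that in the sample rows above the two columns hold different values (e.g. ID 169 vs. LINK_ID 4616355), while TRANSCOM_ID matches LINK_ID.
* It also says both STATUS and TRANSCOM_ID are not useful (artifacts) -- drop these.
* TRAVEL_TIME - units are unclear -- drop.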


In [5]:
# Columns removed before analysis.
drop_cols = ['ID', 'TRAVEL_TIME', 'STATUS',
       'OWNER', 'TRANSCOM_ID', 'ENCODED_POLY_LINE','ENCODED_POLY_LINE_LVLS']
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of a KeyError. `columns=` already
# selects the axis, so no separate `axis` argument is needed.
data = data.drop(columns=drop_cols, errors='ignore')

In [6]:
# Check which columns remain after the drop.
data.head()

Unnamed: 0,SPEED,DATA_AS_OF,LINK_ID,LINK_POINTS,BOROUGH,LINK_NAME
0,57.16,06/06/2020 08:33:04 AM,4616355,"40.66639,-73.76248 40.66579,-73.75706 40.66574,-73.7563 40.66571,-73.7494 40.66542,-73.74244 40....",Queens,Belt Pkwy E 182nd St - Laurelton Pkwy N @ SSP
1,57.16,06/06/2020 08:33:04 AM,4616364,"40.73744001,-73.85188001 40.737015,-73.85373001 40.73673,-73.85543001 40.736526,-73.85631001 ...",Queens,LIE W 108TH ST - 84TH ST
2,0.0,06/06/2020 08:33:05 AM,4456494,"40.7262006,-74.01112 40.7252805,-74.01128 40.7218105,-74.011981 40.7195505,-74.012471 40.71654,-...",Manhattan,West St S Spring St - BBT Manhattan Portal outbound
3,52.19,06/06/2020 08:33:05 AM,4763650,"40.60408,-74.052241 40.6036605,-74.05356 40.6033506,-74.05459 40.6029905,-74.055641 40.60213,-74...",Staten Island,SIE E VNB E FINGERBOARD ROAD - SI GANTRY LOWER LEVEL
4,52.19,06/06/2020 08:33:05 AM,4763656,"40.6039704,-74.052281 40.604651,-74.05021 40.60737,-74.04182 40.60795,-74.039971 40.608211,-74.0...",Brooklyn,VNB E SI GANTRY LOWER LEVEL - BROOLKYN GANTRY LOWER LEVEL


In [7]:
### rename columns
# Raw column name -> shorter name used in the rest of the notebook.
rename_cols = {'DATA_AS_OF': 'DATE'}

# Relabel the column axis on the existing frame, then preview the result.
data.rename(rename_cols, axis='columns', inplace=True)
data.head()

Unnamed: 0,SPEED,DATE,LINK_ID,LINK_POINTS,BOROUGH,LINK_NAME
0,57.16,06/06/2020 08:33:04 AM,4616355,"40.66639,-73.76248 40.66579,-73.75706 40.66574,-73.7563 40.66571,-73.7494 40.66542,-73.74244 40....",Queens,Belt Pkwy E 182nd St - Laurelton Pkwy N @ SSP
1,57.16,06/06/2020 08:33:04 AM,4616364,"40.73744001,-73.85188001 40.737015,-73.85373001 40.73673,-73.85543001 40.736526,-73.85631001 ...",Queens,LIE W 108TH ST - 84TH ST
2,0.0,06/06/2020 08:33:05 AM,4456494,"40.7262006,-74.01112 40.7252805,-74.01128 40.7218105,-74.011981 40.7195505,-74.012471 40.71654,-...",Manhattan,West St S Spring St - BBT Manhattan Portal outbound
3,52.19,06/06/2020 08:33:05 AM,4763650,"40.60408,-74.052241 40.6036605,-74.05356 40.6033506,-74.05459 40.6029905,-74.055641 40.60213,-74...",Staten Island,SIE E VNB E FINGERBOARD ROAD - SI GANTRY LOWER LEVEL
4,52.19,06/06/2020 08:33:05 AM,4763656,"40.6039704,-74.052281 40.604651,-74.05021 40.60737,-74.04182 40.60795,-74.039971 40.608211,-74.0...",Brooklyn,VNB E SI GANTRY LOWER LEVEL - BROOLKYN GANTRY LOWER LEVEL


In [9]:
# Convert the DATE strings to datetime64 values.
# NOTE(review): the preview shows values like '06/06/2020 08:33:04 AM'; if every
# row follows that layout, passing an explicit `format` would make parsing
# stricter -- confirm against the full file before adding it.
data['DATE'] = data['DATE'].pipe(pd.to_datetime)

In [12]:
# Preview the frame after the DATE conversion.
data.head()

Unnamed: 0,SPEED,DATE,LINK_ID,LINK_POINTS,BOROUGH,LINK_NAME
0,57.16,2020-06-06 08:33:04,4616355,"40.66639,-73.76248 40.66579,-73.75706 40.66574,-73.7563 40.66571,-73.7494 40.66542,-73.74244 40....",Queens,Belt Pkwy E 182nd St - Laurelton Pkwy N @ SSP
1,57.16,2020-06-06 08:33:04,4616364,"40.73744001,-73.85188001 40.737015,-73.85373001 40.73673,-73.85543001 40.736526,-73.85631001 ...",Queens,LIE W 108TH ST - 84TH ST
2,0.0,2020-06-06 08:33:05,4456494,"40.7262006,-74.01112 40.7252805,-74.01128 40.7218105,-74.011981 40.7195505,-74.012471 40.71654,-...",Manhattan,West St S Spring St - BBT Manhattan Portal outbound
3,52.19,2020-06-06 08:33:05,4763650,"40.60408,-74.052241 40.6036605,-74.05356 40.6033506,-74.05459 40.6029905,-74.055641 40.60213,-74...",Staten Island,SIE E VNB E FINGERBOARD ROAD - SI GANTRY LOWER LEVEL
4,52.19,2020-06-06 08:33:05,4763656,"40.6039704,-74.052281 40.604651,-74.05021 40.60737,-74.04182 40.60795,-74.039971 40.608211,-74.0...",Brooklyn,VNB E SI GANTRY LOWER LEVEL - BROOLKYN GANTRY LOWER LEVEL


In [11]:
# Re-check dtypes and memory use after dropping columns and parsing DATE.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42532970 entries, 0 to 42532969
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   SPEED        float64       
 1   DATE         datetime64[ns]
 2   LINK_ID      int64         
 3   LINK_POINTS  object        
 4   BOROUGH      object        
 5   LINK_NAME    object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 1.9+ GB


In [None]:
# data.sort_values(by = ['DATE'], axis=1)


In [13]:
# Persist the cleaned full frame to disk.
pd.to_pickle(data, '../data_large/speed_data.pickle')

In [None]:
### reduce down to years 2018-2019
# DataFrame.sort() does not exist in current pandas and raises AttributeError;
# sort_values() is its replacement. This returns a new, chronologically ordered
# frame for inspection and leaves `data` itself unchanged.
data.sort_values(by='DATE')

In [17]:
# Re-display the frame's structure before slicing it by year.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42532970 entries, 0 to 42532969
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   SPEED        float64       
 1   DATE         datetime64[ns]
 2   LINK_ID      int64         
 3   LINK_POINTS  object        
 4   BOROUGH      object        
 5   LINK_NAME    object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 1.9+ GB


In [14]:
# Narrow the frame to the three columns used for the per-link speed analysis.
speed_columns = ['SPEED', 'DATE', 'LINK_ID']
speeds = data.loc[:, speed_columns]

In [26]:
# Keep readings from calendar years 2018 and 2019 only.
# The window is half-open, [2018-01-01, 2020-01-01): a date-only string compares
# as midnight, so a lower bound of `> '2017-12-31'` would also admit nearly all
# of 31 Dec 2017.
in_2018_2019 = (speeds['DATE'] >= '2018-01-01') & (speeds['DATE'] < '2020-01-01')
speeds_18_19 = speeds[in_2018_2019]

In [27]:
# Persist the combined 2018-2019 slice.
pd.to_pickle(speeds_18_19, '../data_large/speed_data_2018-2019.pickle')

In [54]:
# One frame per calendar year, using half-open [1 Jan, next 1 Jan) windows.
# `>=` on the lower bound keeps readings stamped exactly at midnight on
# 1 January, which a strict `>` would silently drop.
speeds_18 = speeds[(speeds['DATE'] >= '2018-01-01') & (speeds['DATE'] < '2019-01-01')]
speeds_19 = speeds[(speeds['DATE'] >= '2019-01-01') & (speeds['DATE'] < '2020-01-01')]

In [50]:
# (rows, columns) of the 2018 slice.
speeds_18.shape

(13898987, 3)

In [49]:
# (rows, columns) of the 2019 slice.
speeds_19.shape

(13010735, 3)

In [55]:
# Persist each yearly slice to its own pickle file.
pd.to_pickle(speeds_18, '../data_large/speed_data_2018.pickle')
pd.to_pickle(speeds_19, '../data_large/speed_data_2019.pickle')

## Topic 2 <a class="anchor" id="topic-2"></a>
<hr/>

In [56]:
# Reload the yearly slices from disk (2018 first, then 2019).
speed_data_18, speed_data_19 = map(
    pd.read_pickle,
    ('../data_large/speed_data_2018.pickle', '../data_large/speed_data_2019.pickle'),
)

In [59]:
# Structure and memory footprint of the reloaded 2018 frame.
speed_data_18.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13860107 entries, 4119727 to 24546348
Data columns (total 3 columns):
 #   Column   Dtype         
---  ------   -----         
 0   SPEED    float64       
 1   DATE     datetime64[ns]
 2   LINK_ID  int64         
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 423.0 MB


In [57]:
# Show the 2018 readings ordered from the highest LINK_ID to the lowest.
speed_data_18.sort_values('LINK_ID', ascending=False)

Unnamed: 0,SPEED,DATE,LINK_ID
16784299,44.11,2018-06-19 03:38:03,4763657
21662463,43.49,2018-10-20 06:13:39,4763657
11214302,47.84,2018-01-28 00:18:03,4763657
19948520,45.36,2018-09-06 17:38:22,4763657
14879977,45.36,2018-04-30 12:08:02,4763657
...,...,...,...
15758170,9.32,2018-05-24 09:23:03,4329472
17022422,38.52,2018-06-25 00:48:04,4329472
14592812,12.42,2018-04-23 09:38:03,4329472
12902384,13.04,2018-03-11 17:13:03,4329472


In [60]:
### filter data by some link ID

# Link to inspect.
link_id = 4763657

# Boolean mask of the 2018 readings that belong to that link.
is_selected_link = speed_data_18['LINK_ID'].eq(link_id)
link_data_18 = speed_data_18.loc[is_selected_link]

In [61]:
# Preview the first 20 readings for the selected link.
link_data_18.head(20)

Unnamed: 0,SPEED,DATE,LINK_ID
10149500,44.73,2018-01-01 00:02:10,4763657
10149637,44.73,2018-01-01 00:07:09,4763657
10149836,44.11,2018-01-01 00:12:12,4763657
10149902,44.73,2018-01-01 00:17:09,4763657
10150047,44.73,2018-01-01 00:22:11,4763657
10150193,45.36,2018-01-01 00:27:11,4763657
10150331,44.73,2018-01-01 00:32:12,4763657
10150442,44.11,2018-01-01 00:37:11,4763657
10150559,43.49,2018-01-01 00:42:13,4763657
10150774,42.25,2018-01-01 00:47:13,4763657


In [62]:
# Average recorded speed for the selected link over 2018 (all rows included).
link_data_18['SPEED'].mean()

43.18222808426166

In [63]:
# Summary statistics of the selected link's 2018 speeds.
# NOTE(review): SPEED == 0.0 readings are included here; the raw preview shows a
# 0.0 speed on a row with STATUS -101, so zeros may be non-readings rather than
# stopped traffic -- confirm before relying on the mean/std.
link_data_18['SPEED'].describe()

count    99700.000000
mean        43.182228
std          9.862680
min          0.000000
25%         43.490000
50%         45.360000
75%         47.220000
max         69.590000
Name: SPEED, dtype: float64