In [0]:
import pandas as pd
import numpy as np
url = 'https://raw.githubusercontent.com/safe-routes/Build-SafeRoutes-DS/master/CSV%20Files/safe-routes-model.csv'
ac = pd.read_csv(url, error_bad_lines = False)

In [0]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [115]:
!pip install category_encoders



In [116]:
ac.head()

Unnamed: 0,TWAY_ID,TWAY_ID2,COUNTY,LATITUDE,LONGITUD,DATE,DAY_WEEK,HOUR,LGT_COND,WEATHER,WRK_ZONE,FATALS,PEDS,MAN_COLL,FUNC_SYS,TYP_INT
0,SR-51,NO SECOND STREET,SAN MATEO,32.618239,-85.371383,2015-01-13,TUESDAY,17,NIGHT,CLEAR,0,1,0,ANGLED,ARTERY,NOT AN INTERSECTION
1,CR-40,NO SECOND STREET,ALAMEDA,32.524344,-86.672119,2015-01-16,FRIDAY,19,NIGHT,CLEAR,0,1,0,NOT APPLICABLE,COLLECTOR,NOT AN INTERSECTION
2,SR-14,NO SECOND STREET,ALAMEDA,32.430664,-86.517917,2015-01-10,SATURDAY,0,NIGHT,CLEAR,0,1,0,NOT APPLICABLE,ARTERY,NOT AN INTERSECTION
3,SR-147,NO SECOND STREET,SAN MATEO,32.615806,-85.507961,2015-02-11,WEDNESDAY,11,DAY,CLEAR,0,1,0,HEAD ON,ARTERY,NOT AN INTERSECTION
4,US-SR 1,NO SECOND STREET,SAN MATEO,32.679275,-85.370181,2015-02-23,MONDAY,18,NIGHT,CLEAR,0,1,1,NOT APPLICABLE,ARTERY,NOT AN INTERSECTION


In [0]:
ac['CRASH'] = np.ones(len(ac)).astype(int)

In [118]:
ac.head(1)

Unnamed: 0,TWAY_ID,TWAY_ID2,COUNTY,LATITUDE,LONGITUD,DATE,DAY_WEEK,HOUR,LGT_COND,WEATHER,WRK_ZONE,FATALS,PEDS,MAN_COLL,FUNC_SYS,TYP_INT,CRASH
0,SR-51,NO SECOND STREET,SAN MATEO,32.618239,-85.371383,2015-01-13,TUESDAY,17,NIGHT,CLEAR,0,1,0,ANGLED,ARTERY,NOT AN INTERSECTION,1


In [119]:
ac.shape

(6091, 17)

In [120]:
import random
from datetime import datetime, timedelta
def newdate():
    start = datetime(2015, 1, 1)
    end = datetime(2017, 12, 31)
    DATE = start + (end - start) * random.random()
    DATE = pd.to_datetime(DATE)
    return DATE
DATE = newdate()
DATE

Timestamp('2015-07-30 18:10:50.061385')

In [0]:
# Making copy of 'ac' as 'df'
df = ac.copy()

## Binning & Grouping First Round

In [122]:
(df.DATE).dtypes

dtype('O')

In [0]:
df['DATE'] =  pd.to_datetime(df['DATE'], format="%Y/%m/%d")
df['YEAR'] = df['DATE'].dt.year
df['MONTH'] = df['DATE'].dt.month
df['DAY'] = df['DATE'].dt.day
df = df.drop('DATE', axis=1)

In [0]:
bins = [-1, 4, 8, 12, 16, 20, 23]
labels = [1,2,3,4,5,6]
df['BINNED_HOUR'] = pd.cut(df['HOUR'], bins=bins, labels=labels)

In [125]:
# Number of rows with 'HOUR' > 23
len(df[df.HOUR > 23])

31

In [0]:
# Dropping all rows with hours above 23 (some were found to be 99)
df = df.drop(df[df.HOUR > 23].index)

In [127]:
print(df.shape)
df.head()

(6060, 20)


Unnamed: 0,TWAY_ID,TWAY_ID2,COUNTY,LATITUDE,LONGITUD,DAY_WEEK,HOUR,LGT_COND,WEATHER,WRK_ZONE,FATALS,PEDS,MAN_COLL,FUNC_SYS,TYP_INT,CRASH,YEAR,MONTH,DAY,BINNED_HOUR
0,SR-51,NO SECOND STREET,SAN MATEO,32.618239,-85.371383,TUESDAY,17,NIGHT,CLEAR,0,1,0,ANGLED,ARTERY,NOT AN INTERSECTION,1,2015,1,13,5
1,CR-40,NO SECOND STREET,ALAMEDA,32.524344,-86.672119,FRIDAY,19,NIGHT,CLEAR,0,1,0,NOT APPLICABLE,COLLECTOR,NOT AN INTERSECTION,1,2015,1,16,5
2,SR-14,NO SECOND STREET,ALAMEDA,32.430664,-86.517917,SATURDAY,0,NIGHT,CLEAR,0,1,0,NOT APPLICABLE,ARTERY,NOT AN INTERSECTION,1,2015,1,10,1
3,SR-147,NO SECOND STREET,SAN MATEO,32.615806,-85.507961,WEDNESDAY,11,DAY,CLEAR,0,1,0,HEAD ON,ARTERY,NOT AN INTERSECTION,1,2015,2,11,3
4,US-SR 1,NO SECOND STREET,SAN MATEO,32.679275,-85.370181,MONDAY,18,NIGHT,CLEAR,0,1,1,NOT APPLICABLE,ARTERY,NOT AN INTERSECTION,1,2015,2,23,5


In [128]:
df.WEATHER.value_counts()

CLEAR         5057
RAIN           425
UNREPORTED     423
SNOW            74
FOG             64
CROSSWIND        9
HAIL             8
Name: WEATHER, dtype: int64

In [0]:
# sum_df = df.groupby(['BINNED_HOUR']).agg({'CRASH': 'sum'})

In [0]:
# sum_df = sum_df.reset_index()

In [0]:
# sum_df

In [0]:
sum_county_df = df.groupby(['COUNTY','BINNED_HOUR']).agg({'CRASH': 'sum'}).reset_index()

In [133]:
sum_county_df.head()

Unnamed: 0,COUNTY,BINNED_HOUR,CRASH
0,ALAMEDA,1,397
1,ALAMEDA,2,278
2,ALAMEDA,3,276
3,ALAMEDA,4,350
4,ALAMEDA,5,519


In [0]:
def mean_crash(x):
    result = x.CRASH.sum() / 3
    return result

In [0]:
sum_big_df = df.groupby(['COUNTY','DAY_WEEK','BINNED_HOUR']).apply(mean_crash).reset_index()

In [136]:
sum_big_df.head()

Unnamed: 0,COUNTY,DAY_WEEK,BINNED_HOUR,0
0,ALAMEDA,FRIDAY,1,14.666667
1,ALAMEDA,FRIDAY,2,11.666667
2,ALAMEDA,FRIDAY,3,11.666667
3,ALAMEDA,FRIDAY,4,19.333333
4,ALAMEDA,FRIDAY,5,31.333333


In [137]:
sum_big_df.rename(columns={0: 'AVG_CRASH'}, inplace=True)
sum_big_df.head()

Unnamed: 0,COUNTY,DAY_WEEK,BINNED_HOUR,AVG_CRASH
0,ALAMEDA,FRIDAY,1,14.666667
1,ALAMEDA,FRIDAY,2,11.666667
2,ALAMEDA,FRIDAY,3,11.666667
3,ALAMEDA,FRIDAY,4,19.333333
4,ALAMEDA,FRIDAY,5,31.333333


In [0]:
# import category_encoders as ce

# encoder = ce.OneHotEncoder(cols=['COUNTY', 'DAY_WEEK'])
# sum_big_df1 = encoder.fit_transform(sum_big_df)
# sum_big_df1

In [0]:
# sum_big_df.head(50)

In [0]:
# sum_big_df.COUNTY.unique()

In [141]:
sum_big_df.shape

(210, 4)

In [157]:
sum_big_df.head(25)

Unnamed: 0,COUNTY,DAY_WEEK,BINNED_HOUR,AVG_CRASH
0,ALAMEDA,FRIDAY,1,14.666667
1,ALAMEDA,FRIDAY,2,11.666667
2,ALAMEDA,FRIDAY,3,11.666667
3,ALAMEDA,FRIDAY,4,19.333333
4,ALAMEDA,FRIDAY,5,31.333333
5,ALAMEDA,FRIDAY,6,24.0
6,ALAMEDA,MONDAY,1,13.666667
7,ALAMEDA,MONDAY,2,10.333333
8,ALAMEDA,MONDAY,3,17.333333
9,ALAMEDA,MONDAY,4,14.333333


In [0]:
#sum_big_df.to_csv('data_for_model.csv', index=False)

### Train-Test Split

In [0]:
def split_data(data):
    
    from sklearn.model_selection import train_test_split

    X = data.drop('AVG_CRASH', axis=1)
    y = data.AVG_CRASH

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, test_size=0.25)
    
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values
    
    return X_train, X_test, y_train, y_test

In [0]:
X_train, X_test, y_train, y_test = split_data(sum_big_df)

In [0]:
# y_train

In [0]:
# y_test

In [0]:
# X_train

In [54]:
len(X_train), len(y_train), len(X_test), len(y_test)

(157, 157, 53, 53)

### AUTOML

In [0]:
!pip install tpot

Collecting tpot
[?25l  Downloading https://files.pythonhosted.org/packages/36/6f/9a400b0a7d32d13b1b9a565de481d10163c8b39d1bdf63ae0219922a24fb/TPOT-0.10.0-py3-none-any.whl (73kB)
[K    100% |████████████████████████████████| 81kB 3.1MB/s 
Collecting stopit>=1.1.1 (from tpot)
  Downloading https://files.pythonhosted.org/packages/35/58/e8bb0b0fb05baf07bbac1450c447d753da65f9701f551dca79823ce15d50/stopit-1.1.2.tar.gz
Collecting update-checker>=0.16 (from tpot)
  Downloading https://files.pythonhosted.org/packages/17/c9/ab11855af164d03be0ff4fddd4c46a5bd44799a9ecc1770e01a669c21168/update_checker-0.16-py2.py3-none-any.whl
Collecting deap>=1.0 (from tpot)
[?25l  Downloading https://files.pythonhosted.org/packages/af/29/e7f2ecbe02997b16a768baed076f5fc4781d7057cd5d9adf7c94027845ba/deap-1.2.2.tar.gz (936kB)
[K    100% |████████████████████████████████| 942kB 11.8MB/s 
Building wheels for collected packages: stopit, deap
  Building wheel for stopit (setup.py) ... [?25ldone
[?25h  Stored in di

In [0]:
from tpot import TPOTRegressor

In [0]:
tpot = TPOTRegressor(generations=10, population_size=60, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GradientBoostingRegressor(RobustScaler(input_matrix), alpha=0.8, learning_rate=0.5, loss=ls, max_depth=2, max_features=0.9500000000000001, min_samples_leaf=6, min_samples_split=11, n_estimators=100, subsample=0.6000000000000001)
-7.08126114268589


In [0]:
tpot.predict(X_test)

array([10.99008984,  7.16405551, 10.77415507, 10.75293879, 14.0205755 ,
        3.59555041,  8.08490435, 10.92030026,  5.59505434, 20.38770751,
       12.2863865 , 24.5094146 , 23.4482324 ,  3.73782729,  3.6774826 ,
        4.21352639,  8.70684031,  5.42154055,  1.64424016,  6.3816128 ,
       11.43963064,  4.56417311, 17.41330166, 16.34572865,  5.60316523,
       12.93784682,  7.35219316,  6.43293482,  6.47091056,  6.37599663,
        7.84852294, 11.8670819 , 15.25980035, 11.42852984,  9.17549615,
       10.01410779,  5.8354363 ,  5.35459627,  4.45201364, 10.44919007,
        6.61990903,  6.78490444,  3.22207276,  6.43192611, 11.86897254,
        2.74718774,  6.64721434,  9.74579784,  7.21683326, 11.82778079,
       -0.49824436,  6.90163356,  8.51071578])

In [0]:
!pip install joblib



### Features from String

In [0]:
# s = 'http://crashpredictr-env.jjrxtdfaz3.us-east-2.elasticbeanstalk.com/predict/ALAMEDA?weather=FOG&month=August&day=MONDAY&lgt=DAY&isWorkZone=1'
s = 'MERIN?weather=FOG&month=May&day=MONDAY&lgt=DAY&isWorkZone=1'

In [84]:
# Getting COUNTY
county_str = s.split('?')[0]
county_str

'MERIN'

In [85]:
# Getting DAY
day_str = s[s.find("&day=")+5:s.find("&lgt=")]
day_str

'MONDAY'

In [86]:
# Getting WEATHER
weather_str = s[s.find("?weather=")+9:s.find("&month=")]
weather_str

'FOG'

In [0]:
# Getting MONTH
month_str = s[s.find("&month=")+7:s.find("&day=")]
month_str

'May'

In [0]:
# Getting WRK_ZONE
workzone_str = s[s.find("&isWorkZone=")+12:]
workzone_str

'1'

### Linear Regression

In [0]:
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import category_encoders as ce

In [153]:
# Pipeline with encoder and regression
pipe = make_pipeline(ce.OneHotEncoder(use_cat_names=True),
                    LinearRegression())

param_grid = {
    'linearregression__n_jobs': [-1]
}

# Fit on the train set, with grid search cross-validation
gs = GridSearchCV(pipe, param_grid=param_grid, cv=2, 
                  scoring='neg_mean_absolute_error', 
                  verbose=1)

gs.fit(X_train, y_train)
validation_score = gs.best_score_

print()
print('Cross-Validation Score:', -validation_score)

print()
print('Best estimator:', gs.best_estimator_)
print()

Fitting 2 folds for each of 1 candidates, totalling 2 fits

Cross-Validation Score: 2.7845293698830407

Best estimator: Pipeline(memory=None,
     steps=[('onehotencoder', OneHotEncoder(cols=[0, 1], drop_invariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, use_cat_names=True, verbose=0)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False))])



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s finished


In [154]:
from sklearn.metrics import mean_absolute_error

y_pred = gs.predict(X_test) 

# Compare predictions to y_test labels
# comparing the prediction above(y_pred) with y_test
test_score = mean_absolute_error(y_test, y_pred)
print('Test Score:', test_score)

Test Score: 2.1835592928436816


In [0]:
# from joblib import dump, load
# dump(pipe, 'pipeline.joblib')

In [0]:
# dump(gs.best_estimator_, 'gridsearch.joblib')

['gridsearch.joblib']

In [0]:
# del gs
# gs_test = load('gridsearch.joblib')

In [0]:
# y_pred

In [162]:
# CREATING A SAMPLE INPUT FOR PREDICTION
sample_input = (pd.DataFrame([('ALAMEDA', 'SUNDAY', 6)], columns=['COLUMNS', 'DAY_WEEK', 'BINNED_HOUR'])).values
sample_input

array([['ALAMEDA', 'SUNDAY', 6]], dtype=object)

In [163]:
# Prediction returned with 'sample_input'
y_sample_pred = gs.predict(sample_input)
y_sample_pred

array([17.93236985])

#### Testing How to Current Hour **PST**

'UTC'

'US/Pacific'

In [0]:
datetime.datetime.today().astimezone(pacific).hour

20

In [0]:
import datetime
import pytz
from pytz import timezone

utc = pytz.utc
utc.zone
pacific = timezone('US/Pacific')

time = datetime.datetime.today().astimezone(pacific)

hour = (time.hour)

weekday = time.isoweekday()

hour

20

In [0]:
counties_list = ['ALAMEDA', 'SAN FRANCISCO', 'MARTIN', 'ALAMEDA']

In [0]:
tuples_list = []
    
for county in counties_list:
    tuples_list.append((county, weekday, hour))

input_array = (pd.DataFrame(tuples_list, columns=['COLUMNS', 'DAY_WEEK', 'BINNED_HOUR'])).values

In [0]:
input_array

array([['ALAMEDA', 4, -4],
       ['SAN FRANCISCO', 4, -4],
       ['MARTIN', 4, -4],
       ['ALAMEDA', 4, -4]], dtype=object)

#### Functions to get input for model

In [0]:
# FUNCTION TO GET CURRENT DAY AND HOUR -- TO INCLUDE IN .py SCRIPT
def day_hour():
    # today's time
    time = datetime.datetime.today()
    # PST hour
    hour = (time.hour) -7
    
    if hour <= 4:
        hour = 1
    elif 4 > hour <= 8:
        hour = 2
    elif 8 > hour <= 12:
        hour = 3
    elif 12 > hour <= 16:
        hour = 4
    elif 16 > hour <= 20:
        hour = 5
    else:
        hour = 6
        
    # Day of the week
    weekday = time.isoweekday()
    d = {1: 'MONDAY', 2: 'TUESDAY', 3: 'WEDNESDAY', 4: 'THURSDAY', 
        5: 'FRIDAY', 6: 'SATURDAY', 7: 'SUNDAY'}
    for key, value in d.items():
        if key == weekday:
            weekday = value
            
    return weekday, hour


# FUNCTION TO FORMAT COUNTIES GIVEN TO US AND GIVE US INPUT FOR THE MODEL
# This is assuming we'll get a list of counties ('counties_list')
def format_input(counties_list):
    tuples_list = []
    weekday, hour = day_hour()
    
    for county in counties_list:
        tuples_list.append((county, weekday, hour))
        
    input_array = (pd.DataFrame(tuples_list, columns=['COLUMNS', 'DAY_WEEK', 'BINNED_HOUR'])).values
    
    return input_array

In [43]:
month_name = 'August'
import calendar
month_dict = dict((v,k) for k,v in enumerate(calendar.month_name))
for key, value in month_dict.items():
    if key == month_name:
        month_num = value
print(month_num)

8


In [0]:
counties = ['ALAMEDA', 'SAN FRANCISCO', 'MARIN', 'SANTA CLARA']
multi_input = format_input(counties)
multi_input

array([['ALAMEDA', 'THURSDAY', 1],
       ['SAN FRANCISCO', 'THURSDAY', 1],
       ['MARIN', 'THURSDAY', 1],
       ['SANTA CLARA', 'THURSDAY', 1]], dtype=object)

## Binning & Grouping by COUNTY, WEATHER, MONTH, DAY_WEEK, WRK_ZONE

In [159]:
grouped_df = df.groupby(['COUNTY', 'WEATHER', 'MONTH', 'DAY_WEEK','WRK_ZONE', 'BINNED_HOUR']).apply(mean_crash).reset_index()
grouped_df.rename(columns={0: 'AVG_CRASH'}, inplace=True)
grouped_df.head(25)

Unnamed: 0,COUNTY,WEATHER,MONTH,DAY_WEEK,WRK_ZONE,BINNED_HOUR,AVG_CRASH
0,ALAMEDA,CLEAR,1,FRIDAY,0,1,1.333333
1,ALAMEDA,CLEAR,1,FRIDAY,0,2,1.333333
2,ALAMEDA,CLEAR,1,FRIDAY,0,3,1.666667
3,ALAMEDA,CLEAR,1,FRIDAY,0,5,1.0
4,ALAMEDA,CLEAR,1,FRIDAY,0,6,1.0
5,ALAMEDA,CLEAR,1,MONDAY,0,1,0.666667
6,ALAMEDA,CLEAR,1,MONDAY,0,4,1.0
7,ALAMEDA,CLEAR,1,MONDAY,0,5,0.666667
8,ALAMEDA,CLEAR,1,MONDAY,0,6,1.333333
9,ALAMEDA,CLEAR,1,SATURDAY,0,1,1.666667


In [91]:
grouped_df.isnull().sum().sum()

0

In [45]:
grouped_df.shape

(2992, 7)

### Train, Test Split  for `grouped_df`

In [0]:
X_train1, X_test1, y_train1, y_test1 = split_data(grouped_df)

In [48]:
len(X_train1), len(y_train1), len(X_test1), len(y_test1)

(2244, 2244, 748, 748)

In [0]:
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import category_encoders as ce

In [50]:
# Pipeline with encoder and regression
pipe1 = make_pipeline(ce.OneHotEncoder(use_cat_names=True),
                    LinearRegression())

param_grid1 = {
    'linearregression__n_jobs': [-1]
}

# Fit on the train set, with grid search cross-validation
gs1 = GridSearchCV(pipe1, param_grid=param_grid1, cv=3, 
                  scoring='neg_mean_absolute_error', 
                  verbose=1)

gs1.fit(X_train1, y_train1)
validation_score = gs1.best_score_

print()
print('Cross-Validation Score:', -validation_score)

print()
print('Best estimator:', gs1.best_estimator_)
print()

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.



Cross-Validation Score: 0.30092716785966017

Best estimator: Pipeline(memory=None,
     steps=[('onehotencoder', OneHotEncoder(cols=[0, 1, 3], drop_invariant=False, handle_unknown='impute',
       impute_missing=True, return_df=True, use_cat_names=True, verbose=0)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False))])



[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s finished


In [51]:
from sklearn.metrics import mean_absolute_error

y_pred1 = gs1.predict(X_test1) 

# Compare predictions to y_test labels
# comparing the prediction above(y_pred) with y_test
test_score = mean_absolute_error(y_test1, y_pred1)
print('Test Score:', test_score)

Test Score: 0.30758767921976604


In [110]:
# CREATING A SAMPLE INPUT FOR PREDICTION
sample_input = (pd.DataFrame([('ALAMEDA', 'CLEAR', 8, 'SATURDAY', 0, 6)], columns=['COUNTY', 'WEATHER', 'MONTH', 'DAY_WEEK', 'WRK_ZONE', 'BINNED_HOUR'])).values
#sample_input = sample_input.values
sample_input

array([['ALAMEDA', 'CLEAR', 8, 'SATURDAY', 0, 6]], dtype=object)

In [112]:
# Prediction returned with 'sample_input'
y_pred_sample = gs1.predict(sample_input)

if y_pred_sample[0] < 0:
    y_pred_sample = 0
y_pred_sample

array([1.19429152])

In [66]:
# from joblib import dump, load
# dump(gs1.best_estimator_, 'gridsearch_v3.joblib')

['gridsearch_v3.joblib']

#### Saving GridSearch `gs1` with `joblib`

In [0]:
# dump(gs1.best_estimator_, 'gridsearch.joblib')

['gridsearch.joblib']

## Neural Network

In [58]:
X_train.shape, y_train.shape

((157, 3), (157,))

In [63]:
df.head(2)

Unnamed: 0,TWAY_ID,TWAY_ID2,COUNTY,LATITUDE,LONGITUD,DAY_WEEK,HOUR,LGT_COND,WEATHER,WRK_ZONE,FATALS,PEDS,MAN_COLL,FUNC_SYS,TYP_INT,CRASH,YEAR,MONTH,DAY,BINNED_HOUR
0,SR-51,NO SECOND STREET,SAN MATEO,32.618239,-85.371383,TUESDAY,17,NIGHT,CLEAR,0,1,0,ANGLED,ARTERY,NOT AN INTERSECTION,1,2015,1,13,5
1,CR-40,NO SECOND STREET,ALAMEDA,32.524344,-86.672119,FRIDAY,19,NIGHT,CLEAR,0,1,0,NOT APPLICABLE,COLLECTOR,NOT AN INTERSECTION,1,2015,1,16,5


In [0]:
def splits(data):
    
    from sklearn.model_selection import train_test_split

    X = data.drop('AVG_CRASH', axis=1)
    y = data.AVG_CRASH

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, test_size=0.25)
    
    return X_train, X_test, y_train, y_test

In [79]:
X_train2, X_test2, y_train2, y_test2 = splits(grouped_df)

X_train2.shape, X_test2.shape, y_train2.shape, y_test2.shape

((852, 5), (284, 5), (852,), (284,))

In [0]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error # mean_absolute_error <-- same metric as Linear Regression above
from sklearn.model_selection import StratifiedKFold, KFold
from keras.wrappers.scikit_learn import KerasRegressor

In [80]:
# Instantiating a Keras model
inputs = X_train2.shape[1]
epochs = 100
batch_size = 10

np.random.seed(42)

# define base model
#def baseline_model():
model = Sequential()
# Adding layers to model
# 'input_dim': only needed in first layer!
model.add(Dense(15, input_dim=inputs, activation='relu'))
model.add(Dropout(0.1))
# model.add(Dense(13, activation='relu'))
# model.add(Dropout(0.1))
model.add(Dense(1, activation='linear'))

model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 15)                90        
_________________________________________________________________
dropout_4 (Dropout)          (None, 15)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 16        
Total params: 106
Trainable params: 106
Non-trainable params: 0
_________________________________________________________________


In [81]:
history = model.fit(X_train2, y_train2, epochs=200, validation_split=.2)

Instructions for updating:
Use tf.cast instead.
Train on 681 samples, validate on 171 samples
Epoch 1/200


ValueError: ignored

In [0]:
y_pred = model.predict(X_test)
MAE = mean_absolute_error(y_test, y_pred)

print(f'MAE from NN: {MAE:.3f}')

MAE from NN: 3.043


### 1. Optimize batch size & Adding Epochs

In [0]:
%%time
# define the grid search parameters
batch_size = [10, 20, 40, 60]
epochs = [100]
param_grid = dict(batch_size=batch_size, epochs=epochs)

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)

def create_model():
    model = Sequential()
    model.add(Dense(15, input_dim=15, activation='relu'))
    model.add(Dropout(0.1))
#     model.add(Dense(13, activation='relu'))
#     model.add(Dropout(0.1))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_absolute_error', optimizer='adam',
                  metrics=['mean_absolute_error'])
    return model
    
# create model
model = KerasRegressor(build_fn=create_model, verbose=2)

# define 5-fold cross validation test harness
kfold = KFold(n_splits=3, shuffle=True, random_state=seed)

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, 
                    n_jobs=1, cv=kfold)

# Fit
grid_result2 = grid.fit(X, y)

Epoch 1/100
 - 0s - loss: 10.6389 - mean_absolute_error: 10.6389
Epoch 2/100
 - 0s - loss: 10.2554 - mean_absolute_error: 10.2554
Epoch 3/100
 - 0s - loss: 9.8655 - mean_absolute_error: 9.8655
Epoch 4/100
 - 0s - loss: 9.5391 - mean_absolute_error: 9.5391
Epoch 5/100
 - 0s - loss: 9.1889 - mean_absolute_error: 9.1889
Epoch 6/100
 - 0s - loss: 8.8523 - mean_absolute_error: 8.8523
Epoch 7/100
 - 0s - loss: 8.5060 - mean_absolute_error: 8.5060
Epoch 8/100
 - 0s - loss: 8.1516 - mean_absolute_error: 8.1516
Epoch 9/100
 - 0s - loss: 7.8417 - mean_absolute_error: 7.8417
Epoch 10/100
 - 0s - loss: 7.3952 - mean_absolute_error: 7.3952
Epoch 11/100
 - 0s - loss: 7.0668 - mean_absolute_error: 7.0668
Epoch 12/100
 - 0s - loss: 6.6450 - mean_absolute_error: 6.6450
Epoch 13/100
 - 0s - loss: 6.3656 - mean_absolute_error: 6.3656
Epoch 14/100
 - 0s - loss: 5.8370 - mean_absolute_error: 5.8370
Epoch 15/100
 - 0s - loss: 5.2630 - mean_absolute_error: 5.2630
Epoch 16/100
 - 0s - loss: 5.1112 - mean_abso

In [0]:
grid_result = grid_result2

# Report Results
print(f"Best: {grid_result.best_score_:.3f} using {grid_result.best_params_}")
print()
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean:.3f}, Stdev: {stdev:.3f} with: {param}")

Best: -2.653 using {'batch_size': 10, 'epochs': 100}

Means: -2.653, Stdev: 0.410 with: {'batch_size': 10, 'epochs': 100}
Means: -3.061, Stdev: 0.565 with: {'batch_size': 20, 'epochs': 100}
Means: -3.313, Stdev: 0.452 with: {'batch_size': 40, 'epochs': 100}
Means: -3.552, Stdev: 0.669 with: {'batch_size': 60, 'epochs': 100}


### 2. Optimize neurons in the hidden layer

In [0]:
%%time
# define the grid search parameters
n_neurons = [10, 20, 30, 40, 50, 60]
param_grid = dict(n_neurons=n_neurons)
n_cv_splits = 3
# -----------------------------------------------

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)

def create_model(n_neurons):
    model = Sequential()
    model.add(Dense(n_neurons, input_dim=15, activation='relu'))
    model.add(Dropout(0.1))
#     model.add(Dense(13, activation='relu'))
#     model.add(Dropout(0.1))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_absolute_error', optimizer='adam',
                  metrics=['mean_absolute_error'])
    return model
    
# create model
model = KerasRegressor(build_fn=create_model, verbose=2,
                       batch_size=10, epochs=100) # <-- batch size and epochs based on last optimization run

# define 5-fold cross validation test harness
kfold = KFold(n_splits=n_cv_splits, shuffle=True, random_state=seed)

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, 
                    n_jobs=1, cv=kfold)

# Fit
grid_result3 = grid.fit(X, y)

Epoch 1/100
 - 1s - loss: 10.4976 - mean_absolute_error: 10.4976
Epoch 2/100
 - 0s - loss: 10.1606 - mean_absolute_error: 10.1606
Epoch 3/100
 - 0s - loss: 9.8188 - mean_absolute_error: 9.8188
Epoch 4/100
 - 0s - loss: 9.5170 - mean_absolute_error: 9.5170
Epoch 5/100
 - 0s - loss: 9.1680 - mean_absolute_error: 9.1680
Epoch 6/100
 - 0s - loss: 8.8744 - mean_absolute_error: 8.8744
Epoch 7/100
 - 0s - loss: 8.4142 - mean_absolute_error: 8.4142
Epoch 8/100
 - 0s - loss: 8.0251 - mean_absolute_error: 8.0251
Epoch 9/100
 - 0s - loss: 7.6885 - mean_absolute_error: 7.6885
Epoch 10/100
 - 0s - loss: 7.2538 - mean_absolute_error: 7.2538
Epoch 11/100
 - 0s - loss: 6.8477 - mean_absolute_error: 6.8477
Epoch 12/100
 - 0s - loss: 6.5007 - mean_absolute_error: 6.5007
Epoch 13/100
 - 0s - loss: 6.1495 - mean_absolute_error: 6.1495
Epoch 14/100
 - 0s - loss: 5.7976 - mean_absolute_error: 5.7976
Epoch 15/100
 - 0s - loss: 5.3292 - mean_absolute_error: 5.3292
Epoch 16/100
 - 0s - loss: 5.2076 - mean_abso

In [0]:
grid_result = grid_result3

# Report Results
print(f"Best: {grid_result.best_score_:.3f} using {grid_result.best_params_}")
print()
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean:.3f}, Stdev: {stdev:.3f} with: {param}")

Best: -2.615 using {'n_neurons': 40}

Means: -2.694, Stdev: 0.462 with: {'n_neurons': 10}
Means: -2.647, Stdev: 0.408 with: {'n_neurons': 20}
Means: -2.645, Stdev: 0.374 with: {'n_neurons': 30}
Means: -2.615, Stdev: 0.329 with: {'n_neurons': 40}
Means: -2.616, Stdev: 0.364 with: {'n_neurons': 50}
Means: -2.628, Stdev: 0.340 with: {'n_neurons': 60}


### 3. Optimize the number of hidden layers - with 40 Neurons

In [0]:
%%time
# define the grid search parameters
n_hidden_layers = [1,2,3]
param_grid = dict(n_hidden_layers=n_hidden_layers)
n_cv_splits = 3
# -----------------------------------------------

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)

def create_model(n_hidden_layers):
    model = Sequential()
    
    for i in range(n_hidden_layers):
        if i == 1:
            model.add(Dense(40, input_dim=15, activation='relu'))
            model.add(Dropout(0.1))
        else:
            model.add(Dense(40, activation='relu'))
            model.add(Dropout(0.1))
    
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_absolute_error', optimizer='adam',
                  metrics=['mean_absolute_error'])
    return model
    
# create model
model = KerasRegressor(build_fn=create_model, verbose=2,
                       batch_size=10, epochs=100) # <-- batch size and epochs based on previous optimization run

# define 5-fold cross validation test harness
kfold = KFold(n_splits=n_cv_splits, shuffle=True, random_state=seed)

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, 
                    n_jobs=1, cv=kfold)

# Fit
grid_result4 = grid.fit(X.values, y.values) # <-- Here changed X & y to arrays -- gave an error requesting array for some reason

Epoch 1/100
 - 2s - loss: 10.9770 - mean_absolute_error: 10.9770
Epoch 2/100
 - 0s - loss: 10.3825 - mean_absolute_error: 10.3825
Epoch 3/100
 - 0s - loss: 9.7608 - mean_absolute_error: 9.7608
Epoch 4/100
 - 0s - loss: 9.1331 - mean_absolute_error: 9.1331
Epoch 5/100
 - 0s - loss: 8.4469 - mean_absolute_error: 8.4469
Epoch 6/100
 - 0s - loss: 7.7734 - mean_absolute_error: 7.7734
Epoch 7/100
 - 0s - loss: 7.1677 - mean_absolute_error: 7.1677
Epoch 8/100
 - 0s - loss: 6.3897 - mean_absolute_error: 6.3897
Epoch 9/100
 - 0s - loss: 5.7767 - mean_absolute_error: 5.7767
Epoch 10/100
 - 0s - loss: 5.2066 - mean_absolute_error: 5.2066
Epoch 11/100
 - 0s - loss: 4.7049 - mean_absolute_error: 4.7049
Epoch 12/100
 - 0s - loss: 4.2916 - mean_absolute_error: 4.2916
Epoch 13/100
 - 0s - loss: 4.1097 - mean_absolute_error: 4.1097
Epoch 14/100
 - 0s - loss: 3.8792 - mean_absolute_error: 3.8792
Epoch 15/100
 - 0s - loss: 3.6764 - mean_absolute_error: 3.6764
Epoch 16/100
 - 0s - loss: 3.7606 - mean_abso

In [0]:
grid_result = grid_result4

# Report Results
print(f"Best: {grid_result.best_score_:.3f} using {grid_result.best_params_}")
print()
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean:.3f}, Stdev: {stdev:.3f} with: {param}")

Best: -2.617 using {'n_hidden_layers': 1}

Means: -2.617, Stdev: 0.349 with: {'n_hidden_layers': 1}
Means: -2.804, Stdev: 0.336 with: {'n_hidden_layers': 2}
Means: -2.855, Stdev: 0.342 with: {'n_hidden_layers': 3}


In [11]:
df.head()

Unnamed: 0,TWAY_ID,TWAY_ID2,COUNTY,LATITUDE,LONGITUD,DATE,DAY_WEEK,HOUR,LGT_COND,WEATHER,WRK_ZONE,FATALS,PEDS,MAN_COLL,FUNC_SYS,TYP_INT,CRASH
0,SR-51,NO SECOND STREET,SAN MATEO,32.618239,-85.371383,2015-01-13,TUESDAY,17,NIGHT,CLEAR,0,1,0,ANGLED,ARTERY,NOT AN INTERSECTION,1
1,CR-40,NO SECOND STREET,ALAMEDA,32.524344,-86.672119,2015-01-16,FRIDAY,19,NIGHT,CLEAR,0,1,0,NOT APPLICABLE,COLLECTOR,NOT AN INTERSECTION,1
2,SR-14,NO SECOND STREET,ALAMEDA,32.430664,-86.517917,2015-01-10,SATURDAY,0,NIGHT,CLEAR,0,1,0,NOT APPLICABLE,ARTERY,NOT AN INTERSECTION,1
3,SR-147,NO SECOND STREET,SAN MATEO,32.615806,-85.507961,2015-02-11,WEDNESDAY,11,DAY,CLEAR,0,1,0,HEAD ON,ARTERY,NOT AN INTERSECTION,1
4,US-SR 1,NO SECOND STREET,SAN MATEO,32.679275,-85.370181,2015-02-23,MONDAY,18,NIGHT,CLEAR,0,1,1,NOT APPLICABLE,ARTERY,NOT AN INTERSECTION,1


In [47]:
df.describe()

Unnamed: 0,LATITUDE,LONGITUD,HOUR,WRK_ZONE,FATALS,PEDS,CRASH,YEAR,MONTH,DAY
count,6060.0,6060.0,6060.0,6060.0,6060.0,6060.0,6060.0,6060.0,6060.0,6060.0
mean,37.335407,-93.424212,12.786304,0.020627,1.083993,0.234653,1.0,2016.025743,6.790924,15.507756
std,4.837393,17.895435,6.890896,0.142144,0.345735,0.475566,0.0,0.809865,3.355586,8.71288
min,19.0601,-156.033044,0.0,0.0,1.0,0.0,1.0,2015.0,1.0,1.0
25%,35.003195,-106.40319,7.0,0.0,1.0,0.0,1.0,2015.0,4.0,8.0
50%,37.701688,-86.149636,14.0,0.0,1.0,0.0,1.0,2016.0,7.0,15.0
75%,40.626792,-79.7798,19.0,0.0,1.0,0.0,1.0,2017.0,10.0,23.0
max,48.747119,-69.966544,23.0,1.0,5.0,7.0,1.0,2017.0,12.0,31.0
