In [6]:
import numpy as np
import pandas as pd
import datetime as dt

from numpy.random import seed
from tensorflow import set_random_seed

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout

# __Preprocessing__

In [7]:
# Read data and save as dataframe
df = pd.read_csv('train.csv')

# Create list of public holidays in Chicago (source: https://publicholidays.us/illinois/2018-dates/ and https://publicholidays.us/illinois/2019-dates/)
holidays = ['2018-10-08', '2018-11-06', '2018-11-11', '2018-11-12', '2018-11-22', '2018-11-23', '2018-12-25', 
            '2019-01-01', '2019-01-21', '2019-02-12', '2019-02-18', '2019-05-27', '2019-07-04', '2019-09-02']

# Split timestamps into date and time
df.trip_start_timestamp = pd.to_datetime(df.trip_start_timestamp)

# Add weekday to new column and determine workdays and rushhours
df['trip_start_dow'] = df.trip_start_timestamp.dt.dayofweek
df['is_workday'] = np.where((df.trip_start_dow < 5) & (~df.trip_start_timestamp.dt.date.isin(holidays)), 1, 0)
df['is_rushhour'] = np.where(
    (df.is_workday == 1) & 
    (((df.trip_start_timestamp.dt.time >= dt.time(7, 0, 0)) & (df.trip_start_timestamp.dt.time <= dt.time(9, 0, 0))) | 
      ((df.trip_start_timestamp.dt.time >= dt.time(16, 0, 0)) & (df.trip_start_timestamp.dt.time <= dt.time(18, 0, 0)))), 1, 0)

# Drop columns
columns = ['id', 'tolls', 'extras', 'tips', 'company', 'taxi_id', 'pickup_community_area', 'dropoff_community_area', 'trip_end_timestamp', 
           'pickup_centroid_longitude', 'pickup_centroid_latitude', 'dropoff_centroid_longitude', 'dropoff_centroid_latitude']
df.drop(columns, axis=1, inplace=True)

# Create date column and delete time and id columns
df.trip_start_timestamp = df.trip_start_timestamp.dt.date

# Pre-clean data
df.dropna(inplace=True)
df = df[df.trip_miles > 0]
df = df[df.trip_miles < 100]
df = df[df.trip_seconds > 0]
df = df[df.trip_seconds < 7200]
df = df[df.trip_total > 0]
df = df[df.trip_total < 100]

# Create additional columns
df['price_per_mile'] = df.fare / df.trip_miles
df = df[df.price_per_mile > 0]
df = df[df.price_per_mile < 10]
df['table_fare'] = 3.25 + 2.25 * df.trip_miles + df.trip_seconds / 36 * 0.20 # source: https://www.chicago.gov/city/en/depts/bacp/supp_info/2012_passenger_information.html
df['overpriced'] = np.where(df.fare > df.table_fare, 1, 0)
df.drop('table_fare', axis=1, inplace=True)

# Drop columns
columns = ['trip_start_timestamp', 'fare', 'trip_start_dow']
df.drop(columns, axis=1, inplace=True)

# One-hot encode categorical data
df['payment_type'] = np.where(df.payment_type == 'Cash', 1, 0)

# Normalize data (feature scaling)
df = (df - df.min()) / (df.max() - df.min())

# Split data into training and test set
from sklearn.model_selection import train_test_split
X = df.loc[:, df.columns != 'is_rushhour'].to_numpy()
y = df.loc[:, df.columns == 'is_rushhour'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Reduce dimensions (PCA)
from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# __Model__

In [8]:
# Parameters determined by previous hyperparameter tuning
def create_model(lyrs=[8], act='linear', opt='Adagrad', dr=0.5):
    
    seed(2020)
    set_random_seed(2020)
    
    model = Sequential()
    
    # create first hidden layer
    model.add(Dense(lyrs[0], input_dim=X_train.shape[1], activation=act))
    
    # create additional hidden layers
    for i in range(1, len(lyrs)):
        model.add(Dense(lyrs[i], activation=act))
    
    # add dropout, default is none
    model.add(Dropout(dr))
    
    # create output layer
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    
    return model

# __Training__

In [9]:
model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=64, verbose=1)
model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2, verbose=1)
y_pred = model.predict(X_test)

Train on 97014 samples, validate on 24254 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/1

# __Metrics__

In [10]:
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
cm = pd.DataFrame([[tp, fp], [fn, tn]], index=['Real 1', 'Real 0'], columns=['Pred 1', 'Pred 0'])
ba = balanced_accuracy_score(y_test, y_pred)

print('Confusion Matrix:\n', cm)
print('\nPrecision:\n {0:.2f}%'.format(100 * tp / (tp + fp)))
print('\nAccuracy:\n {0:.2f}%'.format(100 * (tp + tn) / (tp + tn + fp + fn)))
print('\nBalanced Accuracy:\n {0:.2f}%'.format(ba * 100))
print('\nSensitivity (correctly detect positives):\n {0:.2f}%'.format(sensitivity * 100))
print('\nSpecificity (correctly reject negatives):\n {0:.2f}%'.format(specificity * 100))
print('\nFalse Positive Rate (falsely detected positives):\n {0:.2f}%'.format(100 * fp / (tp + tn + fp + fn)))
print('\nFalse Negative Rate (falsely detected negatives):\n {0:.2f}%'.format(100 * fn / (tp + tn + fp + fn)))

Confusion Matrix:
         Pred 1  Pred 0
Real 1      22      40
Real 0    6298   23958

Precision:
 35.48%

Accuracy:
 79.09%

Balanced Accuracy:
 50.09%

Sensitivity (correctly detect positives):
 0.35%

Specificity (correctly reject negatives):
 99.83%

False Positive Rate (falsely detected positives):
 0.13%

False Negative Rate (falsely detected negatives):
 20.77%
