In [152]:
import pandas as pd
import numpy as np
import datetime as dt

In [153]:
# Read data and save as dataframe
df = pd.read_csv('train.csv')

# Create list of public holidays in Chicago (source: https://publicholidays.us/illinois/2018-dates/ and https://publicholidays.us/illinois/2019-dates/)
holidays = ['2018-10-08', '2018-11-06', '2018-11-11', '2018-11-12', '2018-11-22', '2018-11-23', '2018-12-25', 
            '2019-01-01', '2019-01-21', '2019-02-12', '2019-02-18', '2019-05-27', '2019-07-04', '2019-09-02']

# Split timestamps into date and time
df['trip_start_timestamp'] = pd.to_datetime(df['trip_start_timestamp'])
df['trip_end_timestamp'] = pd.to_datetime(df['trip_end_timestamp'])

# Add weekday to new column and determine workdays and rushhours
df['trip_start_dow'] = df.trip_start_timestamp.dt.dayofweek
df['is_workday'] = np.where((df.trip_start_dow < 5) & (~df.trip_start_timestamp.dt.date.isin(holidays)), 1, 0)


df['is_rushhour'] = np.where(
    (df.is_workday == 1) & 
    (((df.trip_start_timestamp.dt.time >= dt.time(7, 0, 0)) & (df.trip_start_timestamp.dt.time <= dt.time(9, 0, 0))) | 
      ((df.trip_start_timestamp.dt.time >= dt.time(16, 0, 0)) & (df.trip_start_timestamp.dt.time <= dt.time(18, 0, 0)))), 1, 0)

# df['is_business_area'] = np.where((((df.dropoff_centroid_latitude >= 41.875800) & (df.dropoff_centroid_latitude <= 41.889000)) 
#                                    & ((df.dropoff_centroid_longitude >= -87.644300) & (df.dropoff_centroid_longitude <= -87.624200))) |
#                                   ((df.pickup_centroid_latitude >= 41.875800) & (df.pickup_centroid_latitude <= 41.889000)) 
#                                    & ((df.pickup_centroid_longitude >= -87.644300) & (df.pickup_centroid_longitude <= -87.624200)), 1, 0)

# Create date column and delete time and id columns
df['trip_start_date'] = df.trip_start_timestamp.dt.date
df['trip_end_date'] = df.trip_start_timestamp.dt.date
df.drop(['trip_start_timestamp', 'trip_end_timestamp', 'id'], axis=1, inplace=True)

# Pre-clean data
df.dropna(inplace=True)
df = df[df.trip_miles > 0]
df = df[df.trip_miles < 100]
df = df[df.trip_seconds > 0]
df = df[df.trip_seconds < 7200]
df = df[df.trip_total > 0]
df = df[df.trip_total < 100]

# Create additional columns
df['price_per_mile'] = df.fare / df.trip_miles
df = df[df.price_per_mile > 0]
df = df[df.price_per_mile < 10]
df['table_fare'] = 3.25 + 2.25 * df.trip_miles + df.trip_seconds / 36 * 0.20 # source: https://www.chicago.gov/city/en/depts/bacp/supp_info/2012_passenger_information.html
df['overpriced'] = np.where(df.fare > df.table_fare, 1, 0)
df.drop('table_fare', axis=1, inplace=True)

In [154]:
payment_attr = []
for pt in df.payment_type.unique():
    if pt != 'Cash':
        payment_attr.append('payment_type_' + pt)

In [155]:
df.head()

Unnamed: 0,taxi_id,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,...,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_start_dow,is_workday,is_rushhour,trip_start_date,trip_end_date,price_per_mile,overpriced
1,32e3320b2ad8fc0e4c90c95d962029be24d38ca2,419.0,1.0,8.0,8.0,6.25,1.0,0.0,0.0,7.75,...,-87.631864,41.895033,-87.619711,2,1,1,2019-01-16,2019-01-16,6.25,0
3,90bd7186d83d9653bd0453dd16c143941e597f6b,609.0,2.52,28.0,24.0,9.75,2.0,0.0,0.0,12.25,...,-87.663518,41.901207,-87.676356,2,1,1,2019-09-25,2019-09-25,3.869048,0
5,cf7e98e9a85621161e475ffeef63e03326a5b07e,120.0,0.6,8.0,8.0,4.5,0.0,0.0,0.0,4.5,...,-87.626659,41.899156,-87.626211,2,1,0,2019-02-13,2019-02-13,7.5,0
6,4b97d7a131d31575a3d142ca2fb7b582a40e7099,1140.0,1.8,32.0,33.0,11.0,3.0,0.0,1.0,15.0,...,-87.632746,41.85935,-87.617358,3,1,1,2019-05-23,2019-05-23,6.111111,0
8,22a8fe769e17d81098b3f8cf4a7c6c91ffb8393f,2700.0,15.6,76.0,8.0,40.25,5.0,0.0,4.0,49.25,...,-87.90304,41.904935,-87.649907,3,1,0,2019-08-29,2019-08-29,2.580128,0


In [156]:
# Delete rows
# df.drop(df.index[50000:], inplace=True)

# Drop columns
columns = ['trip_start_date', 'trip_end_date', 'tolls', 'trip_start_dow', 'fare', 'extras', 'tips', 'company', 'taxi_id', 
          'pickup_community_area', 'dropoff_community_area', 'pickup_centroid_longitude', 'pickup_centroid_latitude', 'dropoff_centroid_longitude', 'dropoff_centroid_latitude']
df.drop(columns, axis=1, inplace=True)
    

# One-hot encode categorical data
# columns = ['payment_type']
# one_hot_df = pd.get_dummies(df[columns])
# one_hot_df.drop(payment_attr, axis=1, inplace=True)
# df.drop(columns, axis=1, inplace=True)
# df = df.join(one_hot_df)'
df['payment_type'] = np.where(df.payment_type == 'Cash', 1, 0)

# Normalize data (feature scaling)
df = (df - df.min()) / (df.max() - df.min())

In [157]:
df.head()

Unnamed: 0,trip_seconds,trip_miles,trip_total,payment_type,is_workday,is_rushhour,price_per_mile,overpriced
1,0.056786,0.009744,0.077719,0.0,1.0,1.0,0.628364,0.0
3,0.083427,0.028258,0.122904,0.0,1.0,1.0,0.388973,0.0
5,0.014863,0.004872,0.045085,1.0,1.0,0.0,0.754045,0.0
6,0.15788,0.019488,0.150517,0.0,1.0,1.0,0.6144,0.0
8,0.376612,0.187576,0.494427,0.0,1.0,0.0,0.259379,0.0


In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 127467 entries, 1 to 199999
Data columns (total 8 columns):
trip_seconds      127467 non-null float64
trip_miles        127467 non-null float64
trip_total        127467 non-null float64
payment_type      127467 non-null float64
is_workday        127467 non-null float64
is_rushhour       127467 non-null float64
price_per_mile    127467 non-null float64
overpriced        127467 non-null float64
dtypes: float64(8)
memory usage: 8.8 MB


In [159]:
# Split data into training and test set
from sklearn.model_selection import train_test_split
X = df.loc[:, df.columns != 'is_rushhour'].to_numpy()
y = df.loc[:, df.columns == 'is_rushhour'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

# Reduce dimensions (PCA)
from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [160]:
# XGBoost
# from xgboost import XGBClassifier
# classifier = XGBClassifier()

# Logistic Regression
# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression(random_state = 0, multi_class='ovr')

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

# Decision Tree
# from sklearn.tree import DecisionTreeClassifier
# classifier = DecisionTreeClassifier()

# Random Forest
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier()

# Fit model and predict test set
classifier.fit(X_train, y_train.ravel())
y_pred = classifier.predict(X_test)

In [161]:
# Metrics
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
cm = pd.DataFrame([[tp, fp], [fn, tn]], index=['Real 1', 'Real 0'], columns=['Pred 1', 'Pred 0'])
kfcv = cross_val_score(estimator=classifier, X=X_train, y=y_train.ravel(), cv=10)
ba = balanced_accuracy_score(y_test, y_pred)

print('Confusion Matrix:\n', cm)
print('\nPrecision:\n {0:.2f}%'.format(100 * tp / (tp + fp)))
print('\nAccuracy:\n {0:.2f}%'.format(100 * (tp + tn) / (tp + tn + fp + fn)))
print('\nBalanced Accuracy:\n {0:.2f}%'.format(ba * 100))
print('\nSensitivity (correctly detect positives):\n {0:.2f}%'.format(sensitivity * 100))
print('\nSpecificity (correctly reject negatives):\n {0:.2f}%'.format(specificity * 100))
print('\nFalse Positive Rate (falsely detected positives):\n {0:.2f}%'.format(100 * fp / (tp + tn + fp + fn)))
print('\nFalse Negative Rate (falsely detected negatives):\n {0:.2f}%'.format(100 * fn / (tp + tn + fp + fn)))
# print('\n10-fold Cross Validation:')
# for j in kfcv:
#     print(' {0:.2f}'.format(j))

Confusion Matrix:
         Pred 1  Pred 0
Real 1    4728   10632
Real 0     641    9493

Precision:
 30.78%

Accuracy:
 55.78%

Balanced Accuracy:
 67.62%

Sensitivity (correctly detect positives):
 88.06%

Specificity (correctly reject negatives):
 47.17%

False Positive Rate (falsely detected positives):
 41.70%

False Negative Rate (falsely detected negatives):
 2.51%
