In [4]:
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import matplotlib.pyplot as plt
import sklearn
import csv

import warnings
warnings.filterwarnings("ignore")

In [5]:
features = defaultdict()

features['jan'] = pd.read_csv("../preprocessed_data/jan_data.csv")
features['feb'] = pd.read_csv("../preprocessed_data/feb_data.csv")
features['mar'] = pd.read_csv("../preprocessed_data/mar_data.csv")
features['apr'] = pd.read_csv("../preprocessed_data/apr_data.csv")
features['may'] = pd.read_csv("../preprocessed_data/may_data.csv")
features['jun'] = pd.read_csv("../preprocessed_data/jun_data.csv")
features['jul'] = pd.read_csv("../preprocessed_data/jul_data.csv")
features['aug'] = pd.read_csv("../preprocessed_data/aug_data.csv")
features['sep'] = pd.read_csv("../preprocessed_data/sep_data.csv")
features['oct'] = pd.read_csv("../preprocessed_data/oct_data.csv")
features['nov'] = pd.read_csv("../preprocessed_data/nov_data.csv")
features['dec'] = pd.read_csv("../preprocessed_data/dec_data.csv")

# Sorting Training (Jan-Sep) and Test (Oct-Dec) Sets

In [6]:
# remove the 6 exclusion zones (0, 103, 104, 110, 131, 137) as stated in Outlier Detection notebook
exclusion_zones = [0,102,103,109,130,136]

months = ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']
train_data = []
test_data = []

for i in range(len(months)):
    if (i < 9): # all months before October
        features[months[i]] = features[months[i]].drop(labels = exclusion_zones)
        train_data.append(features[months[i]])
    else: # months from October onwards
        features[months[i]] = features[months[i]].drop(labels = exclusion_zones)
        test_data.append(features[months[i]])
    
train_set = pd.concat(train_data)
train_set = train_set.reset_index()
del train_set['index']

test_set = pd.concat(test_data)
test_set = test_set.reset_index()
del test_set['index']

# 1st Attempt (Logistic Regression, Default Parameters)

In [7]:
# allocation bins with relatively equal frequency
ranges = [-1, 100, 450, 850, 1550, 3250, 14000, 90000, 350000] # -1 to include zones with 0 pickups

for i in range(len(ranges) - 1):
    lower = ranges[i]
    upper = ranges[i+1]
    length = len(train_set['pickups'].loc[(train_set['pickups'] > lower) & (train_set['pickups'] < upper)])
    print(str(lower) + ' - ' + str(upper) + ' pickups: ' + str(length) + ' zones')

-1 - 100 pickups: 284 zones
100 - 450 pickups: 262 zones
450 - 850 pickups: 308 zones
850 - 1550 pickups: 292 zones
1550 - 3250 pickups: 283 zones
3250 - 14000 pickups: 293 zones
14000 - 90000 pickups: 298 zones
90000 - 350000 pickups: 290 zones


In [8]:
X_train = train_set.drop("pickups", axis = 1)
y_train = train_set['pickups']

X_test = test_set.drop("pickups", axis = 1)
y_test = test_set['pickups']

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# assign pickup bin categories for both training and test labels

new_y_train = []
new_y_test = []

for i in y_train:
    if (i <= 100):
        new_y_train.append(1)
    elif ((i > 100) and (i <= 450)):
        new_y_train.append(2)
    elif ((i > 450) and (i <= 850)):
        new_y_train.append(3)
    elif ((i > 850) and (i <= 1550)):
        new_y_train.append(4)
    elif ((i > 1550) and (i <= 3250)):
        new_y_train.append(5)
    elif ((i > 3250) and (i <= 14000)):
        new_y_train.append(6)
    elif ((i > 14000) and (i <= 90000)):
        new_y_train.append(7)
    else:
        new_y_train.append(8)

for i in y_test:
    if (i <= 100):
        new_y_test.append(1)
    elif ((i > 100) and (i <= 450)):
        new_y_test.append(2)
    elif ((i > 450) and (i <= 850)):
        new_y_test.append(3)
    elif ((i > 850) and (i <= 1550)):
        new_y_test.append(4)
    elif ((i > 1550) and (i <= 3250)):
        new_y_test.append(5)
    elif ((i > 3250) and (i <= 14000)):
        new_y_test.append(6)
    elif ((i > 14000) and (i <= 90000)):
        new_y_test.append(7)
    else:
        new_y_test.append(8)

y_train = np.array(new_y_train)
y_test = np.array(new_y_test)

In [9]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
acc = 0

for i in range(10):
    pred_y = []

    clf.fit(X_train,y_train)
    pred_y.extend(clf.predict(X_test).tolist())

    for i in range(len(pred_y)):
        if (pred_y[i] == y_test[i]):
            acc += 1
        
print("Logistic Regression Accuracy: " + str(acc/(3*257*10)))

Logistic Regression Accuracy: 0.3463035019455253


# 2nd Attempt (Logistic Regression, Optimal Parameter Values)

In [10]:
# use GridSearch to refine the model by finding the most suitable set of parameters for activation function 
# and solver method (takes awhile to run, wouldn't recommend)
from sklearn.model_selection import GridSearchCV

param_grid = [{'penalty' : ['none', 'l2'],
               'C' : np.logspace(-4, 4, 20),
               'solver' : ['sag', 'newton-cg', 'lbfgs', 'saga']}]

clf = GridSearchCV(LogisticRegression(), param_grid, cv = 10, scoring = 'accuracy')
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_)

Best parameters set found on development set:
{'C': 0.00026366508987303583, 'penalty': 'l2', 'solver': 'newton-cg'}


In [11]:
clf = LogisticRegression(penalty = 'l2', C= 0.00026366508987303583, solver = 'newton-cg', multi_class = 'multinomial')
acc = 0

for i in range(10):
    pred_y = []

    clf.fit(X_train,y_train)
    pred_y.extend(clf.predict(X_test).tolist())

    for i in range(len(pred_y)):
        if (pred_y[i] == y_test[i]):
            acc += 1
        
print("Logistic Regression Accuracy: " + str(acc/(3*257*10)))

Logistic Regression Accuracy: 0.6575875486381323


In [12]:
# calculating the macro, micro, and weighted averaging for Precision
from sklearn.metrics import precision_score

average = ['macro', 'micro', 'weighted']
precisions = defaultdict()
    
for method in average:
    result = precision_score(y_test, pred_y, average = method)
    precisions[method] = result
    print(method[0].upper() + method[1:] + " Averaging Precision: " + str(result))

Macro Averaging Precision: 0.6759853431555487
Micro Averaging Precision: 0.6575875486381323
Weighted Averaging Precision: 0.670467628345071


In [13]:
# calculating the macro, micro, and weighted averaging for Recall
from sklearn.metrics import recall_score

recalls = defaultdict()

for method in average:
    result = recall_score(y_test, pred_y, average = method)
    recalls[method] = result
    print(method[0].upper() + method[1:] + " Averaging Recall: " + str(result))

Macro Averaging Recall: 0.6769198969906156
Micro Averaging Recall: 0.6575875486381323
Weighted Averaging Recall: 0.6575875486381323


In [14]:
# calculating the macro, micro, and weighted averaging for F1 Score
for method in average:
    result = (2*(precisions[method]*recalls[method])/(precisions[method]+recalls[method]))
    print(method[0].upper() + method[1:] + " Averaging F1-Score: " + str(result))

Macro Averaging F1-Score: 0.6764522972896215
Micro Averaging F1-Score: 0.6575875486381323
Weighted Averaging F1-Score: 0.6639651300726548


In [15]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_y)

array([[101,   0,   0,   0,   0,   0,   0,   0],
       [ 17,  52,   1,   0,   0,   0,   0,   0],
       [  0,  64,  45,   0,   0,   0,   0,   0],
       [  0,  10,  84,  16,   1,   0,   0,   0],
       [  0,   0,  20,  36,  39,   4,   0,   0],
       [  0,   0,   0,   0,  14,  81,   6,   0],
       [  0,   0,   0,   0,   0,   2,  80,   4],
       [  0,   0,   0,   0,   0,   0,   1,  93]], dtype=int64)

# Feature Selection

In [24]:
X_train = train_set.drop(labels = ['pickups'], axis = 1)
y_train = train_set['pickups']
X_test = test_set.drop(labels = ['pickups'], axis = 1)
y_test = test_set['pickups']

In [19]:
clf = LogisticRegression(penalty = 'l2', C= 0.00026366508987303583, solver = 'newton-cg', multi_class = 'multinomial')
clf.fit(X_train, y_train)

LogisticRegression(C=0.00026366508987303583, multi_class='multinomial',
                   solver='newton-cg')

In [20]:
from sklearn.feature_selection import SelectFromModel

refined_model = SelectFromModel(clf, prefit = True)

In [21]:
refined_features = []
for bool, feat in zip(refined_model.get_support(), list(X_train.columns.values)):
    if bool:
        refined_features.append(feat)

In [97]:
refined_features

['a.num',
 'a.p.age: 18-24',
 'a.p.age: 25-44',
 'a.p.age: 45-64',
 'a.p.sex: M',
 'a.p.sex: F',
 'a.p.race: BLACK',
 'a.p.race: WHITE',
 'a.p.race: BLACK HISPANIC',
 'a.p.race: WHITE HISPANIC',
 'a.p.race: ASIAN / PACIFIC ISLANDER',
 'a.law: F',
 'a.law: M',
 'c.num',
 'average trip distance',
 'credit payment',
 'cash payment',
 'no payment',
 'dispute payment',
 'average fare']

In [25]:
# only using the new set of features deemed optimal by the SelectFromModel function
X_train = train_set.filter(items = refined_features, axis = 1)
X_test = test_set.filter(items = refined_features, axis = 1)

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# assign pickup bin categories for both training and test labels

new_y_train = []
new_y_test = []

for i in y_train:
    if (i <= 100):
        new_y_train.append(1)
    elif ((i > 100) and (i <= 450)):
        new_y_train.append(2)
    elif ((i > 450) and (i <= 850)):
        new_y_train.append(3)
    elif ((i > 850) and (i <= 1550)):
        new_y_train.append(4)
    elif ((i > 1550) and (i <= 3250)):
        new_y_train.append(5)
    elif ((i > 3250) and (i <= 14000)):
        new_y_train.append(6)
    elif ((i > 14000) and (i <= 90000)):
        new_y_train.append(7)
    else:
        new_y_train.append(8)

for i in y_test:
    if (i <= 100):
        new_y_test.append(1)
    elif ((i > 100) and (i <= 450)):
        new_y_test.append(2)
    elif ((i > 450) and (i <= 850)):
        new_y_test.append(3)
    elif ((i > 850) and (i <= 1550)):
        new_y_test.append(4)
    elif ((i > 1550) and (i <= 3250)):
        new_y_test.append(5)
    elif ((i > 3250) and (i <= 14000)):
        new_y_test.append(6)
    elif ((i > 14000) and (i <= 90000)):
        new_y_test.append(7)
    else:
        new_y_test.append(8)

y_train = np.array(new_y_train)
y_test = np.array(new_y_test)

In [26]:
# same model with only with the top features deemed useful by the SelectFromModel function
clf = LogisticRegression(penalty = 'l2', C= 0.00026366508987303583, solver = 'newton-cg', multi_class = 'multinomial')
acc = 0

for i in range(10):
    pred_y = []

    clf.fit(X_train,y_train)
    pred_y.extend(clf.predict(X_test).tolist())

    for i in range(len(pred_y)):
        if (pred_y[i] == y_test[i]):
            acc += 1
        
print("Logistic Regression Accuracy: " + str(acc/(3*257*10)))

Logistic Regression Accuracy: 0.6562905317769131


In [27]:
confusion_matrix(y_test, pred_y)

array([[101,   0,   0,   0,   0,   0,   0,   0],
       [ 17,  52,   1,   0,   0,   0,   0,   0],
       [  0,  64,  45,   0,   0,   0,   0,   0],
       [  0,  10,  84,  16,   1,   0,   0,   0],
       [  0,   0,  20,  36,  39,   4,   0,   0],
       [  0,   0,   0,   0,  14,  80,   7,   0],
       [  0,   0,   0,   0,   0,   2,  80,   4],
       [  0,   0,   0,   0,   0,   0,   1,  93]], dtype=int64)

# Error Analysis

In [102]:
# rearranging bin sizes again to provide more zones/data to bins 3,4, and 5
ranges = [-1, 50, 300, 775, 1750, 5000, 17000, 100000, 350000] # -1 to include zones with 0 pickups

for i in range(len(ranges) - 1):
    lower = ranges[i]
    upper = ranges[i+1]
    length = len(train_set['pickups'].loc[(train_set['pickups'] > lower) & (train_set['pickups'] < upper)])
    print(str(lower) + ' - ' + str(upper) + ' pickups: ' + str(length) + ' zones')

-1 - 50 pickups: 195 zones
50 - 300 pickups: 219 zones
300 - 775 pickups: 387 zones
775 - 1750 pickups: 403 zones
1750 - 5000 pickups: 332 zones
5000 - 17000 pickups: 242 zones
17000 - 100000 pickups: 262 zones
100000 - 350000 pickups: 268 zones


In [91]:
# only using the top features deemed useful by the SelectFromModel function
X_train = train_set.filter(items = refined_features, axis = 1)
X_test = test_set.filter(items = refined_features, axis = 1)
y_train = train_set['pickups']
y_test = test_set['pickups']

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# assign pickup bin categories for both training and test labels

new_y_train = []
new_y_test = []

for i in y_train:
    if (i <= 50):
        new_y_train.append(1)
    elif ((i > 50) and (i <= 300)):
        new_y_train.append(2)
    elif ((i > 300) and (i <= 775)):
        new_y_train.append(3)
    elif ((i > 775) and (i <= 1750)):
        new_y_train.append(4)
    elif ((i > 1750) and (i <= 5000)):
        new_y_train.append(5)
    elif ((i > 5000) and (i <= 17000)):
        new_y_train.append(6)
    elif ((i > 17000) and (i <= 100000)):
        new_y_train.append(7)
    else:
        new_y_train.append(8)

for i in y_test:
    if (i <= 50):
        new_y_test.append(1)
    elif ((i > 50) and (i <= 300)):
        new_y_test.append(2)
    elif ((i > 300) and (i <= 775)):
        new_y_test.append(3)
    elif ((i > 775) and (i <= 1750)):
        new_y_test.append(4)
    elif ((i > 1750) and (i <= 5000)):
        new_y_test.append(5)
    elif ((i > 5000) and (i <= 17000)):
        new_y_test.append(6)
    elif ((i > 17000) and (i <= 100000)):
        new_y_test.append(7)
    else:
        new_y_test.append(8)

y_train = np.array(new_y_train)
y_test = np.array(new_y_test)

In [92]:
clf = LogisticRegression(penalty = 'l2', C= 0.00026366508987303583, solver = 'newton-cg', multi_class = 'multinomial')
acc = 0

for i in range(10):
    pred_y = []

    clf.fit(X_train,y_train)
    pred_y.extend(clf.predict(X_test).tolist())

    for i in range(len(pred_y)):
        if (pred_y[i] == y_test[i]):
            acc += 1
        
print("Logistic Regression Accuracy: " + str(acc/(3*257*10)))

Logistic Regression Accuracy: 0.7016861219195849


In [93]:
confusion_matrix(y_test, pred_y)

array([[75,  0,  0,  0,  0,  0,  0,  0],
       [11, 43,  0,  0,  0,  0,  0,  0],
       [ 0, 57, 82,  0,  0,  0,  0,  0],
       [ 0,  3, 91, 42,  0,  0,  0,  0],
       [ 0,  0, 11, 39, 69,  3,  0,  0],
       [ 0,  0,  0,  0,  5, 70,  0,  0],
       [ 0,  0,  0,  0,  0,  7, 70,  3],
       [ 0,  0,  0,  0,  0,  0,  0, 90]], dtype=int64)