<h1>Imports</h1>

In [1]:
from pathlib import Path

import numpy as np
import sklearn as slearn
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight

from imblearn.combine import SMOTETomek

import glob
import os

Using TensorFlow backend.


In [2]:
pd.set_option("display.max_columns", 100)
%matplotlib inline

In [3]:
RANDOM_SEED = 6

<h1>Dataset Importing</h1>

In [4]:
DATA_PATH = Path.cwd() / "../../datasets"

In [5]:
training_df = pd.read_csv(DATA_PATH / "train.csv", index_col="tripid")
training_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [6]:
training_df = training_df.dropna(subset=['fare'])

<h1>Data Preprocessing</h1>

<h2>Data Conversions</h2>

In [None]:
# Converting date time objects from Objects to datetime64
training_df['pickup_time'] = pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M")
training_df['drop_time'] = pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M")

<h2>Adding Features</h2>
Adding features deemed useful from Feature Engineering

In [None]:
durations = []
for index,row in training_df.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = 60
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

training_df.insert(4,"time_dif",durations)



In [None]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.) **2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [None]:
training_df = training_df.assign(timeOfDay=pd.cut(training_df.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [None]:
new_column = []                    #empty column for distance
for index,row in training_df.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

training_df.insert(4,"distance",new_column)

In [None]:
training_df['avg_speed'] = (training_df['distance'] /  ( training_df['time_dif']) * 3600 )

In [None]:
training_df = training_df.drop(index=190167541)

In [None]:
training_df.nlargest(100,'avg_speed')

<h1>Model Training</h1>

In [8]:
# Extract the useful features deemed in feature engineering
features_classifier = training_df[['duration', 'meter_waiting', 'meter_waiting_fare',
       'additional_fare','fare']]
label_classifier = training_df[['label']]
mapping = {'correct': 1, 'incorrect': 0}
label_classifier = label_classifier.replace({'label':mapping})

In [9]:
features_classifier.dtypes

duration              float64
meter_waiting         float64
meter_waiting_fare    float64
additional_fare       float64
fare                  float64
dtype: object

In [10]:
numerical_features = features_classifier.columns[features_classifier.dtypes == "float64"].values
categorical_features = features_classifier.columns[features_classifier.dtypes == "category" ].values

In [11]:
numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='median'))
])

non_numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
#     ('label_encoder', LabelEncoder())
])

## create preprocessor stage of the final pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('non_numeric', non_numeric_preprocessing_steps, categorical_features),
        ('numeric', numeric_preprocessing_steps, numerical_features)
    ],
    remainder = 'drop'
)

In [12]:
preprocessed_features = preprocessor.fit_transform(features_classifier)

In [21]:
X_train, X_eval, y_train, y_eval = train_test_split(preprocessed_features, label_classifier.values, test_size=0.3, shuffle=True,stratify=label_classifier)

<h2>Random Hypertuning</h2>

In [15]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [16]:
classifier = RandomForestClassifier(random_state=42, class_weight='balanced')

In [18]:
rf_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 100, cv = 4, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 24.8min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 52.4min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 57.7min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=4, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight='balanced',
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
         

In [20]:
rf_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 40,
 'bootstrap': True}

In [22]:
rf_random.best_score_

0.9420646938772297

<h3>Evaluating random search</h3>

In [23]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [28]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_eval, y_eval)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random,  X_eval, y_eval)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))


  


ValueError: Unable to coerce to Series, length must be 1: given 5112

<h2>Grid search to narrow down</h2>

In [30]:
classifier = RandomForestClassifier()

In [32]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = classifier, param_grid = param_grid, 
                          cv = 4, n_jobs = -1, verbose = 2)

In [33]:
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 288 candidates, totalling 1152 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 22.7min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 39.5min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 60.4min
[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed: 70.4min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [47]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 200}

<h2>Training the final classifier</h2>

In [13]:
classifier = RandomForestClassifier(bootstrap= True,
 class_weight='balanced',
 max_depth= 80,
 max_features= 3,
 min_samples_leaf= 3,
 min_samples_split= 8,
 n_estimators= 200)

In [14]:
kf = StratifiedKFold()

In [15]:
for train_index, test_index in kf.split(preprocessed_features,label_classifier.values):
    X_train_fold, X_eval_fold = preprocessed_features[train_index], preprocessed_features[test_index]
    y_train_fold, y_eval_fold =label_classifier.values[train_index], label_classifier.values[test_index]
    # Transform training data columns 
    classifier.fit(X_train_fold,y_train_fold)
    
    
    ## Train the model
    

  """
  """
  """
  """
  """


In [22]:
# Predict for the evaluation set
print("Training Accuracy: %.2f" % (classifier.score(X_eval, y_eval)*100), "%")
y_pred = classifier.predict(X_eval)

Training Accuracy: 97.63 %


In [23]:
confusion_matrix = confusion_matrix(y_eval, y_pred)
confusion_matrix

array([[ 423,   40],
       [  81, 4568]])

In [24]:
print(classification_report(y_eval, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87       463
           1       0.99      0.98      0.99      4649

    accuracy                           0.98      5112
   macro avg       0.92      0.95      0.93      5112
weighted avg       0.98      0.98      0.98      5112



<h1>Model Validation</h1>

<h2>Loading the test data set</h2>

In [25]:
test_set = pd.read_csv(DATA_PATH / "test.csv", index_col="tripid")
test_set.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [26]:
test_set.isna().sum()

additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
dtype: int64

<h2>Feature Addition for the Test Dataset</h2>

In [None]:
test_set['pickup_time'] = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_set['drop_time'] = pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M")

In [None]:
test_set = test_set.assign(timeOfDay=pd.cut(test_set.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [None]:
new_column = []                    #empty column for distance
for index,row in test_set.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

test_set .insert(4,"distance",new_column)

In [None]:
durations = []
for index,row in test_set.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = 60
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

test_set.insert(4,"time_dif",durations)



In [None]:
test_set['avg_speed'] = (test_set['distance'] /  ( test_set['time_dif']) * 3600 )

In [46]:
test_set

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.448600,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.8750,6.77490,79.8840,289.27
213286352,10.5,4249,20,0.000000,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.70
213293973,10.5,1552,255,2.658800,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.00
213294622,10.5,462,16,0.000000,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.369200,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.9130,6.98875,79.8914,147.47
213299545,10.5,2495,351,16.530800,9,2/1/2020 7:13,2/1/2020 7:55,6.99819,79.9378,7.13916,79.8726,1156.97
213302332,10.5,1108,454,23.929200,43,2/1/2020 7:47,2/1/2020 8:05,6.79064,79.8878,6.81875,79.8859,196.81
213302671,10.5,2737,320,18.496000,17,2/1/2020 7:48,2/1/2020 8:33,6.81545,79.9707,6.82144,79.8662,688.43
213305594,10.5,1154,29,0.000000,130,2/1/2020 8:11,2/1/2020 8:30,6.82920,79.9798,6.79732,79.9309,288.77
213305134,10.5,1372,277,16.046498,63,2/1/2020 8:12,2/1/2020 8:35,6.05588,80.2391,6.04033,80.2043,199.57


In [28]:
test_features = test_set[['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare','fare']]
preprocessed_test_features = preprocessor.fit_transform(test_features)

<h2>Fare prediction and correctness prediction using Test Dataset </h2>

In [29]:
predicted_labels = classifier.predict(preprocessed_test_features)

<h2>Writing to the Submission File</h2>

In [30]:
predicted_labels

array([1, 0, 1, ..., 1, 1, 1])

In [31]:
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")
submission_set.head()

submission_set['prediction']= predicted_labels

In [32]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "theNotebook = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [33]:
filename = '../../submissions/'+theNotebook+'/'+theNotebook+'_{%i}.csv'
dirname = '../../submissions/'+theNotebook
fileversion = 1

if not os.path.exists(dirname):
    os.makedirs(dirname)
while glob.glob(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=True)
print("Completed!")

Completed!


In [34]:
submission_set['prediction'].idxmin()

213286352

In [35]:
submission_set['prediction'].value_counts()

1    8019
0     557
Name: prediction, dtype: int64