<h1>Imports</h1>

In [33]:
from pathlib import Path

import numpy as np
import sklearn as slearn
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from imblearn.combine import SMOTETomek

import glob
import os

In [2]:
pd.set_option("display.max_columns", 100)
%matplotlib inline

In [3]:
RANDOM_SEED = 6

<h1>Dataset Importing</h1>

In [4]:
DATA_PATH = Path.cwd() / "../../datasets"

In [5]:
training_df = pd.read_csv(DATA_PATH / "train.csv", index_col="tripid")
training_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [6]:
training_df['fare'] = training_df['fare'].fillna(0)

<h1>Data Preprocessing</h1>

<h2>Data Conversions</h2>

In [7]:
# Converting date time objects from Objects to datetime64
training_df['pickup_time'] = pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M")
training_df['drop_time'] = pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M")

<h2>Adding Features</h2>
Adding features deemed useful from Feature Engineering

In [8]:
durations = []
for index,row in training_df.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

training_df.insert(4,"time_dif",durations)



In [9]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.) **2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [10]:
training_df = training_df.assign(timeOfDay=pd.cut(training_df.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [11]:
new_column = []                    #empty column for distance
for index,row in training_df.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

training_df.insert(4,"distance",new_column)

In [12]:
training_df['time_driven'] = training_df['duration']  - training_df['meter_waiting']

In [13]:
training_df['charge_per_hour'] = training_df['meter_waiting_fare'] / training_df['meter_waiting'] * 3600

In [14]:
training_df['driving_fare'] = training_df['fare']  - training_df['meter_waiting_fare'] - training_df['additional_fare']

In [15]:
avgspeeds = []
for index,row in training_df.iterrows():
    if(row['time_driven'] == 0):
        if( row['label'] == 'correct'):
            avgspeed = np.nan
        else:
            avgspeed = 0
    else:
        avgspeed = (row['distance'] / row['time_driven'])
    avgspeeds.append(avgspeed)

training_df.insert(4,"avg_speed",avgspeeds)


In [16]:
costsperkm = []
for index,row in training_df.iterrows():
    if row['driving_fare'] == 0:
        if( row['label'] == 'correct'):
            costperkm = np.nan
        else:
            costperkm = 0
    else:
        costperkm = (row['distance'] / row['driving_fare'])
    costsperkm.append(costperkm)

training_df.insert(4,"cost_per_km",costsperkm)


In [17]:
training_df = training_df.drop(index=190167541)

In [18]:
training_df.nlargest(100,'avg_speed')

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,cost_per_km,avg_speed,distance,time_dif,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label,timeOfDay,time_driven,charge_per_hour,driving_fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
206894975,10.5,33.0,32.0,0.0000,0.208716,33.601116,33.601116,33.0,102.0,2020-01-04 02:38:00,2020-01-04 02:38:00,6.90634,79.9005,6.60862,79.9526,171.49,incorrect,dawn,1.0,0.00,160.9900
190289312,10.5,78.0,77.0,0.0000,0.318200,15.910017,15.910017,78.0,94.0,2019-11-06 21:01:00,2019-11-06 21:02:00,6.95046,79.8723,6.95682,80.0163,60.50,correct,night,1.0,0.00,50.0000
193653784,10.5,27.0,26.0,0.0000,0.193749,9.687464,9.687464,27.0,148.0,2019-11-21 16:38:00,2019-11-21 16:38:00,6.93254,79.8437,6.85074,79.8739,60.50,incorrect,day,1.0,0.00,50.0000
189749297,10.5,26.0,24.0,0.0000,0.020789,8.618900,17.237800,26.0,26.0,2019-11-03 21:56:00,2019-11-03 21:56:00,6.90625,79.9275,6.75374,79.8995,839.68,incorrect,night,2.0,0.00,829.1800
190426786,10.5,29.0,28.0,0.0000,0.155922,8.178098,8.178098,29.0,40.0,2019-11-07 16:51:00,2019-11-07 16:52:00,6.92871,79.9842,6.86833,79.9419,62.95,incorrect,day,1.0,0.00,52.4500
192943896,10.5,24.0,23.0,0.0000,0.018467,8.175502,8.175502,24.0,1035.0,2019-11-18 22:37:00,2019-11-18 22:37:00,6.80766,79.8990,6.87929,79.8823,453.21,incorrect,night,1.0,0.00,442.7100
212680966,10.5,33.0,31.0,0.0000,0.022105,7.243337,14.486675,33.0,113.0,2020-01-27 23:06:00,2020-01-27 23:07:00,6.89294,79.8661,6.77324,79.9179,665.85,incorrect,night,2.0,0.00,655.3500
190233219,10.5,25.0,24.0,0.0000,0.136286,6.814310,6.814310,25.0,210.0,2019-11-07 09:09:00,2019-11-07 09:10:00,6.87008,79.8789,6.80884,79.8812,60.50,incorrect,day,1.0,0.00,50.0000
193760421,10.5,31.0,29.0,1.6907,0.234687,5.867098,11.734197,31.0,522.0,2019-11-21 22:20:00,2019-11-21 22:20:00,6.93650,79.8451,6.84872,79.9041,62.19,incorrect,night,2.0,209.88,49.9993
196943952,10.5,27.0,24.0,0.0000,0.213689,5.682693,17.048079,27.0,40.0,2019-12-01 10:20:00,2019-12-01 10:21:00,6.93344,79.8548,6.87023,79.9955,90.28,incorrect,day,3.0,0.00,79.7800


<h1>Model Training</h1>

In [20]:
# Set of new columns
new_columns = ['time_dif','fare',"cost_per_km",'time_driven','driving_fare']

In [21]:
# Extract the useful features deemed in feature engineering
features_classifier = training_df[new_columns]
label_classifier = training_df[['label']]
mapping = {'correct': 1, 'incorrect': 0}
label_classifier = label_classifier.replace({'label':mapping})

In [22]:
features_classifier.dtypes

time_dif        float64
fare            float64
cost_per_km     float64
time_driven     float64
driving_fare    float64
dtype: object

In [23]:
numerical_features = features_classifier.columns[features_classifier.dtypes == "float64"].values
categorical_features = features_classifier.columns[features_classifier.dtypes == "category" ].values

In [24]:
kf = StratifiedKFold(n_splits=10)

In [25]:
numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='median'))
])

non_numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
#     ('label_encoder', LabelEncoder())
])

## create preprocessor stage of the final pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('non_numeric', non_numeric_preprocessing_steps, categorical_features),
        ('numeric', numeric_preprocessing_steps, numerical_features)
    ],
    remainder = 'drop'
)

In [26]:
X_train, X_eval, y_train, y_eval = train_test_split(features_classifier, label_classifier, test_size=0.3, shuffle=True,stratify=label_classifier)


In [39]:
classifier = LogisticRegression(penalty='l2',random_state=42, class_weight='balanced')

In [40]:
preprocessed_train_features = preprocessor.fit_transform(X_train)
preprocessed_train_features_data_frame = pd.DataFrame(data=preprocessed_train_features, columns=new_columns)
    
    
    ## Train the model
classifier.fit(preprocessed_train_features_data_frame, y_train)


  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
preprocessed_test_features = preprocessor.fit_transform(X_eval)
preprocessed_test_features_data_frame = pd.DataFrame(data=preprocessed_test_features, columns=new_columns)

In [42]:
# Predict for the evaluation set
print("Training Accuracy: %.2f" % (classifier.score(preprocessed_test_features_data_frame, y_eval)*100), "%")
y_pred = classifier.predict(preprocessed_test_features_data_frame)

Training Accuracy: 80.38 %


In [None]:
classifier.fit(preprocessed_test_features_data_frame, y_eval)

In [None]:
classifier.fit(preprocessed_train_features_data_frame, y_train_fold)(pd.Series(classifier.feature_importances_, index=new_columns)
   .nlargest(30)
   .plot(kind='barh'))

In [43]:
confusion_matrix = confusion_matrix(y_eval, y_pred)
confusion_matrix

array([[ 271,  233],
       [ 778, 3871]])

In [None]:
print(classification_report(y_eval, y_pred))

<h1>Model Validation</h1>

<h2>Loading the test data set</h2>

In [None]:
test_set = pd.read_csv(DATA_PATH / "test.csv", index_col="tripid")
test_set.head()

In [None]:
test_set.isna().sum()

<h2>Feature Addition for the Test Dataset</h2>

In [None]:
test_set['pickup_time'] = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_set['drop_time'] = pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M")

In [None]:
test_set = test_set.assign(timeOfDay=pd.cut(test_set.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [None]:
new_column = []                    #empty column for distance
for index,row in test_set.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

test_set .insert(4,"distance",new_column)

In [None]:
test_set['driving_fare'] = test_set['fare']  - test_set['meter_waiting_fare'] - test_set['additional_fare']

In [None]:
costsperkm = []
for index,row in test_set.iterrows():
    if row['driving_fare'] == 0:
        costperkm = 0 
    else:
        costperkm = (row['distance'] / row['driving_fare'])
    costsperkm.append(costperkm)

test_set.insert(4,"cost_per_km",costsperkm)


In [None]:
durations = []
for index,row in test_set.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

test_set.insert(4,"time_dif",durations)



In [None]:
test_set['time_driven'] = test_set['duration']  - test_set['meter_waiting']

In [None]:
avgspeeds = []
for index,row in test_set.iterrows():
    avgspeed = 0
    avgspeeds.append(avgspeed)

test_set.insert(4,"avg_speed",avgspeeds)


In [None]:
test_set['charge_per_hour'] = test_set['meter_waiting_fare'] / test_set['meter_waiting'] * 3600

In [None]:
test_set

In [None]:
test_features = test_set[new_columns]
preprocessed_test_features = preprocessor.fit_transform(test_features)
preprocessed_test_features_data_frame = pd.DataFrame(data=preprocessed_test_features, columns=new_columns)

<h2>Fare prediction and correctness prediction using Test Dataset </h2>

In [None]:
predicted_labels = classifier.predict(preprocessed_test_features_data_frame)

<h2>Writing to the Submission File</h2>

In [None]:
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")
submission_set.head()

submission_set['prediction']= predicted_labels

In [None]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "theNotebook = " + "'"+thename+"'";
kernel.execute(command);

In [None]:
filename = '../../submissions/'+theNotebook+'/'+theNotebook+'_{%i}.csv'
dirname = '../../submissions/'+theNotebook
fileversion = 1

if not os.path.exists(dirname):
    os.makedirs(dirname)
while glob.glob(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=True)
print("Completed!")

In [None]:
submission_set['prediction'].idxmin()

In [None]:
submission_set['prediction'].value_counts()