<h1>Imports</h1>

In [1]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import math
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import os
import glob

In [2]:
from lightgbm import LGBMRegressor
from lightgbm import LGBMClassifier

In [3]:
pd.set_option("display.max_columns", 100)
%matplotlib inline

In [4]:
RANDOM_SEED = 6

<h1>Dataset Importing</h1>

In [5]:
data_path='/home/madnisal/Documents/ML_Project/datasets/'
training_df = pd.read_csv(data_path+'train.csv', index_col="tripid")

<h1>Data Preprocessing</h1>

In [6]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.) **2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [7]:
training_df['pickup_time'] = pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M")
training_df['drop_time'] = pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M")

In [8]:
training_df = training_df.assign(timeOfDay=pd.cut(training_df.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [9]:
training_df.loc[training_df['timeOfDay'] == 'day', 'isNormalCharge'] = 1
training_df.loc[training_df['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [10]:
durations = []
for index,row in training_df.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

training_df.insert(4,"time_dif",durations)

In [11]:
new_column = []                    #empty column for distance
for index,row in training_df.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

training_df.insert(4,"distance",new_column)

In [12]:
training_df['time_driven'] = training_df['duration']  - training_df['meter_waiting']

In [13]:
chargeperhours = []
for index,row in training_df.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

training_df.insert(4,'charge_per_hour',chargeperhours)


In [14]:
training_df['driving_fare'] = training_df['fare']  - training_df['meter_waiting_fare'] - training_df['additional_fare']

In [15]:
avgspeeds = []
for index,row in training_df.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

training_df.insert(4,"avg_speed",avgspeeds)



In [16]:
costsperkm = []
for index,row in training_df.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

training_df.insert(4,"cost_per_km",costsperkm)

In [17]:
training_df = training_df.replace({'label': {'incorrect': 0, 'correct' : 1}})

In [18]:
training_df = training_df.drop(190167541)

<h1>Model Training</h1>

<h2> Part 1 : Fare Predictor </h2>
Predict fare using features

In [19]:
# Extract Correctly predicted fares
correct_training_df = training_df[training_df['label'] == 1]

In [20]:
fare_prediction_features =  ['additional_fare','cost_per_km', 'avg_speed', 'charge_per_hour',  'isNormalCharge']

In [21]:
fare_prediction_target_column = ['fare']

In [22]:
predictor_x = correct_training_df[fare_prediction_features].values
predictor_y = correct_training_df[fare_prediction_target_column].values

In [23]:
x, x_test, y, y_test = train_test_split(predictor_x, predictor_y, test_size=0.2, random_state=RANDOM_SEED)

In [24]:
gkf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [25]:
estimator = LGBMRegressor()

In [26]:
for training_index, testing_index in gkf.split(X=x, y=y):
    x_train_fold, y_train_fold = x[training_index], y[training_index]
    x_test_fold, y_test_fold = x[testing_index], y[testing_index]
    estimator.fit(x_train_fold, y_train_fold)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [27]:
predicted_prices = estimator.predict(training_df[fare_prediction_features])

In [28]:
training_df.insert(13,'predicted_price',predicted_prices)

In [29]:
# Training dataset after new features inserted
training_df[training_df['label']==1]

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,cost_per_km,avg_speed,charge_per_hour,distance,time_dif,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,predicted_price,pick_lon,drop_lat,drop_lon,fare,label,timeOfDay,isNormalCharge,time_driven,driving_fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
189123628,10.5,834.0,56.0,0.000000,51.017424,23.565516,0.000000,5.092770,834.0,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,372.778959,79.8993,6.90330,79.8783,270.32,1,dawn,0.0,778.0,259.820000
189125358,10.5,791.0,47.0,0.000000,59.137183,15.329311,0.000000,3.168058,791.0,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,197.166856,79.8984,6.91373,79.8923,197.85,1,dawn,0.0,744.0,187.350000
189125719,10.5,1087.0,80.0,0.000000,46.173157,22.541631,0.000000,6.305395,1087.0,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,328.388893,79.8651,6.93669,79.9146,301.64,1,dawn,0.0,1007.0,291.140000
189127273,10.5,598.0,271.0,15.663800,65.127273,9.489315,208.080000,0.861946,598.0,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.92570,292.707083,79.8895,6.92748,79.8971,82.30,1,dawn,0.0,327.0,56.136200
189128020,,,,,,,,8.147782,1020.0,,2019-11-01 03:34:00,2019-11-01 03:51:00,6.87441,250.812603,79.8615,6.84478,79.9290,358.39,1,dawn,0.0,,
189129552,10.5,3407.0,182.0,0.000000,43.562535,27.021811,0.000000,24.207039,3407.0,112.0,2019-11-01 05:38:00,2019-11-01 06:35:00,7.13402,437.869899,79.8969,6.91865,79.8649,1065.02,1,dawn,0.0,3225.0,1054.520000
189132829,10.5,1246.0,487.0,0.000000,53.608242,22.660665,0.000000,4.777624,1246.0,133.0,2019-11-01 06:29:00,2019-11-01 06:49:00,6.84371,322.736174,79.9051,6.85069,79.8624,266.62,1,dawn,0.0,759.0,256.120000
189135103,10.5,1333.0,295.0,17.198500,54.551267,18.459689,209.880000,5.322544,1333.0,212.0,2019-11-01 06:50:00,2019-11-01 07:12:00,6.90760,195.772906,79.9524,6.90634,79.9042,318.05,1,dawn,0.0,1038.0,290.351500
189139296,10.5,360.0,80.0,4.664000,82.252341,13.311024,209.880000,1.035302,360.0,3.0,2019-11-01 07:00:00,2019-11-01 07:06:00,7.26706,148.990574,80.6064,7.27422,80.6124,100.32,1,dawn,0.0,280.0,85.156000
189138671,10.5,1539.0,588.0,33.986400,72.816218,11.094190,208.080000,2.930715,1539.0,43.0,2019-11-01 07:02:00,2019-11-01 07:28:00,6.85137,294.744364,79.9537,6.84779,79.9274,257.89,1,dawn,0.0,951.0,213.403600


<h2>Part 2: Training the classifier</h2>

In [30]:
from sklearn.metrics import f1_score
def evaluate_macroF1_lgb(y_true, y_pred):  
    y_hat = np.where(y_pred < 0.5, 0, 1) 
    f1 = f1_score(y_true, y_hat, average='macro')
    return ('macroF1', f1, True) 

In [31]:
def focal_loss_lgb_eval_error(y_true, y_pred, alpha=.25, gamma=2.):
    a,g = alpha, gamma
    p = 1/(1+np.exp(-y_pred))
    loss = -( a*y_true + (1-a)*(1-y_true) ) * (( 1 - ( y_true*p + (1-y_true)*(1-p)) )**g) * ( y_true*np.log(p)+(1-y_true)*np.log(1-p) )
    return 'focal_loss', np.mean(loss), False

In [32]:
classifier_features = ['predicted_price','fare']
classifier_label = ['label']

In [33]:
class_x = training_df[classifier_features].values
class_y = training_df[classifier_label].values

In [34]:
x, x_test, y, y_test = train_test_split(class_x, class_y ,test_size=0.2, random_state=RANDOM_SEED)

In [35]:
sgkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [36]:
model = lightgbm.LGBMClassifier(class_weight={0:3,1:1}, learning_rate=0.1)

In [37]:
for training_index, testing_index in sgkf.split(X=x, y=y):
    x_train_fold, y_train_fold = x[training_index], y[training_index]
    x_test_fold, y_test_fold = x[testing_index], y[testing_index]
    model.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold,y_test_fold),eval_metric = lambda y_true, y_pred: [evaluate_macroF1_lgb(y_true,y_pred)])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1]	valid_0's binary_logloss: 0.371076	valid_0's macroF1: 0.474268
[2]	valid_0's binary_logloss: 0.355005	valid_0's macroF1: 0.474268
[3]	valid_0's binary_logloss: 0.341902	valid_0's macroF1: 0.474268
[4]	valid_0's binary_logloss: 0.33091	valid_0's macroF1: 0.597088
[5]	valid_0's binary_logloss: 0.321741	valid_0's macroF1: 0.671744
[6]	valid_0's binary_logloss: 0.313766	valid_0's macroF1: 0.710228
[7]	valid_0's binary_logloss: 0.306864	valid_0's macroF1: 0.715136
[8]	valid_0's binary_logloss: 0.301029	valid_0's macroF1: 0.719918
[9]	valid_0's binary_logloss: 0.296251	valid_0's macroF1: 0.721245
[10]	valid_0's binary_logloss: 0.291769	valid_0's macroF1: 0.721245
[11]	valid_0's binary_logloss: 0.287806	valid_0's macroF1: 0.724581
[12]	valid_0's binary_logloss: 0.284668	valid_0's macroF1: 0.720751
[13]	valid_0's binary_logloss: 0.281611	valid_0's macroF1: 0.724996
[14]	valid_0's binary_logloss: 0.279124	valid_0's macroF1: 0.72382
[15]	valid_0's binary_logloss: 0.276785	valid_0's macroF1: 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[30]	valid_0's binary_logloss: 0.257369	valid_0's macroF1: 0.762798
[31]	valid_0's binary_logloss: 0.256932	valid_0's macroF1: 0.759785
[32]	valid_0's binary_logloss: 0.256385	valid_0's macroF1: 0.763183
[33]	valid_0's binary_logloss: 0.256294	valid_0's macroF1: 0.763183
[34]	valid_0's binary_logloss: 0.256129	valid_0's macroF1: 0.759316
[35]	valid_0's binary_logloss: 0.256114	valid_0's macroF1: 0.759316
[36]	valid_0's binary_logloss: 0.256045	valid_0's macroF1: 0.759316
[37]	valid_0's binary_logloss: 0.255851	valid_0's macroF1: 0.759316
[38]	valid_0's binary_logloss: 0.255882	valid_0's macroF1: 0.758648
[39]	valid_0's binary_logloss: 0.25577	valid_0's macroF1: 0.756444
[40]	valid_0's binary_logloss: 0.255743	valid_0's macroF1: 0.757108
[41]	valid_0's binary_logloss: 0.255694	valid_0's macroF1: 0.757108
[42]	valid_0's binary_logloss: 0.2556	valid_0's macroF1: 0.756444
[43]	valid_0's binary_logloss: 0.255769	valid_0's macroF1: 0.756444
[44]	valid_0's binary_logloss: 0.255539	valid_0's m

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[10]	valid_0's binary_logloss: 0.285635	valid_0's macroF1: 0.75882
[11]	valid_0's binary_logloss: 0.281643	valid_0's macroF1: 0.76159
[12]	valid_0's binary_logloss: 0.278236	valid_0's macroF1: 0.758353
[13]	valid_0's binary_logloss: 0.2752	valid_0's macroF1: 0.760101
[14]	valid_0's binary_logloss: 0.272552	valid_0's macroF1: 0.758716
[15]	valid_0's binary_logloss: 0.269889	valid_0's macroF1: 0.758252
[16]	valid_0's binary_logloss: 0.267698	valid_0's macroF1: 0.763596
[17]	valid_0's binary_logloss: 0.265836	valid_0's macroF1: 0.762892
[18]	valid_0's binary_logloss: 0.26413	valid_0's macroF1: 0.761765
[19]	valid_0's binary_logloss: 0.262432	valid_0's macroF1: 0.761765
[20]	valid_0's binary_logloss: 0.261276	valid_0's macroF1: 0.765765
[21]	valid_0's binary_logloss: 0.259959	valid_0's macroF1: 0.764185
[22]	valid_0's binary_logloss: 0.25886	valid_0's macroF1: 0.764185
[23]	valid_0's binary_logloss: 0.25798	valid_0's macroF1: 0.765068
[24]	valid_0's binary_logloss: 0.257131	valid_0's macro

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[52]	valid_0's binary_logloss: 0.265379	valid_0's macroF1: 0.734002
[53]	valid_0's binary_logloss: 0.265551	valid_0's macroF1: 0.734002
[54]	valid_0's binary_logloss: 0.265634	valid_0's macroF1: 0.734002
[55]	valid_0's binary_logloss: 0.265648	valid_0's macroF1: 0.734979
[56]	valid_0's binary_logloss: 0.265629	valid_0's macroF1: 0.733721
[57]	valid_0's binary_logloss: 0.265811	valid_0's macroF1: 0.733721
[58]	valid_0's binary_logloss: 0.266004	valid_0's macroF1: 0.73492
[59]	valid_0's binary_logloss: 0.265956	valid_0's macroF1: 0.734349
[60]	valid_0's binary_logloss: 0.266003	valid_0's macroF1: 0.734349
[61]	valid_0's binary_logloss: 0.265981	valid_0's macroF1: 0.732117
[62]	valid_0's binary_logloss: 0.265751	valid_0's macroF1: 0.732117
[63]	valid_0's binary_logloss: 0.265783	valid_0's macroF1: 0.732117
[64]	valid_0's binary_logloss: 0.265985	valid_0's macroF1: 0.732655
[65]	valid_0's binary_logloss: 0.266	valid_0's macroF1: 0.73329
[66]	valid_0's binary_logloss: 0.265992	valid_0's mac

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



[6]	valid_0's binary_logloss: 0.309702	valid_0's macroF1: 0.761227
[7]	valid_0's binary_logloss: 0.303004	valid_0's macroF1: 0.764631
[8]	valid_0's binary_logloss: 0.297262	valid_0's macroF1: 0.771762
[9]	valid_0's binary_logloss: 0.292479	valid_0's macroF1: 0.770981
[10]	valid_0's binary_logloss: 0.288186	valid_0's macroF1: 0.770203
[11]	valid_0's binary_logloss: 0.284525	valid_0's macroF1: 0.772155
[12]	valid_0's binary_logloss: 0.28105	valid_0's macroF1: 0.770611
[13]	valid_0's binary_logloss: 0.278461	valid_0's macroF1: 0.770809
[14]	valid_0's binary_logloss: 0.276145	valid_0's macroF1: 0.767772
[15]	valid_0's binary_logloss: 0.274048	valid_0's macroF1: 0.765913
[16]	valid_0's binary_logloss: 0.27191	valid_0's macroF1: 0.763096
[17]	valid_0's binary_logloss: 0.27008	valid_0's macroF1: 0.76349
[18]	valid_0's binary_logloss: 0.268438	valid_0's macroF1: 0.76349
[19]	valid_0's binary_logloss: 0.266746	valid_0's macroF1: 0.762798
[20]	valid_0's binary_logloss: 0.26554	valid_0's macroF1

In [38]:
predicted_y = model.predict(x_test)
train_pred_y = model.predict(x)

In [39]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))

print(); print(metrics.classification_report(y_test, predicted_y))
print(); print(metrics.confusion_matrix(y_test, predicted_y))


              precision    recall  f1-score   support

           0       0.66      0.57      0.61      1346
           1       0.95      0.97      0.96     12394

    accuracy                           0.93     13740
   macro avg       0.81      0.77      0.79     13740
weighted avg       0.92      0.93      0.93     13740


[[  767   579]
 [  402 11992]]

              precision    recall  f1-score   support

           0       0.64      0.53      0.58       335
           1       0.95      0.97      0.96      3100

    accuracy                           0.93      3435
   macro avg       0.80      0.75      0.77      3435
weighted avg       0.92      0.93      0.92      3435


[[ 176  159]
 [  98 3002]]


<h1>Model Validation</h1>

<h2>Loading the test data set</h2>

In [None]:
test_set = pd.read_csv(DATA_PATH / "test.csv", index_col="tripid")
test_set.head()

In [None]:
test_set.isna().sum()

<h2>Feature Addition for the Test Dataset</h2>

In [None]:
test_set['pickup_time'] = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_set['drop_time'] = pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M")

In [None]:
test_set = test_set.assign(timeOfDay=pd.cut(test_set.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [None]:
new_column = []                    #empty column for distance
for index,row in test_set.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

test_set .insert(4,"distance",new_column)

In [None]:
durations = []
for index,row in test_set.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = 60
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

test_set.insert(4,"time_dif",durations)



In [None]:
test_set['avg_speed'] = (test_set['distance'] /  ( test_set['time_dif']) * 3600 )

In [None]:
test_set

In [None]:
test_features = test_set[['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup','distance','avg_speed']]
preprocessed_test_features = preprocessor.fit_transform(test_features)
preprocessed_test_features_data_frame = pd.DataFrame(data=preprocessed_test_features, columns=new_columns)

<h2>Fare prediction and correctness prediction using Test Dataset </h2>

In [None]:
test_probs = estimator.predict(preprocessed_test_features_data_frame)

In [None]:
test_set.insert(10,'predicted_price',test_probs)

In [None]:
classifier_test_features = test_set[['predicted_price','fare']]

In [None]:
predicted_labels = classifier.predict(classifier_test_features)

<h2>Writing to the Submission File</h2>

In [None]:
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")
submission_set.head()

submission_set['prediction']= predicted_labels

In [None]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "theNotebook = " + "'"+thename+"'";
kernel.execute(command);

In [None]:
filename = '../../submissions/'+theNotebook+'/teamCluster_submission_{%i}.csv'
dirname = '../../submissions/'+theNotebook
fileversion = 1

if not os.path.exists(dirname):
    os.makedirs(dirname)
while glob.glob(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=True)
print("Completed!")

In [None]:
submission_set['prediction'].idxmin()

In [None]:
submission_set['prediction'].value_counts()