In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import math
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import os
import glob

In [2]:

data_path='/home/madnisal/Documents/ML_Project/datasets/'
training_df = pd.read_csv(data_path+'train.csv', index_col="tripid")

In [3]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.) **2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [4]:
training_df['pickup_time'] = pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M")
training_df['drop_time'] = pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M")

In [5]:
training_df = training_df.assign(timeOfDay=pd.cut(training_df.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [6]:
training_df.loc[training_df['timeOfDay'] == 'day', 'isNormalCharge'] = 1
training_df.loc[training_df['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [7]:
durations = []
for index,row in training_df.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

training_df.insert(4,"time_dif",durations)

In [8]:
new_column = []                    #empty column for distance
for index,row in training_df.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

training_df.insert(4,"distance",new_column)

In [9]:
training_df['time_driven'] = training_df['duration']  - training_df['meter_waiting']

In [10]:
chargeperhours = []
for index,row in training_df.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

training_df.insert(4,'charge_per_hour',chargeperhours)


In [11]:
training_df['driving_fare'] = training_df['fare']  - training_df['meter_waiting_fare'] - training_df['additional_fare']

In [12]:
avgspeeds = []
for index,row in training_df.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

training_df.insert(4,"avg_speed",avgspeeds)



In [13]:
costsperkm = []
for index,row in training_df.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

training_df.insert(4,"cost_per_km",costsperkm)

In [14]:
training_df = training_df.replace({'label': {'incorrect': 0, 'correct' : 1}})

In [15]:
training_df.columns

Index(['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'cost_per_km', 'avg_speed', 'charge_per_hour', 'distance', 'time_dif',
       'meter_waiting_till_pickup', 'pickup_time', 'drop_time', 'pick_lat',
       'pick_lon', 'drop_lat', 'drop_lon', 'fare', 'label', 'timeOfDay',
       'isNormalCharge', 'time_driven', 'driving_fare'],
      dtype='object')

In [16]:
training_columns = ['meter_waiting','meter_waiting_fare','fare','additional_fare', 'distance','cost_per_km', 'avg_speed',  'time_dif','time_driven', 'charge_per_hour', 'driving_fare', 'isNormalCharge','pick_lat','pick_lon','drop_lat','drop_lon']

In [17]:
target_column = ['label']

In [18]:
training_df = training_df.drop(190167541)

In [19]:
x = training_df[training_columns].values
y = training_df[target_column].values

In [20]:
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [21]:
training_df['label'].value_counts()

1    15494
0     1681
Name: label, dtype: int64

<h2>Choosing the best model</h2>

In [22]:
def xgb_f1(y, t, threshold=0.5):
    t = t.get_label()
    y_bin = [1. if y_cont > threshold else 0. for y_cont in y] # binarizing your output
    return 'f1',f1_score(t,y_bin)

In [23]:
gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [24]:
model = XGBClassifier(class_weight={0:2,1:1}, learning_rate=0.1)

In [25]:
for training_index, testing_index in gkf.split(X=x, y=y):
    x_train_fold, y_train_fold = x[training_index], y[training_index]
    x_test_fold, y_test_fold = x[testing_index], y[testing_index]
    model.fit(x_train_fold, y_train_fold, eval_set=[(x_test_fold,y_test_fold)])

Parameters: { class_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-error:0.05422
[1]	validation_0-error:0.05604
[2]	validation_0-error:0.05568
[3]	validation_0-error:0.05531
[4]	validation_0-error:0.05386


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[5]	validation_0-error:0.05349
[6]	validation_0-error:0.05204
[7]	validation_0-error:0.05022
[8]	validation_0-error:0.05131
[9]	validation_0-error:0.04985
[10]	validation_0-error:0.04840
[11]	validation_0-error:0.04949
[12]	validation_0-error:0.04949
[13]	validation_0-error:0.04840
[14]	validation_0-error:0.04804
[15]	validation_0-error:0.04658
[16]	validation_0-error:0.04585
[17]	validation_0-error:0.04658
[18]	validation_0-error:0.04731
[19]	validation_0-error:0.04658
[20]	validation_0-error:0.04585
[21]	validation_0-error:0.04585
[22]	validation_0-error:0.04476
[23]	validation_0-error:0.04512
[24]	validation_0-error:0.04549
[25]	validation_0-error:0.04621
[26]	validation_0-error:0.04549
[27]	validation_0-error:0.04585
[28]	validation_0-error:0.04585
[29]	validation_0-error:0.04585
[30]	validation_0-error:0.04621
[31]	validation_0-error:0.04585
[32]	validation_0-error:0.04585
[33]	validation_0-error:0.04621
[34]	validation_0-error:0.04621
[35]	validation_0-error:0.04585
[36]	validati

[41]	validation_0-error:0.04585
[42]	validation_0-error:0.04585
[43]	validation_0-error:0.04549
[44]	validation_0-error:0.04549
[45]	validation_0-error:0.04549
[46]	validation_0-error:0.04585
[47]	validation_0-error:0.04585
[48]	validation_0-error:0.04621
[49]	validation_0-error:0.04658
[50]	validation_0-error:0.04658
[51]	validation_0-error:0.04621
[52]	validation_0-error:0.04621
[53]	validation_0-error:0.04621
[54]	validation_0-error:0.04585
[55]	validation_0-error:0.04621
[56]	validation_0-error:0.04585
[57]	validation_0-error:0.04549
[58]	validation_0-error:0.04621
[59]	validation_0-error:0.04658
[60]	validation_0-error:0.04621
[61]	validation_0-error:0.04549
[62]	validation_0-error:0.04549
[63]	validation_0-error:0.04549
[64]	validation_0-error:0.04549
[65]	validation_0-error:0.04585
[66]	validation_0-error:0.04585
[67]	validation_0-error:0.04585
[68]	validation_0-error:0.04585
[69]	validation_0-error:0.04585
[70]	validation_0-error:0.04621
[71]	validation_0-error:0.04585
[72]	val

[77]	validation_0-error:0.04258
[78]	validation_0-error:0.04258
[79]	validation_0-error:0.04258
[80]	validation_0-error:0.04330
[81]	validation_0-error:0.04330
[82]	validation_0-error:0.04294
[83]	validation_0-error:0.04367
[84]	validation_0-error:0.04294
[85]	validation_0-error:0.04330
[86]	validation_0-error:0.04221
[87]	validation_0-error:0.04221
[88]	validation_0-error:0.04221
[89]	validation_0-error:0.04149
[90]	validation_0-error:0.04149
[91]	validation_0-error:0.04149
[92]	validation_0-error:0.04149
[93]	validation_0-error:0.04185
[94]	validation_0-error:0.04185
[95]	validation_0-error:0.04185
[96]	validation_0-error:0.04185
[97]	validation_0-error:0.04185
[98]	validation_0-error:0.04185
[99]	validation_0-error:0.04149


In [26]:
predicted_y = model.predict(x_test)
train_pred_y = model.predict(x)

In [27]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))

print(); print(metrics.classification_report(y_test, predicted_y))
print(); print(metrics.confusion_matrix(y_test, predicted_y))


              precision    recall  f1-score   support

           0       0.97      0.77      0.86      1345
           1       0.98      1.00      0.99     12395

    accuracy                           0.97     13740
   macro avg       0.97      0.88      0.92     13740
weighted avg       0.97      0.97      0.97     13740


[[ 1030   315]
 [   31 12364]]

              precision    recall  f1-score   support

           0       0.90      0.68      0.77       336
           1       0.97      0.99      0.98      3099

    accuracy                           0.96      3435
   macro avg       0.93      0.83      0.88      3435
weighted avg       0.96      0.96      0.96      3435


[[ 227  109]
 [  25 3074]]


<h2>Testing</h2>

In [28]:

test_set = pd.read_csv(data_path+'test.csv', index_col="tripid")


In [29]:
test_set['pickup_time'] = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_set['drop_time'] = pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M")

In [30]:
test_set = test_set.assign(timeOfDay=pd.cut(test_set.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [31]:
test_set.loc[test_set['timeOfDay'] == 'day', 'isNormalCharge'] = 1
test_set.loc[test_set['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [32]:
durations = []
for index,row in test_set.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

test_set.insert(4,"time_dif",durations)

In [33]:
new_column = []                    #empty column for distance
for index,row in test_set.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

test_set.insert(4,"distance",new_column)

In [34]:
test_set['time_driven'] = test_set['duration']  - test_set['meter_waiting']

In [35]:
chargeperhours = []
for index,row in test_set.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

test_set.insert(4,'charge_per_hour',chargeperhours)


In [36]:
test_set['driving_fare'] = test_set['fare']  - test_set['meter_waiting_fare'] - test_set['additional_fare']

In [37]:
avgspeeds = []
for index,row in test_set.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan    
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

test_set.insert(4,"avg_speed",avgspeeds)



In [38]:
costsperkm = []
for index,row in test_set.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

test_set.insert(4,"cost_per_km",costsperkm)

In [39]:
test_features = test_set[training_columns].values

In [40]:
predicted_labels = model.predict(test_features)

In [41]:
predicted_labels_df = pd.DataFrame(predicted_labels )

In [42]:
sub_path =os.path.abspath(os.path.join(data_path+'/sample_submission.csv'))
submission_set = pd.read_csv(sub_path, index_col="tripid")

In [43]:
submission_set['prediction']= predicted_labels_df.values[:,0]

In [44]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "theNotebook = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [46]:
filename = '../../submissions/'+theNotebook+'/'+theNotebook+'_{%i}.csv'
dirname = '../../submissions/'+theNotebook
fileversion = 1

if not os.path.exists(dirname):
    os.makedirs(dirname)
while glob.glob(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=True)
print("Completed!")

Completed!


In [45]:
submission_set['prediction'].value_counts()

1    8143
0     433
Name: prediction, dtype: int64