In [1]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import math
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import os
import glob

In [2]:

data_path='/home/madnisal/Documents/ML_Project/datasets/'
training_df = pd.read_csv(data_path+'train.csv', index_col="tripid")

In [3]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.) **2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [4]:
training_df['pickup_time'] = pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M")
training_df['drop_time'] = pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M")

In [5]:
training_df = training_df.assign(timeOfDay=pd.cut(training_df.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [6]:
training_df.loc[training_df['timeOfDay'] == 'day', 'isNormalCharge'] = 1
training_df.loc[training_df['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [7]:
durations = []
for index,row in training_df.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

training_df.insert(4,"time_dif",durations)

In [8]:
new_column = []                    #empty column for distance
for index,row in training_df.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

training_df.insert(4,"distance",new_column)

In [9]:
training_df['time_driven'] = training_df['duration']  - training_df['meter_waiting']

In [10]:
chargeperhours = []
for index,row in training_df.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

training_df.insert(4,'charge_per_hour',chargeperhours)


In [11]:
training_df['driving_fare'] = training_df['fare']  - training_df['meter_waiting_fare'] - training_df['additional_fare']

In [12]:
avgspeeds = []
for index,row in training_df.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

training_df.insert(4,"avg_speed",avgspeeds)



In [13]:
costsperkm = []
for index,row in training_df.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

training_df.insert(4,"cost_per_km",costsperkm)

In [14]:
training_df = training_df.replace({'label': {'incorrect': 0, 'correct' : 1}})

In [15]:
training_df.columns

Index(['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'cost_per_km', 'avg_speed', 'charge_per_hour', 'distance', 'time_dif',
       'meter_waiting_till_pickup', 'pickup_time', 'drop_time', 'pick_lat',
       'pick_lon', 'drop_lat', 'drop_lon', 'fare', 'label', 'timeOfDay',
       'isNormalCharge', 'time_driven', 'driving_fare'],
      dtype='object')

In [16]:
#training_columns = ['duration','meter_waiting','meter_waiting_fare','fare','additional_fare']

In [17]:
training_columns = ['meter_waiting','meter_waiting_fare','fare','additional_fare', 'distance','cost_per_km', 'avg_speed', 'pick_lat',
       'pick_lon', 'drop_lat', 'drop_lon', 'isNormalCharge',  'time_dif','time_driven', 'charge_per_hour', 'driving_fare']

In [18]:
#training_columns = ['additional_fare', 'meter_waiting','cost_per_km', 'avg_speed', 'charge_per_hour', 'time_dif','isNormalCharge', 'time_driven']

In [19]:
#training_columns = ['duration','meter_waiting','meter_waiting_fare','fare','additional_fare', 'distance','cost_per_km', 'avg_speed',  'time_dif','time_driven', 'charge_per_hour', 'driving_fare', 'isNormalCharge']

In [20]:
target_column = ['label']

In [None]:
del_list = [189160756, 189213407, 189307311, 189345964, 189418240, 189431382, 189445521, 189475280, 189663502, 189667686, 189670152, 189765236, 189800308, 189820422, 189841958, 189842329, 189989861, 190017339, 190059653, 190100669, 190108454, 190167541, 190189374, 190240764, 190371740, 190390160, 190392777, 190439600, 190465231, 190579195, 190616564, 190630270, 190632321, 190679701, 190757034, 190768625, 190871768, 191004988, 191056435, 191062441, 191067600, 191098402, 191125782, 191175923, 191245326, 191253555, 191315141, 191328606, 191362317, 191364887, 191367425, 191371136, 191409182, 191425016, 191486731, 191494949, 191557599, 191569507, 191584433, 191949862, 191961350, 192118009, 192118666, 192121471, 192190626, 192227971, 192342254, 192424245, 192603743, 192666104, 192680516, 192749835, 192930989, 193029201, 193333812, 193341290, 193363687, 193453026, 193464877, 193484505, 193659984, 193677619, 193755306, 193781025, 193827380, 193841338, 193848497, 193874272, 193886201, 193898575, 193912639, 194037315, 194218927, 194220232, 194256573, 194329792, 194337863, 194348937, 194361105, 194399192, 194468034, 194479410, 194554416, 194556014, 194579174, 194590549, 194653084, 194683585, 194713405, 194732335, 194742273, 195063269, 195068585, 195113569, 195171803, 195205081, 195229926, 195315462, 195411360, 195473412, 195505361, 195506533, 195519176, 195591902, 195593822, 195618766, 195749648, 195770562, 195790876, 196371958, 196472675, 196474232, 196476814, 196489680, 196502361, 196538137, 196661640, 196856208, 196940311, 196948918, 197034128, 197075221, 197150903, 197191446, 197242047, 197269750, 197302262, 197404081, 197421930, 197580331, 197583750, 197600970, 197814149, 197835993, 197976560, 197984654, 198205664, 198395566, 198411902, 198429392, 198486416, 198510589, 198585995, 198659662, 198685804, 198704554, 198711403, 198717134, 198732301, 198792329, 198823534, 198833045, 198836780, 198940643, 199006853, 199068515, 199119071, 199156085, 199211192, 199229511, 199281693, 199550110, 199593615, 199660617, 199689558, 199868018, 199986019, 200120578, 200169960, 200203616, 200235706, 200258292, 200259718, 200573263, 200620282, 200718521, 200760625, 200764288, 200829063, 200943864, 201159410, 201196145, 201211325, 201321715, 201478900, 201540569, 201611493, 201622567, 201683301, 201711200, 201718751, 201727138, 201737874, 201784877, 201930790, 201952821, 201966970, 201982478, 202210947, 202374449, 202465541, 202477661, 202622084, 202716809, 202770147, 202771469, 202780691, 202830659, 202895839, 202901039, 202942020, 203049098, 203067687, 203275241, 203289347, 203329568, 203358177, 203475249, 203526068, 203623210, 203641274, 203653944, 203783539, 203838464, 204022612, 204029214, 204081147, 204295195, 204297059, 204305036, 204308730, 204323575, 204353372, 204383964, 204407357, 204637231, 204654436, 204745747, 204857540, 204877975, 204944899, 205028512, 205035393, 205110890, 205328050, 205356666, 205639292, 205677503, 205897073, 206009807, 206295700, 206342353, 206347118, 206394611, 206410922, 206441957, 206734980, 206779752, 206833330, 206896653, 206909672, 206925971, 207105204, 207124327, 207133463, 207211758, 207405568, 207498583, 207557863, 207637448, 207739129, 207766255, 207772884, 207826142, 207848915, 207895791, 208039159, 208061508, 208081080, 208215823, 208400084, 208409330, 208506440, 208520089, 208543920, 208633480, 208638800, 208649108, 208649859, 208670051, 208813654, 208823368, 208824970, 208867641, 208922571, 208949007, 209005243, 209105215, 209157284, 209365576, 209391412, 209714447, 209817987, 209830326, 209830968, 209833884, 209881355, 209899881, 209908453, 209930461, 209969323, 209971365, 209981901, 210073545, 210158716, 210159461, 210315120, 210316480, 210423048, 210427333, 210441714, 210527836, 210573839, 210582668, 210615078, 210617878, 210734578, 210771451, 210786384, 210894816, 210905360, 210911847, 210924086, 210924331, 211031561, 211067285, 211090933, 211121898, 211157832, 211159693, 211211419, 211226655, 211409420, 211412322, 211428449, 211449915, 211471146, 211478629, 211496213, 211632914, 211645955, 211749880, 211819633, 211939462, 211963440, 212032780, 212075792, 212150421, 212157536, 212336274, 212362219, 212369304, 212390182, 212472158, 212573729, 212707890, 212781236, 212942286, 212954387, 212961544, 212995896, 213017985, 213133599, 213138977, 213206900, 213467729, 213481304, 213552291, 213647286, 213667074, 213787716]

In [None]:
for i in del_list :
    training_df = training_df.drop(i)

In [21]:
training_df = training_df.drop(190167541)

In [22]:
x = training_df[training_columns].values
y = training_df[target_column].values

In [23]:
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=41, stratify=y)

In [24]:
training_df['label'].value_counts()

1    15494
0     1681
Name: label, dtype: int64

<h2>Hyper tuning</h2>

In [None]:
param_grid = {
    'num_leaves': [15,31, 63],
    'learning_rate': [0.1, 0.01],
    'class_weight' : [{0:4,1:1}, {0:3,1:1}],
    'num_boosting_rounds': [100,500],
    'max_bins': [10,100,1000],
    'n_estimators': [50,100,200],
    'reg_alpha': [0.1, 0.5],
    'random_state': [1,8,16,64],
    'min_data_in_leaf': [30,100,400],
    'lambda_l2': [0, 1]
    }

In [None]:
model = lightgbm.LGBMClassifier()

In [None]:
gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X=x, y=y)

In [None]:
gs = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=gkf, scoring='f1_macro',
    verbose=True, n_jobs=3)

In [None]:
gs.fit(x, y.ravel())

In [None]:
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

In [None]:
predicted_y = gs.predict(x_test)
train_pred_y = gs.predict(x)

In [None]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))

print(); print(metrics.classification_report(y_test, predicted_y))
print(); print(metrics.confusion_matrix(y_test, predicted_y))

<h2>Choosing the best model</h2>

In [25]:
import math

In [26]:
scale_pos_weight = math.sqrt(1681/15494)

In [27]:
scale_pos_weight

0.32938368270125923

In [28]:
from sklearn.metrics import f1_score
def evaluate_macroF1_lgb(y_true, y_pred):  
    y_hat = np.where(y_pred < 0.5, 0, 1) 
    f1 = f1_score(y_true, y_hat, average='macro')
    return ('macroF1', f1, True) 

In [29]:
def focal_loss_lgb_eval_error(y_true, y_pred, alpha=.25, gamma=2.):
    a,g = alpha, gamma
    p = 1/(1+np.exp(-y_pred))
    loss = -( a*y_true + (1-a)*(1-y_true) ) * (( 1 - ( y_true*p + (1-y_true)*(1-p)) )**g) * ( y_true*np.log(p)+(1-y_true)*np.log(1-p) )
    return 'focal_loss', np.mean(loss), False

In [30]:
gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [31]:
model = lightgbm.LGBMClassifier(boosting_type='gbdt',scale_pos_weight = scale_pos_weight, learning_rate=0.01, max_bins=10, min_data_in_leaf=60, n_estimators=100, num_iterations=1000, num_leaves=63, random_state=1, reg_alpha=0.1, metric=["custom",'binary_logloss'],early_stopping_rounds=250)

In [32]:
model = lightgbm.LGBMClassifier(class_weight={0:3,1:1}, learning_rate=0.1)

In [33]:
for training_index, testing_index in gkf.split(X=x, y=y):
    x_train_fold, y_train_fold = x[training_index], y[training_index]
    x_test_fold, y_test_fold = x[testing_index], y[testing_index]
    model.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold,y_test_fold),eval_metric = lambda y_true, y_pred: [evaluate_macroF1_lgb(y_true,y_pred)])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1]	valid_0's binary_logloss: 0.360745	valid_0's macroF1: 0.474268
[2]	valid_0's binary_logloss: 0.335604	valid_0's macroF1: 0.474268
[3]	valid_0's binary_logloss: 0.314253	valid_0's macroF1: 0.474268
[4]	valid_0's binary_logloss: 0.296994	valid_0's macroF1: 0.701927
[5]	valid_0's binary_logloss: 0.28144	valid_0's macroF1: 0.77926
[6]	valid_0's binary_logloss: 0.269333	valid_0's macroF1: 0.806606
[7]	valid_0's binary_logloss: 0.25789	valid_0's macroF1: 0.815125
[8]	valid_0's binary_logloss: 0.24731	valid_0's macroF1: 0.823174
[9]	valid_0's binary_logloss: 0.238513	valid_0's macroF1: 0.825657
[10]	valid_0's binary_logloss: 0.231176	valid_0's macroF1: 0.834603
[11]	valid_0's binary_logloss: 0.22463	valid_0's macroF1: 0.828972
[12]	valid_0's binary_logloss: 0.218119	valid_0's macroF1: 0.838678
[13]	valid_0's binary_logloss: 0.212492	valid_0's macroF1: 0.838082
[14]	valid_0's binary_logloss: 0.207409	valid_0's macroF1: 0.838678
[15]	valid_0's binary_logloss: 0.202752	valid_0's macroF1: 0.8

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[4]	valid_0's binary_logloss: 0.298021	valid_0's macroF1: 0.645785
[5]	valid_0's binary_logloss: 0.28217	valid_0's macroF1: 0.779572
[6]	valid_0's binary_logloss: 0.269259	valid_0's macroF1: 0.810983
[7]	valid_0's binary_logloss: 0.257884	valid_0's macroF1: 0.823258
[8]	valid_0's binary_logloss: 0.248126	valid_0's macroF1: 0.831628
[9]	valid_0's binary_logloss: 0.238867	valid_0's macroF1: 0.835649
[10]	valid_0's binary_logloss: 0.231303	valid_0's macroF1: 0.83417
[11]	valid_0's binary_logloss: 0.224715	valid_0's macroF1: 0.845042
[12]	valid_0's binary_logloss: 0.218682	valid_0's macroF1: 0.851323
[13]	valid_0's binary_logloss: 0.21306	valid_0's macroF1: 0.853512
[14]	valid_0's binary_logloss: 0.207729	valid_0's macroF1: 0.852895
[15]	valid_0's binary_logloss: 0.203315	valid_0's macroF1: 0.851221
[16]	valid_0's binary_logloss: 0.199392	valid_0's macroF1: 0.848451
[17]	valid_0's binary_logloss: 0.19603	valid_0's macroF1: 0.850632
[18]	valid_0's binary_logloss: 0.192778	valid_0's macroF1:

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[22]	valid_0's binary_logloss: 0.182535	valid_0's macroF1: 0.850098
[23]	valid_0's binary_logloss: 0.180672	valid_0's macroF1: 0.851458
[24]	valid_0's binary_logloss: 0.178764	valid_0's macroF1: 0.852812
[25]	valid_0's binary_logloss: 0.176382	valid_0's macroF1: 0.854162
[26]	valid_0's binary_logloss: 0.175132	valid_0's macroF1: 0.853334
[27]	valid_0's binary_logloss: 0.173757	valid_0's macroF1: 0.850867
[28]	valid_0's binary_logloss: 0.172269	valid_0's macroF1: 0.852509
[29]	valid_0's binary_logloss: 0.171093	valid_0's macroF1: 0.850867
[30]	valid_0's binary_logloss: 0.169823	valid_0's macroF1: 0.85005
[31]	valid_0's binary_logloss: 0.168392	valid_0's macroF1: 0.851161
[32]	valid_0's binary_logloss: 0.166959	valid_0's macroF1: 0.85034
[33]	valid_0's binary_logloss: 0.166022	valid_0's macroF1: 0.847893
[34]	valid_0's binary_logloss: 0.165314	valid_0's macroF1: 0.848706
[35]	valid_0's binary_logloss: 0.164727	valid_0's macroF1: 0.848953
[36]	valid_0's binary_logloss: 0.164581	valid_0's 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[12]	valid_0's binary_logloss: 0.216	valid_0's macroF1: 0.851519
[13]	valid_0's binary_logloss: 0.210408	valid_0's macroF1: 0.851221
[14]	valid_0's binary_logloss: 0.205445	valid_0's macroF1: 0.854265
[15]	valid_0's binary_logloss: 0.201017	valid_0's macroF1: 0.853952
[16]	valid_0's binary_logloss: 0.19688	valid_0's macroF1: 0.852286
[17]	valid_0's binary_logloss: 0.192905	valid_0's macroF1: 0.856847
[18]	valid_0's binary_logloss: 0.189326	valid_0's macroF1: 0.859016
[19]	valid_0's binary_logloss: 0.186374	valid_0's macroF1: 0.857351
[20]	valid_0's binary_logloss: 0.183501	valid_0's macroF1: 0.859175
[21]	valid_0's binary_logloss: 0.181269	valid_0's macroF1: 0.858348
[22]	valid_0's binary_logloss: 0.178763	valid_0's macroF1: 0.857851
[23]	valid_0's binary_logloss: 0.17647	valid_0's macroF1: 0.861325
[24]	valid_0's binary_logloss: 0.173918	valid_0's macroF1: 0.866093
[25]	valid_0's binary_logloss: 0.172133	valid_0's macroF1: 0.866558
[26]	valid_0's binary_logloss: 0.1703	valid_0's macro

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[24]	valid_0's binary_logloss: 0.178986	valid_0's macroF1: 0.846752
[25]	valid_0's binary_logloss: 0.17694	valid_0's macroF1: 0.848307
[26]	valid_0's binary_logloss: 0.175141	valid_0's macroF1: 0.846752
[27]	valid_0's binary_logloss: 0.173857	valid_0's macroF1: 0.845978
[28]	valid_0's binary_logloss: 0.172066	valid_0's macroF1: 0.850085
[29]	valid_0's binary_logloss: 0.170385	valid_0's macroF1: 0.851357
[30]	valid_0's binary_logloss: 0.169298	valid_0's macroF1: 0.849801
[31]	valid_0's binary_logloss: 0.168033	valid_0's macroF1: 0.852139
[32]	valid_0's binary_logloss: 0.166199	valid_0's macroF1: 0.852923
[33]	valid_0's binary_logloss: 0.16516	valid_0's macroF1: 0.854499
[34]	valid_0's binary_logloss: 0.163692	valid_0's macroF1: 0.855932
[35]	valid_0's binary_logloss: 0.162536	valid_0's macroF1: 0.8583
[36]	valid_0's binary_logloss: 0.161708	valid_0's macroF1: 0.85577
[37]	valid_0's binary_logloss: 0.160896	valid_0's macroF1: 0.85577
[38]	valid_0's binary_logloss: 0.160424	valid_0's macr

In [34]:
predicted_y = model.predict(x_test)
train_pred_y = model.predict(x)

In [35]:
train_pred_y = model.predict(x)

In [36]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))

print(); print(metrics.classification_report(y_test, predicted_y))
print(); print(metrics.confusion_matrix(y_test, predicted_y))


              precision    recall  f1-score   support

           0       0.90      0.89      0.90      1345
           1       0.99      0.99      0.99     12395

    accuracy                           0.98     13740
   macro avg       0.94      0.94      0.94     13740
weighted avg       0.98      0.98      0.98     13740


[[ 1202   143]
 [  135 12260]]

              precision    recall  f1-score   support

           0       0.77      0.73      0.75       336
           1       0.97      0.98      0.97      3099

    accuracy                           0.95      3435
   macro avg       0.87      0.85      0.86      3435
weighted avg       0.95      0.95      0.95      3435


[[ 245   91]
 [  72 3027]]


In [37]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))


              precision    recall  f1-score   support

           0       0.90      0.89      0.90      1345
           1       0.99      0.99      0.99     12395

    accuracy                           0.98     13740
   macro avg       0.94      0.94      0.94     13740
weighted avg       0.98      0.98      0.98     13740


[[ 1202   143]
 [  135 12260]]


<h2>Testing</h2>

In [38]:

test_set = pd.read_csv(data_path+'test.csv', index_col="tripid")


In [39]:
test_set['pickup_time'] = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_set['drop_time'] = pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M")

In [40]:
test_set = test_set.assign(timeOfDay=pd.cut(test_set.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [41]:
test_set.loc[test_set['timeOfDay'] == 'day', 'isNormalCharge'] = 1
test_set.loc[test_set['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [42]:
durations = []
for index,row in test_set.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

test_set.insert(4,"time_dif",durations)

In [43]:
new_column = []                    #empty column for distance
for index,row in test_set.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

test_set.insert(4,"distance",new_column)

In [44]:
test_set['time_driven'] = test_set['duration']  - test_set['meter_waiting']

In [45]:
chargeperhours = []
for index,row in test_set.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

test_set.insert(4,'charge_per_hour',chargeperhours)


In [46]:
test_set['driving_fare'] = test_set['fare']  - test_set['meter_waiting_fare'] - test_set['additional_fare']

In [47]:
avgspeeds = []
for index,row in test_set.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan    
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

test_set.insert(4,"avg_speed",avgspeeds)



In [48]:
costsperkm = []
for index,row in test_set.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

test_set.insert(4,"cost_per_km",costsperkm)

In [49]:
test_features = test_set[training_columns]

In [50]:
test_features.isna().sum()

meter_waiting           0
meter_waiting_fare      0
fare                    0
additional_fare         0
distance                0
cost_per_km            25
avg_speed               9
pick_lat                0
pick_lon                0
drop_lat                0
drop_lon                0
isNormalCharge          0
time_dif                0
time_driven             0
charge_per_hour       298
driving_fare            0
dtype: int64

In [51]:
predicted_labels = model.predict(test_features)

In [52]:
predicted_labels_df = pd.DataFrame(predicted_labels )

In [53]:
sub_path =os.path.abspath(os.path.join(data_path+'/sample_submission.csv'))
submission_set = pd.read_csv(sub_path, index_col="tripid")

In [54]:
submission_set['prediction']= predicted_labels_df.values[:,0]

In [55]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "theNotebook = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [57]:
filename = '../../submissions/'+theNotebook+'/'+theNotebook+'_{%i}.csv'
dirname = '../../submissions/'+theNotebook
fileversion = 1

if not os.path.exists(dirname):
    os.makedirs(dirname)
while glob.glob(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=True)
print("Completed!")

Completed!


In [56]:
submission_set['prediction'].value_counts()

1    8024
0     552
Name: prediction, dtype: int64

<h2>testing against others</h2>

In [None]:
sub_path =os.path.abspath(os.path.join('/home/madnisal/Documents/ML_Project/submissions/grid-lgbm/lgbm3_19.csv'))
submission_set = pd.read_csv(sub_path, index_col="tripid")

In [None]:
submission_set['prediction'].value_counts()