In [1]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import math
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import os
import glob

In [2]:

data_path='/home/madnisal/Documents/ML_Project/datasets/'
training_df = pd.read_csv(data_path+'train.csv', index_col="tripid")

In [3]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.) **2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [4]:
training_df['pickup_time'] = pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M")
training_df['drop_time'] = pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M")

In [5]:
training_df = training_df.assign(timeOfDay=pd.cut(training_df.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [6]:
training_df.loc[training_df['timeOfDay'] == 'day', 'isNormalCharge'] = 1
training_df.loc[training_df['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [7]:
durations = []
for index,row in training_df.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

training_df.insert(4,"time_dif",durations)

In [8]:
new_column = []                    #empty column for distance
for index,row in training_df.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

training_df.insert(4,"distance",new_column)

In [9]:
training_df['time_driven'] = training_df['duration']  - training_df['meter_waiting']

In [10]:
chargeperhours = []
for index,row in training_df.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

training_df.insert(4,'charge_per_hour',chargeperhours)


In [11]:
training_df['driving_fare'] = training_df['fare']  - training_df['meter_waiting_fare'] - training_df['additional_fare']

In [12]:
avgspeeds = []
for index,row in training_df.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

training_df.insert(4,"avg_speed",avgspeeds)



In [13]:
costsperkm = []
for index,row in training_df.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

training_df.insert(4,"cost_per_km",costsperkm)

In [14]:
training_df = training_df.replace({'label': {'incorrect': 0, 'correct' : 1}})

In [15]:
training_df.columns

Index(['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'cost_per_km', 'avg_speed', 'charge_per_hour', 'distance', 'time_dif',
       'meter_waiting_till_pickup', 'pickup_time', 'drop_time', 'pick_lat',
       'pick_lon', 'drop_lat', 'drop_lon', 'fare', 'label', 'timeOfDay',
       'isNormalCharge', 'time_driven', 'driving_fare'],
      dtype='object')

In [16]:
training_columns = ['meter_waiting','meter_waiting_fare','fare','additional_fare', 'distance','cost_per_km', 'avg_speed',  'time_dif','time_driven', 'charge_per_hour', 'driving_fare', 'isNormalCharge','pick_lat','pick_lon','drop_lat','drop_lon']

In [17]:
target_column = ['label']

In [18]:
outlier_train_labels = [189248046, 189372731, 189582749, 189639329, 189877805, 189919496, 189951367, 190042045, 190050729, 190167541, 190220945, 190222595, 190289312, 190377182, 190602146, 190757593, 190932899, 191112330, 191130876, 191258602, 191276742, 191335143, 191409182, 191411060, 191432473, 191457855, 191524024, 191538456, 191576199, 191589327, 191629242, 191861723, 191894473, 191999785, 192047312, 192063114, 192064386, 192194086, 192342422, 192680516, 192732272, 192742531, 193016644, 193047517, 193092341, 193261350, 193317324, 193353297, 193363504, 193363687, 193396395, 193410014, 193585269, 193598400, 193609201, 193640628, 193749854, 193837260, 194305074, 194320960, 194359125, 194514310, 194551382, 194581698, 194597635, 194636909, 194714168, 194726213, 195038854, 195057633, 195097288, 195260495, 195345783, 195346858, 195568649, 195633799, 195882991, 195968211, 196402188, 196553262, 196587403, 196615301, 196855195, 196858407, 196974159, 196997372, 197020223, 197163368, 197183927, 197195612, 197218848, 197274347, 197373571, 197421930, 197495364, 197570477, 197583546, 197698631, 197971580, 197984654, 198012950, 198345947, 198548950, 198607847, 198621653, 198640410, 198661199, 198818188, 198910537, 198920531, 198965022, 198985082, 199037961, 199171419, 199196443, 199214406, 199225357, 199715083, 199727133, 199738570, 199913349, 199966663, 199999511, 200025918, 200154190, 200289693, 200577470, 200686856, 200708149, 200771103, 200893706, 200979082, 201036658, 201171900, 201206769, 201276666, 201378977, 201418800, 201478900, 201496100, 201508022, 201638255, 201668590, 201672623, 201679378, 201679540, 201699206, 201783012, 201900543, 201997944, 202037981, 202115222, 202154929, 202375533, 202402921, 202416098, 202434563, 202642997, 202649316, 202787247, 202880624, 203183989, 203296106, 203556991, 203614817, 203648103, 203695038, 203698426, 203735817, 203889464, 203962665, 204225418, 204362145, 204425668, 204486493, 204570020, 204655606, 204708821, 204807428, 204844294, 204905077, 204923587, 204932258, 205088473, 205107636, 205135085, 205312460, 205328050, 205615659, 205862594, 205897806, 206344270, 206345862, 206690799, 206709794, 206806232, 206841298, 207125672, 207154783, 207159446, 207207631, 207209349, 207248547, 207302185, 207304945, 207365752, 207405568, 207545172, 207588922, 207598568, 207604246, 207614597, 207630522, 207663133, 207680286, 207684582, 207793985, 207887491, 208095798, 208129297, 208183402, 208206935, 208237121, 208262311, 208453649, 208610223, 208647188, 208753668, 208756243, 208800679, 208849308, 208922571, 209015312, 209093530, 209105889, 209497213, 209499033, 209601489, 209621552, 209809748, 209854939, 209929387, 210002331, 210077969, 210086904, 210089028, 210154017, 210340254, 210411446, 210421960, 210552623, 210573839, 210582668, 210591969, 210601978, 210614878, 210636674, 210689525, 210740396, 210745681, 210757114, 210924847, 211004948, 211065192, 211078046, 211207421, 211212751, 211299418, 211441481, 211454056, 211456165, 211605950, 211619444, 211644465, 211663710, 211664080, 211687328, 211700651, 211840921, 211843223, 211897469, 211910775, 212064554, 212099096, 212132090, 212151161, 212358989, 212389240, 212389971, 212391554, 212427437, 212592035, 212606591, 212629700, 212707890, 212738468, 212739051, 212741154, 212749726, 212798138, 212804113, 212827986, 212837339, 212879459, 212961544, 212972465, 212975841, 213031411, 213179435, 213219467, 213226420, 213244500, 213266336, 213296209, 213319817, 213412611, 213483583, 213636965, 213652819]


In [19]:
for i in outlier_train_labels :
    training_df = training_df.drop(i)

In [20]:
207947602 in outlier_train_labels

False

In [21]:
training_df = training_df.drop(190167541)
training_df = training_df.drop(191841099)
training_df = training_df.drop(207947602)

KeyError: '[190167541] not found in axis'

In [22]:
x = training_df[training_columns].values
y = training_df[target_column].values

In [23]:
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [24]:
training_df['label'].value_counts()

1    15176
0     1681
Name: label, dtype: int64

<h2>Choosing the best model</h2>

In [25]:
def focal_loss_lgb_eval_error(y_true, y_pred, alpha=.25, gamma=2.):
    a,g = alpha, gamma
    p = 1/(1+np.exp(-y_pred))
    loss = -( a*y_true + (1-a)*(1-y_true) ) * (( 1 - ( y_true*p + (1-y_true)*(1-p)) )**g) * ( y_true*np.log(p)+(1-y_true)*np.log(1-p) )
    return 'focal_loss', np.mean(loss), False

In [26]:
from sklearn.metrics import f1_score
def evaluate_macroF1_lgb(y_true, y_pred):  
    y_hat = np.where(y_pred < 0.5, 0, 1) 
    f1 = f1_score(y_true, y_hat, average='macro')
    return ('macroF1', f1, True) 

In [27]:
gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [28]:
model = lightgbm.LGBMClassifier(class_weight={0:1.5,1:1} ,learning_rate=0.1)

In [29]:
for training_index, testing_index in gkf.split(X=x, y=y):
    x_train_fold, y_train_fold = x[training_index], y[training_index]
    x_test_fold, y_test_fold = x[testing_index], y[testing_index]
    model.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold,y_test_fold),eval_metric = lambda y_true, y_pred: [evaluate_macroF1_lgb(y_true,y_pred)])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1]	valid_0's binary_logloss: 0.294682	valid_0's macroF1: 0.473756
[2]	valid_0's binary_logloss: 0.272244	valid_0's macroF1: 0.473756
[3]	valid_0's binary_logloss: 0.254907	valid_0's macroF1: 0.473756
[4]	valid_0's binary_logloss: 0.241292	valid_0's macroF1: 0.473756
[5]	valid_0's binary_logloss: 0.22961	valid_0's macroF1: 0.719451
[6]	valid_0's binary_logloss: 0.220176	valid_0's macroF1: 0.776717
[7]	valid_0's binary_logloss: 0.212199	valid_0's macroF1: 0.790528
[8]	valid_0's binary_logloss: 0.205085	valid_0's macroF1: 0.807221
[9]	valid_0's binary_logloss: 0.199337	valid_0's macroF1: 0.80627
[10]	valid_0's binary_logloss: 0.192962	valid_0's macroF1: 0.812855
[11]	valid_0's binary_logloss: 0.188466	valid_0's macroF1: 0.818494
[12]	valid_0's binary_logloss: 0.184009	valid_0's macroF1: 0.8217
[13]	valid_0's binary_logloss: 0.180024	valid_0's macroF1: 0.823869
[14]	valid_0's binary_logloss: 0.175711	valid_0's macroF1: 0.834501
[15]	valid_0's binary_logloss: 0.173025	valid_0's macroF1: 0.

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[12]	valid_0's binary_logloss: 0.181414	valid_0's macroF1: 0.844976
[13]	valid_0's binary_logloss: 0.178015	valid_0's macroF1: 0.844172
[14]	valid_0's binary_logloss: 0.17515	valid_0's macroF1: 0.845928
[15]	valid_0's binary_logloss: 0.17273	valid_0's macroF1: 0.850194
[16]	valid_0's binary_logloss: 0.170877	valid_0's macroF1: 0.85075
[17]	valid_0's binary_logloss: 0.168809	valid_0's macroF1: 0.854092
[18]	valid_0's binary_logloss: 0.167046	valid_0's macroF1: 0.85548
[19]	valid_0's binary_logloss: 0.16535	valid_0's macroF1: 0.854626
[20]	valid_0's binary_logloss: 0.163687	valid_0's macroF1: 0.854626
[21]	valid_0's binary_logloss: 0.162482	valid_0's macroF1: 0.856008
[22]	valid_0's binary_logloss: 0.160581	valid_0's macroF1: 0.857901
[23]	valid_0's binary_logloss: 0.159295	valid_0's macroF1: 0.861127
[24]	valid_0's binary_logloss: 0.158205	valid_0's macroF1: 0.861127
[25]	valid_0's binary_logloss: 0.157099	valid_0's macroF1: 0.859427
[26]	valid_0's binary_logloss: 0.156402	valid_0's mac

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[30]	valid_0's binary_logloss: 0.138391	valid_0's macroF1: 0.876066
[31]	valid_0's binary_logloss: 0.137416	valid_0's macroF1: 0.876527
[32]	valid_0's binary_logloss: 0.136817	valid_0's macroF1: 0.875625
[33]	valid_0's binary_logloss: 0.136057	valid_0's macroF1: 0.878337
[34]	valid_0's binary_logloss: 0.13552	valid_0's macroF1: 0.881058
[35]	valid_0's binary_logloss: 0.134739	valid_0's macroF1: 0.880148
[36]	valid_0's binary_logloss: 0.1344	valid_0's macroF1: 0.880148
[37]	valid_0's binary_logloss: 0.133579	valid_0's macroF1: 0.883323
[38]	valid_0's binary_logloss: 0.133345	valid_0's macroF1: 0.882409
[39]	valid_0's binary_logloss: 0.132865	valid_0's macroF1: 0.883755
[40]	valid_0's binary_logloss: 0.132974	valid_0's macroF1: 0.88059
[41]	valid_0's binary_logloss: 0.132392	valid_0's macroF1: 0.882409
[42]	valid_0's binary_logloss: 0.13207	valid_0's macroF1: 0.881498
[43]	valid_0's binary_logloss: 0.131367	valid_0's macroF1: 0.881058
[44]	valid_0's binary_logloss: 0.131309	valid_0's mac

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



[8]	valid_0's binary_logloss: 0.204423	valid_0's macroF1: 0.797528
[9]	valid_0's binary_logloss: 0.197875	valid_0's macroF1: 0.799385
[10]	valid_0's binary_logloss: 0.191864	valid_0's macroF1: 0.806374
[11]	valid_0's binary_logloss: 0.186911	valid_0's macroF1: 0.811514
[12]	valid_0's binary_logloss: 0.182831	valid_0's macroF1: 0.815702
[13]	valid_0's binary_logloss: 0.179657	valid_0's macroF1: 0.815527
[14]	valid_0's binary_logloss: 0.176093	valid_0's macroF1: 0.819355
[15]	valid_0's binary_logloss: 0.173372	valid_0's macroF1: 0.820838
[16]	valid_0's binary_logloss: 0.170605	valid_0's macroF1: 0.820838
[17]	valid_0's binary_logloss: 0.168277	valid_0's macroF1: 0.821569
[18]	valid_0's binary_logloss: 0.165837	valid_0's macroF1: 0.828554
[19]	valid_0's binary_logloss: 0.163464	valid_0's macroF1: 0.832315
[20]	valid_0's binary_logloss: 0.161667	valid_0's macroF1: 0.835581
[21]	valid_0's binary_logloss: 0.159821	valid_0's macroF1: 0.837752
[22]	valid_0's binary_logloss: 0.158714	valid_0's

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[21]	valid_0's binary_logloss: 0.155347	valid_0's macroF1: 0.848163
[22]	valid_0's binary_logloss: 0.153445	valid_0's macroF1: 0.852701
[23]	valid_0's binary_logloss: 0.151819	valid_0's macroF1: 0.851792
[24]	valid_0's binary_logloss: 0.150724	valid_0's macroF1: 0.850886
[25]	valid_0's binary_logloss: 0.149232	valid_0's macroF1: 0.849983
[26]	valid_0's binary_logloss: 0.148134	valid_0's macroF1: 0.849983
[27]	valid_0's binary_logloss: 0.147569	valid_0's macroF1: 0.850886
[28]	valid_0's binary_logloss: 0.146596	valid_0's macroF1: 0.85297
[29]	valid_0's binary_logloss: 0.145829	valid_0's macroF1: 0.852068
[30]	valid_0's binary_logloss: 0.145596	valid_0's macroF1: 0.850579
[31]	valid_0's binary_logloss: 0.144464	valid_0's macroF1: 0.852068
[32]	valid_0's binary_logloss: 0.144054	valid_0's macroF1: 0.85117
[33]	valid_0's binary_logloss: 0.143921	valid_0's macroF1: 0.85117
[34]	valid_0's binary_logloss: 0.143095	valid_0's macroF1: 0.852652
[35]	valid_0's binary_logloss: 0.142749	valid_0's m

In [30]:
predicted_y = model.predict(x_test)
train_pred_y = model.predict(x)

In [31]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))

print(); print(metrics.classification_report(y_test, predicted_y))
print(); print(metrics.confusion_matrix(y_test, predicted_y))


              precision    recall  f1-score   support

           0       0.96      0.85      0.90      1345
           1       0.98      1.00      0.99     12140

    accuracy                           0.98     13485
   macro avg       0.97      0.92      0.95     13485
weighted avg       0.98      0.98      0.98     13485


[[ 1144   201]
 [   49 12091]]

              precision    recall  f1-score   support

           0       0.85      0.72      0.78       336
           1       0.97      0.99      0.98      3036

    accuracy                           0.96      3372
   macro avg       0.91      0.85      0.88      3372
weighted avg       0.96      0.96      0.96      3372


[[ 241   95]
 [  42 2994]]


<h2>Testing</h2>

In [32]:

test_set = pd.read_csv(data_path+'test.csv', index_col="tripid")


In [33]:
test_set['pickup_time'] = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_set['drop_time'] = pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M")

In [34]:
test_set = test_set.assign(timeOfDay=pd.cut(test_set.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [35]:
test_set.loc[test_set['timeOfDay'] == 'day', 'isNormalCharge'] = 1
test_set.loc[test_set['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [36]:
durations = []
for index,row in test_set.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

test_set.insert(4,"time_dif",durations)

In [37]:
new_column = []                    #empty column for distance
for index,row in test_set.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

test_set.insert(4,"distance",new_column)

In [38]:
test_set['time_driven'] = test_set['duration']  - test_set['meter_waiting']

In [39]:
chargeperhours = []
for index,row in test_set.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

test_set.insert(4,'charge_per_hour',chargeperhours)


In [40]:
test_set['driving_fare'] = test_set['fare']  - test_set['meter_waiting_fare'] - test_set['additional_fare']

In [41]:
avgspeeds = []
for index,row in test_set.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan    
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

test_set.insert(4,"avg_speed",avgspeeds)



In [42]:
costsperkm = []
for index,row in test_set.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

test_set.insert(4,"cost_per_km",costsperkm)

In [43]:
test_features = test_set[training_columns]

In [44]:
test_features.isna().sum()

meter_waiting           0
meter_waiting_fare      0
fare                    0
additional_fare         0
distance                0
cost_per_km            25
avg_speed               9
time_dif                0
time_driven             0
charge_per_hour       298
driving_fare            0
isNormalCharge          0
pick_lat                0
pick_lon                0
drop_lat                0
drop_lon                0
dtype: int64

In [45]:
predicted_labels = model.predict(test_features)

In [46]:
predicted_labels_df = pd.DataFrame(predicted_labels )

In [47]:
sub_path =os.path.abspath(os.path.join(data_path+'/sample_submission.csv'))
submission_set = pd.read_csv(sub_path, index_col="tripid")

In [48]:
submission_set['prediction']= predicted_labels_df.values[:,0]

In [49]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "theNotebook = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [51]:
filename = '../../submissions/'+theNotebook+'/'+theNotebook+'_{%i}.csv'
dirname = '../../submissions/'+theNotebook
fileversion = 1

if not os.path.exists(dirname):
    os.makedirs(dirname)
while glob.glob(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=True)
print("Completed!")

Completed!


In [50]:
submission_set['prediction'].value_counts()

1    7969
0     607
Name: prediction, dtype: int64