In [0]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import math
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import os
import glob

from sklearn.cluster import DBSCAN, KMeans

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
import pandas as pd

train_path=os.path.abspath(os.path.join(os.path.dirname("__file__"), '..', '/content/drive/My Drive/datasets/train.csv'))
training_df = pd.read_csv(train_path, index_col="tripid")


In [0]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.) **2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [0]:
training_df['pickup_time'] = pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M")
training_df['drop_time'] = pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M")

In [0]:
training_df = training_df.assign(timeOfDay=pd.cut(training_df.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [0]:
training_df.loc[training_df['timeOfDay'] == 'day', 'isNormalCharge'] = 1
training_df.loc[training_df['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [0]:
durations = []
for index,row in training_df.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

training_df.insert(4,"time_dif",durations)

In [0]:
new_column = []                    #empty column for distance
for index,row in training_df.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

training_df.insert(4,"distance",new_column)

In [0]:
training_df['time_driven'] = training_df['duration']  - training_df['meter_waiting']

In [0]:
chargeperhours = []
for index,row in training_df.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

training_df.insert(4,'charge_per_hour',chargeperhours)


In [0]:
training_df['driving_fare'] = training_df['fare']  - training_df['meter_waiting_fare'] - training_df['additional_fare']

In [0]:
avgspeeds = []
for index,row in training_df.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

training_df.insert(4,"avg_speed",avgspeeds)



In [0]:
costsperkm = []
for index,row in training_df.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

training_df.insert(4,"cost_per_km",costsperkm)

In [0]:
training_df = training_df.replace({'label': {'incorrect': 0, 'correct' : 1}})

<h2>Geolocation clustering</h2>

In [0]:
test_path =os.path.abspath(os.path.join(os.path.dirname("__file__"), '..', '/content/drive/My Drive/datasets/test.csv'))
test_set = pd.read_csv(test_path, index_col="tripid")

In [0]:
pickup_locations = pd.DataFrame()
drop_locations = pd.DataFrame()
test_pickup_locations = pd.DataFrame()
test_drop_locations = pd.DataFrame()

In [0]:
pickup_locations[['lat','lon']] = training_df[['pick_lat','pick_lon']]
drop_locations[['lat','lon']] = training_df[['drop_lat','drop_lon']]

In [0]:
test_pickup_locations[['lat','lon']] = test_set[['pick_lat','pick_lon']]
test_drop_locations[['lat','lon']] = test_set[['drop_lat','drop_lon']]

In [0]:
test_set.shape

(8576, 12)

In [0]:
training_df.shape

(17176, 22)

In [0]:
pickup_locations = pickup_locations.reset_index()
drop_locations = drop_locations.reset_index()
test_pickup_locations = test_pickup_locations.reset_index()
test_drop_locations = test_drop_locations.reset_index()

In [0]:
df = pd.concat([pickup_locations,drop_locations,test_pickup_locations,test_drop_locations],ignore_index=True )

In [0]:
db_pick = DBSCAN(eps=1/6371., min_samples=5, algorithm='ball_tree', metric='haversine').fit(np.radians(df[['lat','lon']]))

In [23]:
db_pick.labels_[0:17176]

array([0, 0, 0, ..., 0, 0, 0])

In [0]:
training_df['pickup_cluster_label'] = db_pick.labels_[0:17176]

In [0]:
training_df['drop_cluster_label'] = db_pick.labels_[17176:17176*2]

In [26]:
training_df['drop_cluster_label'].value_counts()

 0     14315
 1      1818
 4       307
 2       262
-1       244
 6        67
 3        55
 5        25
 15        9
 17        8
 11        8
 7         6
 21        6
 20        5
 16        4
 25        4
 18        3
 24        3
 23        3
 26        3
 13        3
 19        3
 14        2
 10        2
 28        2
 12        2
 22        2
 8         2
 9         2
 27        1
Name: drop_cluster_label, dtype: int64

<h2>Selecting the features</h2>

In [27]:
training_df.columns

Index(['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'cost_per_km', 'avg_speed', 'charge_per_hour', 'distance', 'time_dif',
       'meter_waiting_till_pickup', 'pickup_time', 'drop_time', 'pick_lat',
       'pick_lon', 'drop_lat', 'drop_lon', 'fare', 'label', 'timeOfDay',
       'isNormalCharge', 'time_driven', 'driving_fare', 'pickup_cluster_label',
       'drop_cluster_label'],
      dtype='object')

In [0]:
training_columns = ['meter_waiting','meter_waiting_fare','fare','additional_fare', 'distance','cost_per_km', 'avg_speed',  'time_dif','time_driven', 'charge_per_hour', 'driving_fare', 'isNormalCharge','drop_cluster_label','pickup_cluster_label']

In [0]:
training_columns = ['meter_waiting','meter_waiting_fare','fare','additional_fare', 'distance','cost_per_km', 'avg_speed',  'time_dif','time_driven', 'charge_per_hour', 'driving_fare', 'isNormalCharge','pick_lat','pick_lon','drop_lat','drop_lon','drop_cluster_label','pickup_cluster_label']

In [0]:
target_column = ['label']

In [0]:
training_df = training_df.drop(190167541)

In [0]:
x = training_df[training_columns].values
y = training_df[target_column].values

In [0]:
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [34]:
training_df['label'].value_counts()

1    15494
0     1681
Name: label, dtype: int64

<h2>Choosing the best model</h2>

In [0]:
def focal_loss_lgb_eval_error(y_true, y_pred, alpha=.25, gamma=2.):
    a,g = alpha, gamma
    p = 1/(1+np.exp(-y_pred))
    loss = -( a*y_true + (1-a)*(1-y_true) ) * (( 1 - ( y_true*p + (1-y_true)*(1-p)) )**g) * ( y_true*np.log(p)+(1-y_true)*np.log(1-p) )
    return 'focal_loss', np.mean(loss), False

In [0]:
from sklearn.metrics import f1_score
def evaluate_macroF1_lgb(y_true, y_pred):  
    y_hat = np.where(y_pred < 0.5, 0, 1) 
    f1 = f1_score(y_true, y_hat, average='macro')
    return ('macroF1', f1, True) 

In [0]:
gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [0]:
model = lightgbm.LGBMClassifier(class_weight={0:1.5,1:1}, learning_rate=0.1)

In [39]:
for training_index, testing_index in gkf.split(X=x, y=y):
    x_train_fold, y_train_fold = x[training_index], y[training_index]
    x_test_fold, y_test_fold = x[testing_index], y[testing_index]
    model.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold,y_test_fold),eval_metric = lambda y_true, y_pred: [evaluate_macroF1_lgb(y_true,y_pred)])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1]	valid_0's binary_logloss: 0.28951	valid_0's macroF1: 0.474268
[2]	valid_0's binary_logloss: 0.266245	valid_0's macroF1: 0.474268
[3]	valid_0's binary_logloss: 0.248837	valid_0's macroF1: 0.474268
[4]	valid_0's binary_logloss: 0.235365	valid_0's macroF1: 0.474268
[5]	valid_0's binary_logloss: 0.224279	valid_0's macroF1: 0.691565
[6]	valid_0's binary_logloss: 0.214593	valid_0's macroF1: 0.743383
[7]	valid_0's binary_logloss: 0.206098	valid_0's macroF1: 0.79792
[8]	valid_0's binary_logloss: 0.198465	valid_0's macroF1: 0.810983
[9]	valid_0's binary_logloss: 0.192515	valid_0's macroF1: 0.820103
[10]	valid_0's binary_logloss: 0.186296	valid_0's macroF1: 0.826652
[11]	valid_0's binary_logloss: 0.181639	valid_0's macroF1: 0.837845
[12]	valid_0's binary_logloss: 0.177881	valid_0's macroF1: 0.839369
[13]	valid_0's binary_logloss: 0.174443	valid_0's macroF1: 0.837385
[14]	valid_0's binary_logloss: 0.171348	valid_0's macroF1: 0.840413
[15]	valid_0's binary_logloss: 0.16799	valid_0's macroF1: 0

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[6]	valid_0's binary_logloss: 0.220006	valid_0's macroF1: 0.739199
[7]	valid_0's binary_logloss: 0.212167	valid_0's macroF1: 0.765945
[8]	valid_0's binary_logloss: 0.204256	valid_0's macroF1: 0.786269
[9]	valid_0's binary_logloss: 0.198526	valid_0's macroF1: 0.801485
[10]	valid_0's binary_logloss: 0.193344	valid_0's macroF1: 0.804075
[11]	valid_0's binary_logloss: 0.188255	valid_0's macroF1: 0.810014
[12]	valid_0's binary_logloss: 0.184398	valid_0's macroF1: 0.812437
[13]	valid_0's binary_logloss: 0.180773	valid_0's macroF1: 0.80992
[14]	valid_0's binary_logloss: 0.17735	valid_0's macroF1: 0.813939
[15]	valid_0's binary_logloss: 0.174694	valid_0's macroF1: 0.813939
[16]	valid_0's binary_logloss: 0.171739	valid_0's macroF1: 0.815564
[17]	valid_0's binary_logloss: 0.169926	valid_0's macroF1: 0.817085
[18]	valid_0's binary_logloss: 0.167485	valid_0's macroF1: 0.823448
[19]	valid_0's binary_logloss: 0.166202	valid_0's macroF1: 0.831932
[20]	valid_0's binary_logloss: 0.164609	valid_0's macr

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[17]	valid_0's binary_logloss: 0.1701	valid_0's macroF1: 0.832409
[18]	valid_0's binary_logloss: 0.168271	valid_0's macroF1: 0.835437
[19]	valid_0's binary_logloss: 0.166474	valid_0's macroF1: 0.83307
[20]	valid_0's binary_logloss: 0.164585	valid_0's macroF1: 0.839935
[21]	valid_0's binary_logloss: 0.163084	valid_0's macroF1: 0.842038
[22]	valid_0's binary_logloss: 0.161557	valid_0's macroF1: 0.84179
[23]	valid_0's binary_logloss: 0.160355	valid_0's macroF1: 0.843856
[24]	valid_0's binary_logloss: 0.159455	valid_0's macroF1: 0.843
[25]	valid_0's binary_logloss: 0.158441	valid_0's macroF1: 0.844451
[26]	valid_0's binary_logloss: 0.15743	valid_0's macroF1: 0.846756
[27]	valid_0's binary_logloss: 0.156431	valid_0's macroF1: 0.848772
[28]	valid_0's binary_logloss: 0.155634	valid_0's macroF1: 0.849633
[29]	valid_0's binary_logloss: 0.15501	valid_0's macroF1: 0.849633
[30]	valid_0's binary_logloss: 0.154152	valid_0's macroF1: 0.848772
[31]	valid_0's binary_logloss: 0.153281	valid_0's macroF1

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1]	valid_0's binary_logloss: 0.29051	valid_0's macroF1: 0.474268
[2]	valid_0's binary_logloss: 0.267613	valid_0's macroF1: 0.474268
[3]	valid_0's binary_logloss: 0.249636	valid_0's macroF1: 0.474268
[4]	valid_0's binary_logloss: 0.235278	valid_0's macroF1: 0.48183
[5]	valid_0's binary_logloss: 0.223721	valid_0's macroF1: 0.722897
[6]	valid_0's binary_logloss: 0.21301	valid_0's macroF1: 0.752442
[7]	valid_0's binary_logloss: 0.204443	valid_0's macroF1: 0.79875
[8]	valid_0's binary_logloss: 0.196781	valid_0's macroF1: 0.81865
[9]	valid_0's binary_logloss: 0.190795	valid_0's macroF1: 0.817665
[10]	valid_0's binary_logloss: 0.185426	valid_0's macroF1: 0.822636
[11]	valid_0's binary_logloss: 0.179764	valid_0's macroF1: 0.819008
[12]	valid_0's binary_logloss: 0.175199	valid_0's macroF1: 0.823862
[13]	valid_0's binary_logloss: 0.171453	valid_0's macroF1: 0.832506
[14]	valid_0's binary_logloss: 0.16819	valid_0's macroF1: 0.833189
[15]	valid_0's binary_logloss: 0.164845	valid_0's macroF1: 0.83

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[9]	valid_0's binary_logloss: 0.197348	valid_0's macroF1: 0.813505
[10]	valid_0's binary_logloss: 0.191841	valid_0's macroF1: 0.819329
[11]	valid_0's binary_logloss: 0.186658	valid_0's macroF1: 0.825765
[12]	valid_0's binary_logloss: 0.182245	valid_0's macroF1: 0.833786
[13]	valid_0's binary_logloss: 0.178329	valid_0's macroF1: 0.834476
[14]	valid_0's binary_logloss: 0.174701	valid_0's macroF1: 0.835837
[15]	valid_0's binary_logloss: 0.171059	valid_0's macroF1: 0.842034
[16]	valid_0's binary_logloss: 0.168671	valid_0's macroF1: 0.841142
[17]	valid_0's binary_logloss: 0.166137	valid_0's macroF1: 0.843308
[18]	valid_0's binary_logloss: 0.164266	valid_0's macroF1: 0.848449
[19]	valid_0's binary_logloss: 0.162328	valid_0's macroF1: 0.848161
[20]	valid_0's binary_logloss: 0.160462	valid_0's macroF1: 0.849644
[21]	valid_0's binary_logloss: 0.158972	valid_0's macroF1: 0.852906
[22]	valid_0's binary_logloss: 0.157162	valid_0's macroF1: 0.855279
[23]	valid_0's binary_logloss: 0.155345	valid_0's

In [0]:
predicted_y = model.predict(x_test)
train_pred_y = model.predict(x)

In [41]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))

print(); print(metrics.classification_report(y_test, predicted_y))
print(); print(metrics.confusion_matrix(y_test, predicted_y))


              precision    recall  f1-score   support

           0       0.95      0.84      0.89      1345
           1       0.98      1.00      0.99     12395

    accuracy                           0.98     13740
   macro avg       0.97      0.92      0.94     13740
weighted avg       0.98      0.98      0.98     13740


[[ 1135   210]
 [   57 12338]]

              precision    recall  f1-score   support

           0       0.88      0.68      0.77       336
           1       0.97      0.99      0.98      3099

    accuracy                           0.96      3435
   macro avg       0.92      0.84      0.87      3435
weighted avg       0.96      0.96      0.96      3435


[[ 229  107]
 [  32 3067]]


<h2>Testing</h2>

In [0]:

test_path =os.path.abspath(os.path.join(os.path.dirname("__file__"), '..', '/content/drive/My Drive/datasets/test.csv'))
test_set = pd.read_csv(test_path, index_col="tripid")


In [0]:
test_set['pickup_time'] = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_set['drop_time'] = pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M")

In [0]:
test_set = test_set.assign(timeOfDay=pd.cut(test_set.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [0]:
test_set.loc[test_set['timeOfDay'] == 'day', 'isNormalCharge'] = 1
test_set.loc[test_set['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [0]:
durations = []
for index,row in test_set.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

test_set.insert(4,"time_dif",durations)

In [0]:
new_column = []                    #empty column for distance
for index,row in test_set.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

test_set.insert(4,"distance",new_column)

In [0]:
test_set['time_driven'] = test_set['duration']  - test_set['meter_waiting']

In [0]:
chargeperhours = []
for index,row in test_set.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

test_set.insert(4,'charge_per_hour',chargeperhours)


In [0]:
test_set['driving_fare'] = test_set['fare']  - test_set['meter_waiting_fare'] - test_set['additional_fare']

In [0]:
avgspeeds = []
for index,row in test_set.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan    
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

test_set.insert(4,"avg_speed",avgspeeds)



In [0]:
costsperkm = []
for index,row in test_set.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

test_set.insert(4,"cost_per_km",costsperkm)

In [0]:
test_set['pickup_cluster_label'] = db_pick.labels_[17176*2:17176*2 + 8576 ]
test_set['drop_cluster_label'] = db_pick.labels_[17176*2 + 8576: 2*(17176+8576) ]

In [0]:
test_features = test_set[training_columns]

In [55]:
test_features.isna().sum()

meter_waiting             0
meter_waiting_fare        0
fare                      0
additional_fare           0
distance                  0
cost_per_km              25
avg_speed                 9
time_dif                  0
time_driven               0
charge_per_hour         298
driving_fare              0
isNormalCharge            0
pick_lat                  0
pick_lon                  0
drop_lat                  0
drop_lon                  0
drop_cluster_label        0
pickup_cluster_label      0
dtype: int64

In [0]:
predicted_labels = model.predict(test_features)

In [0]:
predicted_labels_df = pd.DataFrame(predicted_labels )

In [0]:
sub_path =os.path.abspath(os.path.join(os.path.dirname("__file__"), '..', '/content/drive/My Drive/datasets/sample_submission.csv'))
submission_set = pd.read_csv(sub_path)

In [0]:
submission_set['prediction']= predicted_labels_df.values[:,0]

In [0]:
theNotebook = "dbscan"

In [65]:
filename = '/content/drive/My Drive/datasets/'+theNotebook+'/'+theNotebook+'_{%i}.csv'
dirname = '/content/drive/My Drive/datasets/'+theNotebook
fileversion = 1

if not os.path.exists(dirname):
    os.makedirs(dirname)
while glob.glob(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=False)
print("Completed!")

Completed!


In [64]:
submission_set['prediction'].value_counts()

1    8121
0     455
Name: prediction, dtype: int64