In [1]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import math
from sklearn import metrics

In [2]:
import os
# Dataset directory. It can be overridden with the ML_PROJECT_DATA_DIR environment
# variable so the notebook is not tied to one machine; the default is the original
# local location.
data_path = os.environ.get('ML_PROJECT_DATA_DIR', '/home/madnisal/Documents/ML_Project/datasets/')
# Training data, indexed by the `tripid` column. os.path.join works whether or not
# data_path ends with a separator.
training_df = pd.read_csv(os.path.join(data_path, 'train.csv'), index_col="tripid")

In [3]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  """Great-circle (haversine) distance in kilometres between two points.

  All four arguments are latitudes/longitudes in degrees. Only NumPy ufuncs
  are used, so scalars and array-likes are both accepted and the result has
  the matching shape.
  """
  earth_radius_km = 6371

  # latitudes in radians, plus half of each angular difference
  phi1, phi2 = np.radians(lat1), np.radians(lat2)
  half_dlat = np.radians(lat2 - lat1) / 2.
  half_dlon = np.radians(lon2 - lon1) / 2.

  # haversine of the central angle between the two points
  hav = np.sin(half_dlat) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlon) ** 2

  return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

In [4]:
# Parse the pickup/drop timestamps (month/day/year hour:minute text) into datetimes
training_df = training_df.assign(
    pickup_time=pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M"),
    drop_time=pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M"),
)

In [5]:
# Bucket the pickup hour: 0-8 -> 'dawn', 9-20 -> 'day', 21-23 -> 'night'
# (pd.cut intervals are right-closed, hence the -1 lower edge to include hour 0)
training_df = training_df.assign(
    timeOfDay=pd.cut(
        training_df['pickup_time'].dt.hour,
        bins=[-1, 8, 20, 24],
        labels=['dawn', 'day', 'night'],
    )
)

In [None]:
# Build `time_dif`: the recorded `duration` when it is usable (present and > 0),
# otherwise the elapsed time between pickup and drop taken from the timestamps.
elapsed_seconds = (training_df['drop_time'] - training_df['pickup_time']).dt.total_seconds()
# total_seconds() is used rather than the `.seconds` component, which wraps at one
# day and turns negative gaps into large positive values. A non-positive gap is as
# unusable as a non-positive duration, so it becomes NaN.
elapsed_seconds = elapsed_seconds.where(elapsed_seconds > 0)

provided_duration = training_df['duration']
duration_usable = provided_duration.notna() & (provided_duration > 0)
# Vectorized replacement for the row-by-row iterrows() loop
durations = provided_duration.where(duration_usable, elapsed_seconds)

training_df.insert(4,"time_dif",durations)

In [None]:
# Straight-line (haversine) pickup -> drop distance in km for every trip.
# dist_from_coordinates is built from NumPy ufuncs, so it takes whole columns
# at once; no per-row iterrows() loop is needed.
new_column = dist_from_coordinates(
    training_df['pick_lat'],
    training_df['pick_lon'],
    training_df['drop_lat'],
    training_df['drop_lon'],
)

training_df.insert(4,"distance",new_column)

In [None]:
# Time spent moving: recorded duration minus the metered waiting time.
# NOTE(review): this uses the raw `duration`, not the repaired `time_dif` - confirm that is intended.
training_df['time_driven'] = training_df['duration'].sub(training_df['meter_waiting'])

In [None]:
# Waiting fare per unit of metered waiting time; NaN when there was no waiting,
# which avoids a division by zero.
# NOTE(review): the column is called "per hour", but the value is per whatever
# unit `meter_waiting` is recorded in - confirm.
waiting_time = training_df['meter_waiting']
# Vectorized replacement for the iterrows() loop: divide whole columns, then mask zero denominators
chargeperhours = (training_df['meter_waiting_fare'] / waiting_time).where(waiting_time != 0)

training_df.insert(4,'charge_per_hour',chargeperhours)


In [None]:
# Fare attributable to driving: total fare minus the waiting and additional components
training_df['driving_fare'] = (
    training_df['fare']
    .sub(training_df['meter_waiting_fare'])
    .sub(training_df['additional_fare'])
)

In [None]:
# Average speed: haversine distance (km) per unit of `time_driven`; NaN when no
# driving time was recorded, which avoids a division by zero.
driven_time = training_df['time_driven']
# Vectorized replacement for the iterrows() loop
avgspeeds = (training_df['distance'] / driven_time).where(driven_time != 0)

training_df.insert(4,"avg_speed",avgspeeds)



In [None]:
# Ratio of distance to driving fare; NaN when the driving fare is zero.
# NOTE(review): despite the name `cost_per_km`, this is distance / fare (km per
# unit of fare). The formula is left unchanged so that it stays identical to the
# one applied to the test set - confirm which direction is intended.
fare_for_driving = training_df['driving_fare']
# Vectorized replacement for the iterrows() loop
costsperkm = (training_df['distance'] / fare_for_driving).where(fare_for_driving != 0)

training_df.insert(4,"cost_per_km",costsperkm)

In [None]:
# Encode the target: 'incorrect' -> 0, 'correct' -> 1 (any other value is left untouched)
training_df = training_df.assign(
    label=training_df['label'].replace({'incorrect': 0, 'correct': 1})
)

In [None]:
# Feature set made of raw columns only.
# NOTE: on a top-to-bottom run the next cell reassigns `training_columns`,
# so this list is not the one used for training.
training_columns = [
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'fare',
    'additional_fare',
]

In [None]:
# Engineered features fed to the model (all derived in the cells above)
training_columns = [
    'cost_per_km',
    'avg_speed',
    'time_dif',
    'time_driven',
    'charge_per_hour',
    'driving_fare',
]

In [None]:
# Target column name, kept as a one-element list: selecting it from the
# DataFrame therefore yields a 2-D (n, 1) result rather than a 1-D Series.
target_column = ['label']

In [None]:
# Feature matrix and target as plain NumPy arrays (y is 2-D because target_column is a list)
x = training_df[training_columns].to_numpy()
y = training_df[target_column].to_numpy()

In [None]:
# Hold out 20% of the labelled rows, stratified on the label.
# NOTE: x and y are rebound to the training portion, so re-running this cell
# would split the already-reduced data again.
x, x_test, y, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Hyperparameter grid for LGBMClassifier.
#
# Each LightGBM parameter appears once, under its scikit-learn API name. The
# previous grid listed aliases of the same parameter as separate dimensions
# (num_iterations/n_estimators, reg_alpha/lambda_l1), so LightGBM used one and
# ignored the other while the search still paid for every combination. The
# candidate values of each alias pair are merged here, so no candidate is lost.
#
# random_state is pinned to one value: a seed is not a hyperparameter, and
# searching over it only selects a lucky seed.
#
# NOTE: this is still 33,600 combinations; RandomizedSearchCV (already imported
# in this notebook) is the practical way to sample a grid of this size.
param_grid = {
    'num_leaves': [15, 31, 63, 127],
    'learning_rate': [0.1, 0.01],
    'class_weight': [{0: 4, 1: 1}, {0: 3, 1: 1}],
    'n_estimators': [50, 100, 120, 200, 500, 1000],   # was n_estimators + num_iterations
    'max_bin': [10, 20, 50, 100, 200, 500, 1000],     # canonical name of max_bins
    'reg_alpha': [0, 0.1, 0.5, 1, 1.5],               # was reg_alpha + lambda_l1
    'reg_lambda': [0, 1],                             # canonical name of lambda_l2
    'min_child_samples': [30, 50, 100, 300, 400],     # canonical name of min_data_in_leaf
    'random_state': [42],
    }

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold

In [None]:
# Base estimator with default settings; the grid search below sets the hyperparameters it tries
model = lightgbm.LGBMClassifier()

In [None]:
# Pre-computed 5-fold (train_idx, test_idx) pairs for the grid search.
# KFold.split() returns a one-shot generator; materializing it as a list lets
# the search be fitted again (e.g. after re-running the fit cell) without
# finding the folds already exhausted.
# NOTE(review): the hold-out split is stratified but these folds are not -
# consider StratifiedKFold if the classes are imbalanced.
gkf = list(KFold(n_splits=5, shuffle=True, random_state=42).split(X=x, y=y))

In [None]:
# Exhaustive search over param_grid, scored by macro-averaged F1 on the pre-computed folds
gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=gkf,
    verbose=True,
)

In [None]:
# Run the search on the training split; y is a 2-D column array, so flatten it to 1-D first
gs.fit(x, y.ravel())

In [None]:
# Report the best cross-validated score and the parameter combination that produced it
print(
    'Best score reached: {} with params: {} '.format(
        gs.best_score_,
        gs.best_params_,
    )
)

In [None]:
# Predict the held-out 20% split with the fitted search object
predicted_y = gs.predict(x_test)

In [None]:
# Hold-out evaluation: per-class precision/recall/F1, then the confusion matrix,
# each preceded by a blank line
print()
print(metrics.classification_report(y_test, predicted_y))
print()
print(metrics.confusion_matrix(y_test, predicted_y))

In [None]:

# Load the test set from the same dataset directory as train.csv.
# The previous expression joined dirname("__file__") (the literal string, so '')
# and '..' with an absolute Colab path; os.path.join discards everything before
# an absolute component, so it always resolved to that Colab path regardless
# of `data_path`.
test_path = os.path.abspath(os.path.join(data_path, 'test.csv'))
test_set = pd.read_csv(test_path, index_col="tripid")


In [None]:
# Parse the pickup/drop timestamps (month/day/year hour:minute text) into datetimes
test_set = test_set.assign(
    pickup_time=pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M"),
    drop_time=pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M"),
)

In [None]:
# Bucket the pickup hour: 0-8 -> 'dawn', 9-20 -> 'day', 21-23 -> 'night'
# (pd.cut intervals are right-closed, hence the -1 lower edge to include hour 0)
test_set = test_set.assign(
    timeOfDay=pd.cut(
        test_set['pickup_time'].dt.hour,
        bins=[-1, 8, 20, 24],
        labels=['dawn', 'day', 'night'],
    )
)

In [None]:
# Build `time_dif`: the recorded `duration` when it is usable (present and > 0),
# otherwise the elapsed time between pickup and drop taken from the timestamps.
elapsed_seconds = (test_set['drop_time'] - test_set['pickup_time']).dt.total_seconds()
# total_seconds() is used rather than the `.seconds` component, which wraps at one
# day and turns negative gaps into large positive values. A non-positive gap is as
# unusable as a non-positive duration, so it becomes NaN.
elapsed_seconds = elapsed_seconds.where(elapsed_seconds > 0)

provided_duration = test_set['duration']
duration_usable = provided_duration.notna() & (provided_duration > 0)
# Vectorized replacement for the row-by-row iterrows() loop
durations = provided_duration.where(duration_usable, elapsed_seconds)

test_set.insert(4,"time_dif",durations)

In [None]:
# Straight-line (haversine) pickup -> drop distance in km for every trip.
# dist_from_coordinates is built from NumPy ufuncs, so it takes whole columns
# at once; no per-row iterrows() loop is needed.
new_column = dist_from_coordinates(
    test_set['pick_lat'],
    test_set['pick_lon'],
    test_set['drop_lat'],
    test_set['drop_lon'],
)

test_set.insert(4,"distance",new_column)

In [None]:
# Time spent moving: recorded duration minus the metered waiting time
test_set['time_driven'] = test_set['duration'].sub(test_set['meter_waiting'])

In [None]:
# Waiting fare per unit of metered waiting time.
# Rows with no waiting get NaN, exactly as the training feature does. This cell
# previously wrote 0 for them, so the model saw a value at prediction time that
# it never saw for the same situation during training.
waiting_time = test_set['meter_waiting']
# Vectorized replacement for the iterrows() loop: divide whole columns, then mask zero denominators
chargeperhours = (test_set['meter_waiting_fare'] / waiting_time).where(waiting_time != 0)

test_set.insert(4,'charge_per_hour',chargeperhours)


In [None]:
# Fare attributable to driving: total fare minus the waiting and additional components
test_set['driving_fare'] = (
    test_set['fare']
    .sub(test_set['meter_waiting_fare'])
    .sub(test_set['additional_fare'])
)

In [None]:
# Average speed: haversine distance (km) per unit of `time_driven`.
# Rows with zero driving time get NaN, matching the training feature. This cell
# previously wrote 0 for them, which made the train and test encodings disagree.
driven_time = test_set['time_driven']
# Vectorized replacement for the iterrows() loop
avgspeeds = (test_set['distance'] / driven_time).where(driven_time != 0)

test_set.insert(4,"avg_speed",avgspeeds)



In [None]:
# Ratio of distance to driving fare (same formula as the training feature).
# Rows with zero driving fare get NaN, matching the training feature. This cell
# previously wrote 0 for them, which made the train and test encodings disagree.
# NOTE(review): despite the name `cost_per_km`, this is distance / fare - confirm
# which direction is intended.
fare_for_driving = test_set['driving_fare']
# Vectorized replacement for the iterrows() loop
costsperkm = (test_set['distance'] / fare_for_driving).where(fare_for_driving != 0)

test_set.insert(4,"cost_per_km",costsperkm)

In [None]:
# Select the same feature columns, in the same order, that the model was trained on
test_features = test_set[training_columns]

In [None]:
# Display the feature frame for a visual check before predicting
test_features

In [None]:
# Predict labels for the test set with the fitted search object.
# NOTE: the search was fitted on a NumPy array but predicts on a DataFrame here.
predicted_labels = gs.predict(test_features)

In [None]:
# Load the submission template from the same dataset directory as train.csv.
# The previous expression joined dirname("__file__") (the literal string, so '')
# and '..' with an absolute Colab path; os.path.join discards everything before
# an absolute component, so it always resolved to that Colab path regardless
# of `data_path`.
sub_path = os.path.abspath(os.path.join(data_path, 'sample_submission.csv'))
submission_set = pd.read_csv(sub_path, index_col="tripid")

In [None]:
# Write the predictions into the template. predicted_labels is a plain array, so
# values are assigned by position, not by tripid.
# NOTE(review): assumes sample_submission.csv lists trips in the same order as test.csv - confirm.
submission_set['prediction']= predicted_labels

In [None]:
import glob

# Save the submission under <data_path>/lgbm3/ as lgbm3_<n>.csv, choosing the first
# <n> that is not taken so that earlier runs are never overwritten. The output
# location follows `data_path` instead of a hard-coded Colab Drive path.
dirname = os.path.join(data_path, 'lgbm3')
filename = os.path.join(dirname, 'lgbm3_{%i}.csv')
fileversion = 1

# exist_ok avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(dirname, exist_ok=True)
# A plain existence check; glob would treat characters such as '[' in the path as a pattern
while os.path.exists(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=True)
print("Completed!")



In [None]:
# Sanity check: how many trips were predicted as each class
submission_set['prediction'].value_counts()