In [1]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import math
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import os
import glob

In [2]:

data_path='/home/madnisal/Documents/ML_Project/datasets/'
training_df = pd.read_csv(data_path+'train.csv', index_col="tripid")

In [3]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.) **2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [4]:
training_df['pickup_time'] = pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M")
training_df['drop_time'] = pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M")

In [5]:
training_df = training_df.assign(timeOfDay=pd.cut(training_df.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [6]:
training_df.loc[training_df['timeOfDay'] == 'day', 'isNormalCharge'] = 1
training_df.loc[training_df['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [7]:
durations = []
for index,row in training_df.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

training_df.insert(4,"time_dif",durations)

In [8]:
new_column = []                    #empty column for distance
for index,row in training_df.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

training_df.insert(4,"distance",new_column)

In [9]:
training_df['time_driven'] = training_df['duration']  - training_df['meter_waiting']

In [10]:
chargeperhours = []
for index,row in training_df.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

training_df.insert(4,'charge_per_hour',chargeperhours)


In [11]:
training_df['driving_fare'] = training_df['fare']  - training_df['meter_waiting_fare'] - training_df['additional_fare']

In [12]:
avgspeeds = []
for index,row in training_df.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

training_df.insert(4,"avg_speed",avgspeeds)



In [13]:
costsperkm = []
for index,row in training_df.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

training_df.insert(4,"cost_per_km",costsperkm)

In [14]:
training_df = training_df.replace({'label': {'incorrect': 0, 'correct' : 1}})

In [15]:
training_df.columns

Index(['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'cost_per_km', 'avg_speed', 'charge_per_hour', 'distance', 'time_dif',
       'meter_waiting_till_pickup', 'pickup_time', 'drop_time', 'pick_lat',
       'pick_lon', 'drop_lat', 'drop_lon', 'fare', 'label', 'timeOfDay',
       'isNormalCharge', 'time_driven', 'driving_fare'],
      dtype='object')

In [16]:
training_df.to_csv('/home/madnisal/Documents/ML_Project/datasets/full_fe_dataset.csv')

In [17]:
#training_columns = ['duration','meter_waiting','meter_waiting_fare','fare','additional_fare']

In [18]:
training_columns = ['meter_waiting','meter_waiting_fare','fare','additional_fare', 'distance','cost_per_km', 'avg_speed',  'time_dif','time_driven', 'charge_per_hour', 'driving_fare', 'isNormalCharge','pick_lat','pick_lon','drop_lat','drop_lon']

In [None]:
#training_columns = ['additional_fare', 'meter_waiting','cost_per_km', 'avg_speed', 'charge_per_hour', 'time_dif','isNormalCharge', 'time_driven']

In [None]:
#training_columns = ['duration','meter_waiting','meter_waiting_fare','fare','additional_fare', 'distance','cost_per_km', 'avg_speed',  'time_dif','time_driven', 'charge_per_hour', 'driving_fare', 'isNormalCharge']

In [19]:
target_column = ['label']

In [20]:
del_list = [189146174, 189157607, 189160756, 189176766, 189182800, 189191865, 189227008, 189236485, 189252401, 189259043, 189277797, 189284005, 189307311, 189323004, 189345964, 189363308, 189370252, 189372731, 189391717, 189394373, 189420181, 189428397, 189431382, 189433901, 189445521, 189449518, 189465650, 189475280, 189551186, 189572039, 189615129, 189623692, 189626199, 189627696, 189640410, 189655834, 189667686, 189670152, 189698072, 189738377, 189737409, 189734544, 189748039, 189747601, 189752791, 189778822, 189801272, 189820422, 189820090, 189820141, 189821179, 189824507, 189826263, 189829043, 189826754, 189835514, 189841958, 189842329, 189876756, 189877959, 189882220, 189884798, 189894385, 189893586, 189925207, 189980243, 189978643, 189995541, 190007029, 190008226, 190009896, 190010618, 190017339, 190023371, 190025934, 190042045, 190050433, 190050729, 190056653, 190059653, 190065135, 190066074, 190070075, 190080854, 190100669, 190108454, 190166142, 190173251, 190176497, 190184600, 190189374, 190196355, 190207045, 190212978, 190218103, 190220877, 190222676, 190240764, 190242569, 190248159, 190256099, 190258689, 190278813, 190284141, 190285346, 190286692, 190240827, 190292867, 190295160, 190294183, 190314264, 190315050, 190315565, 190317323, 190350800, 190356522, 190357494, 190368394, 190367842, 190371740, 190377182, 190375811, 190387837, 190390160, 190408980, 190414341, 190419497, 190429209, 190439600, 190465231, 190519002, 190523186, 190579195, 190587941, 190607178, 190616564, 190618151, 190624183, 190630270, 190632321, 190641730, 190651252, 190679701, 190757034, 190757593, 190758046, 190759936, 190762493, 190761267, 190764238, 190768625, 190804715, 190828141, 190858547, 190865763, 190868224, 190871768, 190875041, 190892867, 190906258, 190930144, 190932899, 190944700, 190945647, 190947363, 190966252, 190976655, 190977696, 190988783, 190996095, 191004988, 191022913, 191023223, 191062441, 191067600, 191062464, 191075868, 191090473, 191093204, 191130876, 191138239, 191141096, 191198064, 191210446, 191253555, 191256825, 191258602, 191263388, 191269029, 191269975, 191276742, 191277252, 191314169, 191315141, 191315804, 191321554, 191335143, 191342696, 191360752, 191362317, 191364887, 191367425, 191376211, 191377695, 191379325, 191378896, 191409793, 191425016, 191435886, 191457855, 191475356, 191476806, 191486731, 191494949, 191506681, 191520179, 191542329, 191548899, 191557599, 191562514, 191566068, 191567422, 191567745, 191569507, 191576838, 191576199, 191581912, 191584433, 191583906, 191608474, 191627237, 191630429, 191633349, 191629242, 191664140, 191846327, 191861124, 191861723, 191868734, 191870265, 191877641, 191877193, 191879586, 191947888, 191948291, 191949862, 191952868, 191961350, 191979263, 192000414, 191999785, 191999261, 192031574, 192035213, 192045514, 192048394, 192045536, 192066074, 192079256, 192080347, 192104690, 192118009, 192118666, 192121391, 192124522, 192143515, 192147872, 192151525, 192174228, 192190626, 192201478, 192227971, 192255373, 192281858, 192316353, 192342254, 192342422, 192344630, 192361082, 192368397, 192399235, 192402811, 192428282, 192424245, 192459342, 192589342, 192603743, 192699918, 192711250, 192721698, 192726338, 192725156, 192726168, 192728109, 192732272, 192740233, 192742121, 192755442, 192814679, 192901772, 192912876, 192976029, 193029201, 193028412, 193035660, 193038905, 193044425, 193048938, 193070062, 193063904, 193074515, 193123860, 193152407, 193164831, 193245329, 193257204, 193261350, 193264575, 193272764, 193303399, 193309783, 193333812, 193338047, 193350329, 193357510, 193358705, 193361902, 193363687, 193376278, 193378444, 193404526, 193341290, 193415117, 193417457, 193453026, 193456673, 193478316, 193481764, 193509482, 193528284, 193580371, 193585269, 193586336, 193592999, 193592887, 193594379, 193594530, 193597848, 193598400, 193607561, 193611450, 193640628, 193649132, 193653017, 193659984, 193682008, 193728458, 193746200, 193749854, 193753968, 193751975, 193760421, 193777686, 193779107, 193781025, 193796406, 193830196, 193838110, 193841338, 193848497, 193854103, 193855448, 193855250, 193863537, 193868492, 193874389, 193874272, 193882049, 193886201, 193889129, 193893878, 193910559, 193914033, 193912639, 193962847, 193964546, 193976530, 194037315, 194220674, 194253929, 194264005, 194274360, 194280450, 194280574, 194287387, 194287357, 194292672, 194305074, 194320960, 194328670, 194331092, 194329792, 194337863, 194359700, 194383143, 194399192, 194399774, 194418073, 194433063, 194464805, 194468034, 194479410, 194489229, 194497705, 194499804, 194502068, 194509368, 194514067, 194541864, 194542457, 194549922, 194551382, 194555324, 194561693, 194565631, 194565756, 194569955, 194579174, 194581698, 194590549, 194607215, 194617573, 194635555, 194640103, 194683585, 194702674, 194708987, 194713405, 194714722, 194717538, 194720261, 194723760, 194731697, 194732335, 194732737, 194751424, 194744996, 194753074, 194761973, 194777096, 194790667, 194794451, 194805191, 194804469, 194816937, 194824749, 194857616, 194869289, 194896456, 194936889, 194979450, 195003342, 195024725, 195033806, 195038854, 195039169, 195048097, 195052866, 195063269, 195068585, 195083607, 195088766, 195090658, 195097288, 195098361, 195100913, 195101510, 195113569, 195114874, 195114045, 195171803, 195205081, 195224171, 195229926, 195230565, 195260495, 195270255, 195273761, 195283686, 195299293, 195309305, 195345823, 195351363, 195355084, 195363067, 195361340, 195369393, 195376127, 195385934, 195397207, 195409793, 195410990, 195416679, 195420494, 195431944, 195463949, 195468490, 195473412, 195495838, 195500691, 195506533, 195563106, 195564195, 195565333, 195564856, 195565562, 195568649, 195570567, 195575895, 195580309, 195591012, 195591902, 195598576, 195618766, 195619594, 195635639, 195645883, 195657247, 195659920, 195673572, 195749648, 195762398, 195790876, 195793708, 195805203, 195857995, 195866339, 195879603, 195885444, 195885280, 195898832, 195909893, 195909616, 195919143, 195962233, 195969582, 195980281, 195998065, 196003266, 196001405, 196164069, 196260854, 196371958, 196401417, 196472675, 196474232, 196476814, 196489680, 196502361, 196516473, 196538137, 196542801, 196545947, 196554925, 196556276, 196568972, 196567702, 196575400, 196579222, 196587403, 196605553, 196605043, 196630063, 196745620, 196743487, 196760512, 196804768, 196816765, 196813413, 196855195, 196895659, 196904792, 196940311, 196949947, 196952056, 196958565, 196964604, 196984612, 197003034, 197011767, 197033666, 197034128, 197038067, 197042067, 197048141, 197050724, 197075221, 197102637, 197116181, 197127571, 197125170, 197137102, 197140697, 197150903, 197153402, 197163368, 197191446, 197193632, 197213836, 197214083, 197231761, 197232997, 197234560, 197243992, 197269750, 197298696, 197302262, 197301997, 197307820, 197316519, 197328020, 197345252, 197353709, 197366744, 197368218, 197379241, 197381495, 197397584, 197404081, 197407871, 197406621, 197407955, 197408662, 197418896, 197421930, 197426871, 197427998, 197442371, 197450562, 197453110, 197456305, 197445138, 197467003, 197476080, 197473968, 197480926, 197484275, 197491588, 197495698, 197498181, 197500698, 197540699, 197537284, 197548128, 197571140, 197580331, 197583546, 197583750, 197600970, 197605418, 197631790, 197626954, 197682932, 197700781, 197774280, 197594400, 197814149, 197820451, 197835993, 197870026, 197909760, 197926451, 197936477, 197936799, 197953417, 197955696, 197960243, 197974294, 197976560, 197983480, 197984312, 197999001, 198004755, 198012950, 198015626, 198022013, 198034365, 198076803, 198092335, 198135183, 198180213, 198161827, 198197619, 198206471, 198223689, 198240335, 198295772, 198319288, 198328938, 198328773, 198333166, 198336375, 198340239, 198351064, 198369874, 198373692, 198374726, 198380551, 198411902, 198429392, 198440944, 198447514, 198468720, 198477464, 198480922, 198486416, 198492681, 198510589, 198515089, 198521927, 198529489, 198532618, 198536059, 198567770, 198572753, 198585995, 198598763, 198602509, 198610346, 198622026, 198621653, 198626163, 198652287, 198655101, 198656315, 198658773, 198659662, 198688643, 198704554, 198707374, 198711403, 198717453, 198717134, 198732301, 198733864, 198784617, 198792329, 198798145, 198820236, 198821625, 198823534, 198829320, 198833045, 198836780, 198842846, 198856114, 198860975, 198892495, 198916276, 198965091, 198977232, 198999772, 199006853, 199009084, 199025591, 199037747, 199058849, 199068515, 199065158, 199077731, 199081758, 199104080, 199119071, 199133715, 199156085, 199170570, 199171419, 199170952, 199189167, 199199680, 199203460, 199211192, 199214308, 199214951, 199225357, 199230865, 199229511, 199228117, 199241930, 199242308, 199252297, 199259073, 199281693, 199280657, 199305791, 199309613, 199380826, 199404544, 199401947, 199432288, 199514603, 199550110, 199599231, 199613415, 199660617, 199687827, 199727133, 199736233, 199747822, 199750254, 199780354, 199855973, 199868018, 199882626, 199887339, 199913349, 199917826, 199928577, 199931901, 199961392, 199966663, 199967769, 199972800, 199986019, 199993354, 199999511, 200025918, 200027176, 200058993, 200069579, 200082652, 200108506, 200120578, 200119068, 200136070, 200164850, 200165531, 200169960, 200173539, 200183241, 200203616, 200207250, 200213054, 200235275, 200241069, 200246129, 200258292, 200259718, 200292537, 200310633, 200337448, 200341277, 200342375, 200352210, 200345029, 200351804, 200364758, 200389393, 200411277, 200422666, 200434325, 200436208, 200566992, 200573263, 200589221, 200589396, 200608272, 200620282, 200633216, 200655010, 200660672, 200672880, 200700918, 200716541, 200718521, 200760625, 200764288, 200817490, 200824696, 200829063, 200841461, 200895707, 200907124, 200908537, 200914832, 200931389, 200950475, 200974805, 201141871, 201149678, 201155746, 201158749, 201159410, 201161121, 201167648, 201168697, 201172575, 201174790, 201196145, 201201478, 201208587, 201206769, 201210325, 201211325, 201231734, 201257983, 201260920, 201320867, 201321715, 
            201349894, 201392282, 201405897, 201406349, 201418800, 201424753, 201425532, 201430397, 201441097, 201468074, 201478900, 201482820, 201491225, 201493577, 201534861, 201537012, 201540569, 201583076, 201600204, 201611493, 201616397, 201615141, 201622341, 201622567, 201627128, 201638255, 201640986, 201643832, 201679378, 201679540, 201683301, 201699206, 201704332, 201711200, 201710602, 201718751, 201737874, 201777203, 201784877, 201790269, 201834806, 201838108, 201878555, 201886984, 201887560, 201891757, 201900543, 201903834, 201905135, 201924017, 201924696, 201923779, 201930790, 201950022, 201964744, 201966970, 201969059, 201982478, 201987101, 201997944, 202003422, 202016752, 202019238, 202025752, 202064641, 202114377, 202143350, 202154929, 202148988, 202199418, 202210947, 202251004, 202287824, 202315278, 202370534, 202374449, 202399285, 202408897, 202423261, 202465541, 202472097, 202474852, 202477661, 202491973, 202520764, 202551215, 202578838, 202620728, 202622084, 202623692, 202635901, 202671829, 202716809, 202753353, 202758689, 202770147, 202771469, 202771293, 202776314, 202777247, 202780691, 202787247, 202793273, 202802144, 202815257, 202830659, 202832323, 202830074, 202847546, 202856436, 202873150, 202880624, 202892200, 202895839, 202900993, 202901039, 202917842, 202942020, 202967335, 202970492, 202998968, 203037655, 203041826, 203041135, 203049098, 203067687, 203085909, 203087161, 203088126, 203090887, 203111026, 203136736, 203147196, 203154500, 203156865, 203156316, 203171371, 203198420, 203264641, 203268390, 203275241, 203289347, 203307819, 203305152, 203320914, 203329568, 203343297, 203354199, 203357169, 203377689, 203378652, 203386020, 203385719, 203396911, 203478410, 203475249, 203526068, 203614817, 203619740, 203625630, 203626670, 203634943, 203637853, 203638609, 203641274, 203642705, 203653944, 203652976, 203655377, 203663571, 203694420, 203695038, 203735817, 203736388, 203751946, 203759989, 203783539, 203786014, 203808075, 203805699, 203838464, 203837743, 203915670, 203925709, 203947769, 203951536, 203954795, 203962226, 203970015, 203974277, 203990995, 204004680, 204019002, 204022612, 204029214, 204036970, 204049200, 204048023, 204057308, 204068145, 204081147, 204190852, 204197239, 204235579, 204240489, 204245757, 204250822, 204267004, 204268616, 204283853, 204290169, 204295195, 204297059, 204305036, 204308730, 204323575, 204319494, 204329245, 204364018, 204382088, 204383964, 204385620, 204407357, 204410843, 204412731, 204425668, 204444917, 204486093, 204488274, 204490890, 204499542, 204514423, 204513019, 204519840, 204529573, 204534982, 204540921, 204536331, 204570811, 204576503, 204570020, 204622779, 204637231, 204650797, 204654436, 204654711, 204655606, 204668520, 204672601, 204689321, 204694542, 204745747, 204761457, 204765082, 204775304, 204794047, 204811315, 204838324, 204844294, 204858026, 204877975, 204880076, 204881840, 204905077, 204912178, 204913005, 204932910, 204932258, 204953225, 204969773, 205010702, 205028512, 205033272, 205040832, 205035393, 205054444, 205053610, 205061023, 205061108, 205068251, 205074919, 205088473, 205093108, 205099477, 205110890, 205169146, 205200937, 205230443, 205247279, 205249625, 205284518, 205312460, 205354955, 205356666, 205357599, 205373792, 205372650, 205381262, 205386918, 205397713, 205418312, 205422364, 205458018, 205464065, 205556802, 205624630, 205667210, 205864570, 205870906, 205874631, 205875676, 205897073, 205914653, 205953178, 206006906, 206009807, 206030348, 206230703, 206274069, 206295700, 206312242, 206314604, 206336240, 206342353, 206367780, 206384464, 206400279, 206410922, 206417661, 206441957, 206453190, 206500455, 206628434, 206633517, 206658119, 206686656, 206690799, 206690473, 206696115, 206700288, 206706318, 206709794, 206734980, 206765171, 206769279, 206772545, 206779752, 206793775, 206818921, 206819393, 206821321, 206834432, 206833330, 206835421, 206841298, 206852984, 206879581, 206909672, 206926693, 206925971, 206927884, 206938858, 206949982, 206949784, 206964282, 206975185, 206979117, 206987805, 206988691, 207004869, 207029475, 207060245, 207089136, 207105204, 207124327, 207129713, 207133068, 207133463, 207141930, 207147932, 207149263, 207150455, 207169878, 207170984, 207185711, 207205192, 207211758, 207297176, 207305840, 207316519, 207327757, 207344550, 207362092, 207366182, 207379940, 207384297, 207389433, 207402015, 207405568, 207428016, 207477173, 207489390, 207495497, 207530242, 207533611, 207557863, 207569766, 207571367, 207578104, 207586465, 207598568, 207599890, 207601778, 207606570, 207604246, 207613281, 207614597, 207626595, 207627882, 207632781, 207637448, 207642744, 207614821, 207684582, 207696392, 207700138, 207715964, 207718347, 207715125, 207723641, 207732065, 207730099, 207739129, 207764483, 207766255, 207772884, 207813025, 207826142, 207826319, 207832729, 207848915, 207859130, 207868411, 207873402, 207877505, 207881247, 207886185, 207895791, 207898909, 207905869, 207910697, 207954602, 207966221, 208005967, 208039159, 208044107, 208041139, 208061508, 208065076, 208077873, 208081080, 208089368, 208101585, 208108116, 208159279, 208176995, 208182048, 208183402, 208197893, 208202538, 208205622, 208215823, 208237121, 208239836, 208364938, 208367338, 208395913, 208404080, 208400084, 208409330, 208421825, 208432046, 208459389, 208478615, 208506440, 208510893, 208520089, 208525194, 208543920, 208597810, 208604253, 208612909, 208613802, 208620921, 208621034, 208630653, 208640185, 208648524, 208649108, 208649859, 208670051, 208694242, 208719270, 208756243, 208774653, 208780804, 208786150, 208788111, 208800679, 208805409, 208813654, 208823368, 208824970, 208826848, 208829489, 208838197, 208839288, 208851780, 208867641, 208886427, 208922326, 208922571, 208926768, 208949007, 208974669, 208976040, 208975122, 208980172, 208990936, 209005243, 209013152, 209022771, 209093530, 209099212, 209103770, 209105215, 209105889, 209121072, 209157284, 209188474, 209203396, 209213610, 209273748, 209282303, 209296491, 209298449, 209320437, 209344193, 209354538, 209347025, 209358735, 209391412, 209400517, 209408843, 209420341, 209428933, 209441830, 209478409, 209489337, 209497213, 209499033, 209508842, 209508762, 209513451, 209520116, 209526215, 209600339, 209606067, 209626595, 209627263, 209714447, 209719378, 209785942, 209809890, 209809748, 209817987, 209822072, 209830326, 209830968, 209833884, 209839428, 209868394, 209869099, 209868313, 209881355, 209899881, 209908453, 209919472, 209930461, 209930793, 209938388, 209941545, 209969323, 209971365, 209972600, 209974687, 210005748, 210033272, 210073545, 210086904, 210120283, 210149474, 210154017, 210158716, 210159461, 210157137, 210178003, 210222368, 210232606, 210254638, 210258686, 210302481, 210307581, 210315120, 210316480, 210334710, 210340254, 210341446, 210341829, 210353114, 210352569, 210367498, 210385701, 210389234, 210393082, 210402355, 210400136, 210403458, 210411922, 210411446, 210423048, 210421960, 210427333, 210434690, 210441714, 210444154, 210458048, 210473804, 210470261, 210468314, 210505057, 210521729, 210527836, 210542180, 210574193, 210595481, 210598977, 210615078, 210617878, 210629052, 210627344, 210636628, 210639482, 210653917, 210663029, 210668901, 210671703, 210692312, 210734578, 210739778, 210740396, 210757114, 210757331, 210760462, 210764435, 210771451, 210773022, 210774254, 210780018, 210785741, 210788405, 210796492, 210814027, 210845213, 210853450, 210860657, 210861022, 210869646, 210884899, 210894816, 210902966, 210905360, 210911847, 210913864, 210919849, 210924086, 210924331, 210929367, 210933272, 210939090, 210945448, 210947070, 210948157, 210953120, 210954778, 210959483, 210983705, 210993580, 211001049, 211002281, 211001741, 211000862, 211004948, 211009834, 211031561, 211036333, 211036440, 211042868, 211044101, 211044858, 211046064, 211090933, 211112852, 211112320, 211121898, 211157832, 211159693, 211164631, 211207421, 211211419, 211211246, 211212170, 211212751, 211219023, 211222750, 211226655, 211232429, 211233198, 211240547, 211250235, 211261425, 211271346, 211294360, 211370384, 211377431, 211394776, 211399436, 211397757, 211409420, 211412322, 211416017, 211449915, 211463411, 211461241, 211471146, 211478629, 211483381, 211496213, 211490695, 211503968, 211506815, 211510613, 211510214, 211508895, 211546473, 211601943, 211605950, 211619444, 211632914, 211637866, 211640592, 211642336, 211644465, 211645955, 211649249, 211652920, 211664080, 211687945, 211711355, 211719584, 211733997, 211749880, 211773923, 211778961, 211819633, 211823838, 211826760, 211836871, 211840640, 211840921, 211854172, 211867209, 211877853, 211878759, 211886322, 211886137, 211911657, 211924899, 211939462, 211955405, 211963440, 211964385, 211965361, 211987712, 212032780, 212047120, 212056863, 212074109, 212075792, 212073836, 212076776, 212078750, 212082114, 212086503, 212096849, 212108174, 212117989, 212151161, 212150421, 212151797, 212157536, 212160704, 212165685, 212164327, 212188818, 212233687, 212237076, 212255727, 212277492, 212300433, 212307853, 212308124, 212337435, 212362151, 212362219, 212369304, 212390182, 212389971, 212391554, 212399844, 212451379, 212497108, 212505782, 212507575, 212509789, 212528361, 212535851, 212548279, 212574657, 212573729, 212580444, 212604058, 212614175, 212630316, 212655130, 212654026, 212677194, 212678991, 212695088, 212696863, 212707890, 212719117, 212727900, 212741154, 212749966, 212755005, 212755869, 212769525, 212772958, 212781236, 212798138, 212799567, 212803597, 212810230, 212810280, 212837339, 212844909, 212864328, 212869383, 212873948, 212889600, 212942286, 212943888, 212951457, 212954387, 212961544, 212962761, 212969826, 212972465, 212973690, 212975841, 212978502, 212994171, 212995896, 213017985, 213026032, 213031411, 213044679, 213058845, 213080720, 213083881, 213107913, 213102254, 213138977, 213171583, 213203777, 213219435, 213220067, 213223493, 213222760, 213222963, 213226135, 213231319, 213246453, 213259070, 213266336, 213271023, 213319817, 213326387, 213332042, 213378908, 213380528, 213392029, 213396369, 213398246, 213416659, 213467729, 213481304, 213496703, 213500996, 213501578, 213519888, 213531224, 213539421, 213552291, 213567037, 213592205, 213610505, 213618688, 213639520, 213647286, 213647844, 213667074, 213689462, 213721660, 213743810, 213787716, 213795752]

In [21]:
for i in del_list :
    training_df = training_df.drop(i)

In [None]:
training_df = training_df.drop(190167541)

In [22]:
x = training_df[training_columns].values
y = training_df[target_column].values

In [None]:
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [23]:
training_df['label'].value_counts()

1    13926
0     1402
Name: label, dtype: int64

<h2>Hyper tuning</h2>

In [None]:
param_grid = {
    'num_leaves': [15,31, 63],
    'learning_rate': [0.1, 0.01],
    'class_weight' : [{0:4,1:1}, {0:3,1:1}],
    'num_boosting_rounds': [100,500],
    'max_bins': [10,100,1000],
    'n_estimators': [50,100,200],
    'reg_alpha': [0.1, 0.5],
    'random_state': [1,8,16,64],
    'min_data_in_leaf': [30,100,400],
    'lambda_l2': [0, 1]
    }

In [None]:
model = lightgbm.LGBMClassifier()

In [None]:
gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X=x, y=y)

In [None]:
gs = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=gkf, scoring='f1_macro',
    verbose=True, n_jobs=3)

In [None]:
gs.fit(x, y.ravel())

In [None]:
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

In [None]:
predicted_y = gs.predict(x_test)
train_pred_y = gs.predict(x)

In [None]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))

print(); print(metrics.classification_report(y_test, predicted_y))
print(); print(metrics.confusion_matrix(y_test, predicted_y))

<h2>Choosing the best model</h2>

In [24]:
import math

In [25]:
scale_pos_weight = math.sqrt(1681/15494)

In [26]:
scale_pos_weight

0.32938368270125923

In [27]:
from sklearn.metrics import f1_score
def evaluate_macroF1_lgb(y_true, y_pred):  
    y_hat = np.where(y_pred < 0.5, 0, 1) 
    f1 = f1_score(y_true, y_hat, average='macro')
    return ('macroF1', f1, True) 

In [28]:
def focal_loss_lgb_eval_error(y_true, y_pred, alpha=.25, gamma=2.):
    a,g = alpha, gamma
    p = 1/(1+np.exp(-y_pred))
    loss = -( a*y_true + (1-a)*(1-y_true) ) * (( 1 - ( y_true*p + (1-y_true)*(1-p)) )**g) * ( y_true*np.log(p)+(1-y_true)*np.log(1-p) )
    return 'focal_loss', np.mean(loss), False

In [29]:
gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [30]:
model = lightgbm.LGBMClassifier(boosting_type='gbdt',scale_pos_weight = scale_pos_weight, learning_rate=0.01, max_bins=10, min_data_in_leaf=60, n_estimators=100, num_iterations=1000, num_leaves=63, random_state=1, reg_alpha=0.1, metric=["custom",'binary_logloss'],early_stopping_rounds=250)

In [31]:
model = lightgbm.LGBMClassifier(class_weight={0:3,1:1}, learning_rate=0.1)

In [32]:
for training_index, testing_index in gkf.split(X=x, y=y):
    x_train_fold, y_train_fold = x[training_index], y[training_index]
    x_test_fold, y_test_fold = x[testing_index], y[testing_index]
    model.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold,y_test_fold),eval_metric = lambda y_true, y_pred: [evaluate_macroF1_lgb(y_true,y_pred)])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1]	valid_0's binary_logloss: 0.342827	valid_0's macroF1: 0.476077
[2]	valid_0's binary_logloss: 0.32049	valid_0's macroF1: 0.476077
[3]	valid_0's binary_logloss: 0.300154	valid_0's macroF1: 0.476077
[4]	valid_0's binary_logloss: 0.283826	valid_0's macroF1: 0.476077
[5]	valid_0's binary_logloss: 0.268752	valid_0's macroF1: 0.779391
[6]	valid_0's binary_logloss: 0.255441	valid_0's macroF1: 0.802469
[7]	valid_0's binary_logloss: 0.244584	valid_0's macroF1: 0.811228
[8]	valid_0's binary_logloss: 0.235258	valid_0's macroF1: 0.82326
[9]	valid_0's binary_logloss: 0.226948	valid_0's macroF1: 0.831289
[10]	valid_0's binary_logloss: 0.218745	valid_0's macroF1: 0.841113
[11]	valid_0's binary_logloss: 0.211809	valid_0's macroF1: 0.837879
[12]	valid_0's binary_logloss: 0.206051	valid_0's macroF1: 0.840602
[13]	valid_0's binary_logloss: 0.200981	valid_0's macroF1: 0.838243
[14]	valid_0's binary_logloss: 0.196149	valid_0's macroF1: 0.840954
[15]	valid_0's binary_logloss: 0.192324	valid_0's macroF1: 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[2]	valid_0's binary_logloss: 0.318844	valid_0's macroF1: 0.475987
[3]	valid_0's binary_logloss: 0.299396	valid_0's macroF1: 0.475987
[4]	valid_0's binary_logloss: 0.283126	valid_0's macroF1: 0.475987
[5]	valid_0's binary_logloss: 0.269022	valid_0's macroF1: 0.782576
[6]	valid_0's binary_logloss: 0.256795	valid_0's macroF1: 0.802424
[7]	valid_0's binary_logloss: 0.245939	valid_0's macroF1: 0.820323
[8]	valid_0's binary_logloss: 0.236392	valid_0's macroF1: 0.828581
[9]	valid_0's binary_logloss: 0.228706	valid_0's macroF1: 0.833442
[10]	valid_0's binary_logloss: 0.221451	valid_0's macroF1: 0.827926
[11]	valid_0's binary_logloss: 0.215464	valid_0's macroF1: 0.831261
[12]	valid_0's binary_logloss: 0.209082	valid_0's macroF1: 0.831546
[13]	valid_0's binary_logloss: 0.203571	valid_0's macroF1: 0.831816
[14]	valid_0's binary_logloss: 0.198844	valid_0's macroF1: 0.830279
[15]	valid_0's binary_logloss: 0.194585	valid_0's macroF1: 0.83131
[16]	valid_0's binary_logloss: 0.190594	valid_0's macroF1

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1]	valid_0's binary_logloss: 0.34189	valid_0's macroF1: 0.475987
[2]	valid_0's binary_logloss: 0.317713	valid_0's macroF1: 0.475987
[3]	valid_0's binary_logloss: 0.297914	valid_0's macroF1: 0.475987
[4]	valid_0's binary_logloss: 0.280705	valid_0's macroF1: 0.695136
[5]	valid_0's binary_logloss: 0.26586	valid_0's macroF1: 0.789477
[6]	valid_0's binary_logloss: 0.253033	valid_0's macroF1: 0.810577
[7]	valid_0's binary_logloss: 0.241191	valid_0's macroF1: 0.825163
[8]	valid_0's binary_logloss: 0.231582	valid_0's macroF1: 0.830797
[9]	valid_0's binary_logloss: 0.222981	valid_0's macroF1: 0.832224
[10]	valid_0's binary_logloss: 0.215171	valid_0's macroF1: 0.834876
[11]	valid_0's binary_logloss: 0.20779	valid_0's macroF1: 0.8361
[12]	valid_0's binary_logloss: 0.201411	valid_0's macroF1: 0.840519
[13]	valid_0's binary_logloss: 0.195694	valid_0's macroF1: 0.842953
[14]	valid_0's binary_logloss: 0.190725	valid_0's macroF1: 0.84607
[15]	valid_0's binary_logloss: 0.186117	valid_0's macroF1: 0.84

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[25]	valid_0's binary_logloss: 0.166535	valid_0's macroF1: 0.858648
[26]	valid_0's binary_logloss: 0.165035	valid_0's macroF1: 0.858648
[27]	valid_0's binary_logloss: 0.163504	valid_0's macroF1: 0.857885
[28]	valid_0's binary_logloss: 0.162043	valid_0's macroF1: 0.858648
[29]	valid_0's binary_logloss: 0.160594	valid_0's macroF1: 0.86218
[30]	valid_0's binary_logloss: 0.159468	valid_0's macroF1: 0.860953
[31]	valid_0's binary_logloss: 0.158349	valid_0's macroF1: 0.864509
[32]	valid_0's binary_logloss: 0.157285	valid_0's macroF1: 0.862502
[33]	valid_0's binary_logloss: 0.156798	valid_0's macroF1: 0.860953
[34]	valid_0's binary_logloss: 0.156122	valid_0's macroF1: 0.86218
[35]	valid_0's binary_logloss: 0.154931	valid_0's macroF1: 0.860639
[36]	valid_0's binary_logloss: 0.154229	valid_0's macroF1: 0.859108
[37]	valid_0's binary_logloss: 0.153626	valid_0's macroF1: 0.859108
[38]	valid_0's binary_logloss: 0.153241	valid_0's macroF1: 0.858648
[39]	valid_0's binary_logloss: 0.153091	valid_0's 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[20]	valid_0's binary_logloss: 0.176135	valid_0's macroF1: 0.851977
[21]	valid_0's binary_logloss: 0.174265	valid_0's macroF1: 0.849206
[22]	valid_0's binary_logloss: 0.172126	valid_0's macroF1: 0.850458
[23]	valid_0's binary_logloss: 0.170519	valid_0's macroF1: 0.851707
[24]	valid_0's binary_logloss: 0.168827	valid_0's macroF1: 0.851217
[25]	valid_0's binary_logloss: 0.167036	valid_0's macroF1: 0.850685
[26]	valid_0's binary_logloss: 0.165646	valid_0's macroF1: 0.850196
[27]	valid_0's binary_logloss: 0.164949	valid_0's macroF1: 0.85095
[28]	valid_0's binary_logloss: 0.164027	valid_0's macroF1: 0.850196
[29]	valid_0's binary_logloss: 0.162348	valid_0's macroF1: 0.848198
[30]	valid_0's binary_logloss: 0.161527	valid_0's macroF1: 0.850196
[31]	valid_0's binary_logloss: 0.15973	valid_0's macroF1: 0.850421
[32]	valid_0's binary_logloss: 0.158266	valid_0's macroF1: 0.851922
[33]	valid_0's binary_logloss: 0.157586	valid_0's macroF1: 0.851922
[34]	valid_0's binary_logloss: 0.156847	valid_0's 

In [33]:
predicted_y = model.predict(x_test)
train_pred_y = model.predict(x)

NameError: name 'x_test' is not defined

In [34]:
train_pred_y = model.predict(x)

In [None]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))

print(); print(metrics.classification_report(y_test, predicted_y))
print(); print(metrics.confusion_matrix(y_test, predicted_y))

In [35]:
print(); print(metrics.classification_report(y,train_pred_y))
print(); print(metrics.confusion_matrix(y, train_pred_y))


              precision    recall  f1-score   support

           0       0.90      0.88      0.89      1402
           1       0.99      0.99      0.99     13926

    accuracy                           0.98     15328
   macro avg       0.94      0.93      0.94     15328
weighted avg       0.98      0.98      0.98     15328


[[ 1229   173]
 [  141 13785]]


<h2>Testing</h2>

In [36]:

test_set = pd.read_csv(data_path+'test.csv', index_col="tripid")


In [37]:
test_set['pickup_time'] = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_set['drop_time'] = pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M")

In [38]:
test_set = test_set.assign(timeOfDay=pd.cut(test_set.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [39]:
test_set.loc[test_set['timeOfDay'] == 'day', 'isNormalCharge'] = 1
test_set.loc[test_set['timeOfDay'] != 'day', 'isNormalCharge'] = 0

In [40]:
durations = []
for index,row in test_set.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

test_set.insert(4,"time_dif",durations)

In [41]:
new_column = []                    #empty column for distance
for index,row in test_set.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

test_set.insert(4,"distance",new_column)

In [42]:
test_set['time_driven'] = test_set['duration']  - test_set['meter_waiting']

In [43]:
chargeperhours = []
for index,row in test_set.iterrows():
    if(row['meter_waiting'] == 0):
        chargeperhour = np.nan
    else:
        chargeperhour = (row['meter_waiting_fare'] / row['meter_waiting'] * 3600)
    chargeperhours.append(chargeperhour)

test_set.insert(4,'charge_per_hour',chargeperhours)


In [44]:
test_set['driving_fare'] = test_set['fare']  - test_set['meter_waiting_fare'] - test_set['additional_fare']

In [45]:
avgspeeds = []
for index,row in test_set.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan    
    else:
        avgspeed = (row['distance'] / row['time_driven'] * 3600)
    avgspeeds.append(avgspeed)

test_set.insert(4,"avg_speed",avgspeeds)



In [46]:
costsperkm = []
for index,row in test_set.iterrows():
    if row['distance'] == 0:
        costperkm = np.nan
            
    else:
        costperkm = (row['driving_fare'] / row['distance'])
    costsperkm.append(costperkm)

test_set.insert(4,"cost_per_km",costsperkm)

In [47]:
test_features = test_set[training_columns]

In [48]:
test_features.isna().sum()

meter_waiting           0
meter_waiting_fare      0
fare                    0
additional_fare         0
distance                0
cost_per_km            25
avg_speed               9
time_dif                0
time_driven             0
charge_per_hour       298
driving_fare            0
isNormalCharge          0
pick_lat                0
pick_lon                0
drop_lat                0
drop_lon                0
dtype: int64

In [49]:
predicted_labels = model.predict(test_features)

In [50]:
predicted_labels_df = pd.DataFrame(predicted_labels )

In [51]:
sub_path =os.path.abspath(os.path.join(data_path+'/sample_submission.csv'))
submission_set = pd.read_csv(sub_path, index_col="tripid")

In [52]:
submission_set['prediction']= predicted_labels_df.values[:,0]

In [53]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "theNotebook = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [59]:
filename = '../../submissions/'+theNotebook+'/'+theNotebook+'_{%i}.csv'
dirname = '../../submissions/'+theNotebook
fileversion = 1

if not os.path.exists(dirname):
    os.makedirs(dirname)
while glob.glob(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=True)
print("Completed!")

Completed!


In [54]:
submission_set['prediction'].value_counts()

1    8042
0     534
Name: prediction, dtype: int64

<h2>testing against others</h2>

In [55]:
best_path =os.path.abspath(os.path.join('/home/madnisal/Documents/ML_Project/submissions/grid-lgbm/lgbm3_19.csv'))
best_set = pd.read_csv(best_path, index_col="tripid")

In [56]:
best_set['prediction'].value_counts()

1    8040
0     536
Name: prediction, dtype: int64

In [58]:
best_set[best_set['prediction'] != submission_set['prediction']].shape

(146, 1)