In [122]:

import pandas as pd
import numpy as np
import xgboost as xgb
from geopy.geocoders import Nominatim
import pickle
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

In [123]:
sample_df = pd.read_csv(r"C:\Users\Abhinav\Desktop\train.csv")

In [124]:
sample_df.shape

(1458644, 11)

In [125]:

sample_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


# Data Preprocessing

In [126]:
sample_df["store_and_fwd_flag"].value_counts()

N    1450599
Y       8045
Name: store_and_fwd_flag, dtype: int64

In [127]:
#Convert character variables to numeric 
f = lambda x: 0 if x == 'N' else 1

sample_df["store_and_fwd_flag"] = sample_df["store_and_fwd_flag"].apply(lambda x: f(x))

In [128]:
#Check result
sample_df["store_and_fwd_flag"].value_counts()

0    1450599
1       8045
Name: store_and_fwd_flag, dtype: int64

# Engineer features

In [129]:
#First, convert datetime strings into datetime
sample_df["dropoff_datetime"] = pd.to_datetime(sample_df["dropoff_datetime"], format='%Y-%m-%d %H:%M:%S')
sample_df["pickup_datetime"] = pd.to_datetime(sample_df["pickup_datetime"], format='%Y-%m-%d %H:%M:%S')

In [130]:
#Now construct other variables, like month, date, etc.
sample_df["pickup_month"] = sample_df["pickup_datetime"].dt.month
sample_df["pickup_day"] = sample_df["pickup_datetime"].dt.day
sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday #sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday_name
sample_df["pickup_hour"] = sample_df["pickup_datetime"].dt.hour
sample_df["pickup_minute"] = sample_df["pickup_datetime"].dt.minute

In [131]:
#Get latitude and longitude differences 
sample_df["latitude_difference"] = sample_df["dropoff_latitude"] - sample_df["pickup_latitude"]
sample_df["longitude_difference"] = sample_df["dropoff_longitude"] - sample_df["pickup_longitude"]

In [132]:
#Convert duration to minutes for easier interpretation
sample_df["trip_duration"] = sample_df["trip_duration"].apply(lambda x: round(x/60))

In [133]:
#Convert trip distance from longitude and latitude differences to Manhattan distance.
sample_df["trip_distance"] = 0.621371 * 6371 * (abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2)))))) + \
                                     abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2)))))))

In [134]:
sample_df.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,8,3,14,0,17,24,-0.002335,0.017525,1.372146
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,11,6,12,6,0,43,-0.007412,-0.019066,1.82944
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,35,1,19,1,11,35,-0.053852,-0.026306,5.538397
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,7,4,6,2,19,32,-0.013252,-0.002228,1.069567
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,7,3,26,5,13,30,-0.010689,0.00013,0.747485


# Modelling

In [135]:
X = sample_df.drop(["trip_duration", "id", "vendor_id", "pickup_datetime", "dropoff_datetime"], axis=1)
y = sample_df["trip_duration"]

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)

In [137]:
#Define evaluation metric
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

In [138]:
#XGBoost parameters 
params = {
    'booster':            'gbtree',
    'objective':          'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'silent':             1,
    'feval':              'rmsle'
}

In [139]:
nrounds = 2000

In [140]:
#Define train and validation sets
dtrain = xgb.DMatrix(X_train, np.log(y_train+1))
dval = xgb.DMatrix(X_val, np.log(y_val+1))

#this is for tracking the error
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [141]:
#Train model
gbm = xgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                evals = watchlist,
                verbose_eval = True
                )


Parameters: { feval, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	eval-rmse:2.00600	train-rmse:2.00533
[1]	eval-rmse:1.90977	train-rmse:1.90919
[2]	eval-rmse:1.81870	train-rmse:1.81809
[3]	eval-rmse:1.73243	train-rmse:1.73183
[4]	eval-rmse:1.65036	train-rmse:1.64983
[5]	eval-rmse:1.57251	train-rmse:1.57196
[6]	eval-rmse:1.49885	train-rmse:1.49826
[7]	eval-rmse:1.42934	train-rmse:1.42875
[8]	eval-rmse:1.36316	train-rmse:1.36246
[9]	eval-rmse:1.30042	train-rmse:1.29947
[10]	eval-rmse:1.24151	train-rmse:1.24039
[11]	eval-rmse:1.18619	train-rmse:1.18496
[12]	eval-rmse:1.13395	train-rmse:1.13259
[13]	eval-rmse:1.08366	train-rmse:1.08198
[14]	eval-rmse:1.03616	train-rmse:1.03430
[15]	eval-rmse:0.99153	train-rmse:0.98946
[16]	eval-rmse:0.94900	train-rmse:0.94675
[17]	ev

[180]	eval-rmse:0.33259	train-rmse:0.22722
[181]	eval-rmse:0.33258	train-rmse:0.22711
[182]	eval-rmse:0.33254	train-rmse:0.22686
[183]	eval-rmse:0.33252	train-rmse:0.22651
[184]	eval-rmse:0.33251	train-rmse:0.22618
[185]	eval-rmse:0.33249	train-rmse:0.22576
[186]	eval-rmse:0.33243	train-rmse:0.22523
[187]	eval-rmse:0.33244	train-rmse:0.22503
[188]	eval-rmse:0.33244	train-rmse:0.22491
[189]	eval-rmse:0.33238	train-rmse:0.22460
[190]	eval-rmse:0.33237	train-rmse:0.22428
[191]	eval-rmse:0.33235	train-rmse:0.22421
[192]	eval-rmse:0.33233	train-rmse:0.22384
[193]	eval-rmse:0.33227	train-rmse:0.22362
[194]	eval-rmse:0.33226	train-rmse:0.22323
[195]	eval-rmse:0.33224	train-rmse:0.22317
[196]	eval-rmse:0.33217	train-rmse:0.22286
[197]	eval-rmse:0.33203	train-rmse:0.22229
[198]	eval-rmse:0.33200	train-rmse:0.22209
[199]	eval-rmse:0.33200	train-rmse:0.22199
[200]	eval-rmse:0.33188	train-rmse:0.22136
[201]	eval-rmse:0.33185	train-rmse:0.22102
[202]	eval-rmse:0.33171	train-rmse:0.22057
[203]	eval-

[371]	eval-rmse:0.32917	train-rmse:0.19053
[372]	eval-rmse:0.32916	train-rmse:0.19043
[373]	eval-rmse:0.32915	train-rmse:0.19017
[374]	eval-rmse:0.32915	train-rmse:0.19007
[375]	eval-rmse:0.32915	train-rmse:0.18995
[376]	eval-rmse:0.32915	train-rmse:0.18989
[377]	eval-rmse:0.32915	train-rmse:0.18987
[378]	eval-rmse:0.32913	train-rmse:0.18960
[379]	eval-rmse:0.32912	train-rmse:0.18946
[380]	eval-rmse:0.32911	train-rmse:0.18917
[381]	eval-rmse:0.32910	train-rmse:0.18901
[382]	eval-rmse:0.32909	train-rmse:0.18872
[383]	eval-rmse:0.32908	train-rmse:0.18847
[384]	eval-rmse:0.32908	train-rmse:0.18837
[385]	eval-rmse:0.32908	train-rmse:0.18833
[386]	eval-rmse:0.32908	train-rmse:0.18827
[387]	eval-rmse:0.32904	train-rmse:0.18806
[388]	eval-rmse:0.32903	train-rmse:0.18802
[389]	eval-rmse:0.32903	train-rmse:0.18800
[390]	eval-rmse:0.32903	train-rmse:0.18780
[391]	eval-rmse:0.32903	train-rmse:0.18765
[392]	eval-rmse:0.32902	train-rmse:0.18747
[393]	eval-rmse:0.32902	train-rmse:0.18732
[394]	eval-

[562]	eval-rmse:0.32812	train-rmse:0.16792
[563]	eval-rmse:0.32812	train-rmse:0.16790
[564]	eval-rmse:0.32811	train-rmse:0.16781
[565]	eval-rmse:0.32811	train-rmse:0.16769
[566]	eval-rmse:0.32811	train-rmse:0.16762
[567]	eval-rmse:0.32811	train-rmse:0.16751
[568]	eval-rmse:0.32811	train-rmse:0.16740
[569]	eval-rmse:0.32810	train-rmse:0.16721
[570]	eval-rmse:0.32810	train-rmse:0.16716
[571]	eval-rmse:0.32810	train-rmse:0.16707
[572]	eval-rmse:0.32810	train-rmse:0.16698
[573]	eval-rmse:0.32809	train-rmse:0.16683
[574]	eval-rmse:0.32809	train-rmse:0.16680
[575]	eval-rmse:0.32809	train-rmse:0.16673
[576]	eval-rmse:0.32808	train-rmse:0.16657
[577]	eval-rmse:0.32808	train-rmse:0.16653
[578]	eval-rmse:0.32808	train-rmse:0.16646
[579]	eval-rmse:0.32808	train-rmse:0.16640
[580]	eval-rmse:0.32808	train-rmse:0.16636
[581]	eval-rmse:0.32808	train-rmse:0.16631
[582]	eval-rmse:0.32808	train-rmse:0.16613
[583]	eval-rmse:0.32808	train-rmse:0.16598
[584]	eval-rmse:0.32808	train-rmse:0.16588
[585]	eval-

[753]	eval-rmse:0.32770	train-rmse:0.15172
[754]	eval-rmse:0.32770	train-rmse:0.15163
[755]	eval-rmse:0.32770	train-rmse:0.15152
[756]	eval-rmse:0.32770	train-rmse:0.15146
[757]	eval-rmse:0.32770	train-rmse:0.15141
[758]	eval-rmse:0.32770	train-rmse:0.15136
[759]	eval-rmse:0.32770	train-rmse:0.15132
[760]	eval-rmse:0.32769	train-rmse:0.15124
[761]	eval-rmse:0.32769	train-rmse:0.15119
[762]	eval-rmse:0.32769	train-rmse:0.15114
[763]	eval-rmse:0.32768	train-rmse:0.15107
[764]	eval-rmse:0.32768	train-rmse:0.15102
[765]	eval-rmse:0.32768	train-rmse:0.15094
[766]	eval-rmse:0.32768	train-rmse:0.15075
[767]	eval-rmse:0.32767	train-rmse:0.15072
[768]	eval-rmse:0.32767	train-rmse:0.15071
[769]	eval-rmse:0.32767	train-rmse:0.15066
[770]	eval-rmse:0.32767	train-rmse:0.15060
[771]	eval-rmse:0.32767	train-rmse:0.15059
[772]	eval-rmse:0.32767	train-rmse:0.15057
[773]	eval-rmse:0.32767	train-rmse:0.15046
[774]	eval-rmse:0.32767	train-rmse:0.15037
[775]	eval-rmse:0.32766	train-rmse:0.15031
[776]	eval-

[944]	eval-rmse:0.32745	train-rmse:0.13846
[945]	eval-rmse:0.32745	train-rmse:0.13836
[946]	eval-rmse:0.32744	train-rmse:0.13824
[947]	eval-rmse:0.32744	train-rmse:0.13815
[948]	eval-rmse:0.32744	train-rmse:0.13800
[949]	eval-rmse:0.32744	train-rmse:0.13795
[950]	eval-rmse:0.32744	train-rmse:0.13792
[951]	eval-rmse:0.32743	train-rmse:0.13788
[952]	eval-rmse:0.32743	train-rmse:0.13783
[953]	eval-rmse:0.32743	train-rmse:0.13780
[954]	eval-rmse:0.32743	train-rmse:0.13778
[955]	eval-rmse:0.32743	train-rmse:0.13771
[956]	eval-rmse:0.32743	train-rmse:0.13765
[957]	eval-rmse:0.32743	train-rmse:0.13758
[958]	eval-rmse:0.32743	train-rmse:0.13750
[959]	eval-rmse:0.32743	train-rmse:0.13744
[960]	eval-rmse:0.32742	train-rmse:0.13726
[961]	eval-rmse:0.32742	train-rmse:0.13712
[962]	eval-rmse:0.32742	train-rmse:0.13708
[963]	eval-rmse:0.32742	train-rmse:0.13702
[964]	eval-rmse:0.32742	train-rmse:0.13692
[965]	eval-rmse:0.32742	train-rmse:0.13688
[966]	eval-rmse:0.32742	train-rmse:0.13686
[967]	eval-

[1132]	eval-rmse:0.32731	train-rmse:0.12734
[1133]	eval-rmse:0.32731	train-rmse:0.12728
[1134]	eval-rmse:0.32732	train-rmse:0.12723
[1135]	eval-rmse:0.32731	train-rmse:0.12713
[1136]	eval-rmse:0.32731	train-rmse:0.12710
[1137]	eval-rmse:0.32732	train-rmse:0.12705
[1138]	eval-rmse:0.32732	train-rmse:0.12700
[1139]	eval-rmse:0.32732	train-rmse:0.12693
[1140]	eval-rmse:0.32732	train-rmse:0.12686
[1141]	eval-rmse:0.32732	train-rmse:0.12678
[1142]	eval-rmse:0.32731	train-rmse:0.12666
[1143]	eval-rmse:0.32732	train-rmse:0.12660
[1144]	eval-rmse:0.32732	train-rmse:0.12654
[1145]	eval-rmse:0.32732	train-rmse:0.12647
[1146]	eval-rmse:0.32732	train-rmse:0.12646
[1147]	eval-rmse:0.32732	train-rmse:0.12642
[1148]	eval-rmse:0.32731	train-rmse:0.12635
[1149]	eval-rmse:0.32731	train-rmse:0.12629
[1150]	eval-rmse:0.32732	train-rmse:0.12625
[1151]	eval-rmse:0.32731	train-rmse:0.12622
[1152]	eval-rmse:0.32731	train-rmse:0.12618
[1153]	eval-rmse:0.32731	train-rmse:0.12602
[1154]	eval-rmse:0.32730	train-r

[1319]	eval-rmse:0.32723	train-rmse:0.11760
[1320]	eval-rmse:0.32722	train-rmse:0.11750
[1321]	eval-rmse:0.32722	train-rmse:0.11744
[1322]	eval-rmse:0.32722	train-rmse:0.11735
[1323]	eval-rmse:0.32722	train-rmse:0.11728
[1324]	eval-rmse:0.32722	train-rmse:0.11723
[1325]	eval-rmse:0.32722	train-rmse:0.11717
[1326]	eval-rmse:0.32722	train-rmse:0.11708
[1327]	eval-rmse:0.32722	train-rmse:0.11705
[1328]	eval-rmse:0.32722	train-rmse:0.11702
[1329]	eval-rmse:0.32722	train-rmse:0.11699
[1330]	eval-rmse:0.32722	train-rmse:0.11695
[1331]	eval-rmse:0.32722	train-rmse:0.11693
[1332]	eval-rmse:0.32722	train-rmse:0.11689
[1333]	eval-rmse:0.32722	train-rmse:0.11687
[1334]	eval-rmse:0.32722	train-rmse:0.11682
[1335]	eval-rmse:0.32722	train-rmse:0.11678
[1336]	eval-rmse:0.32722	train-rmse:0.11674
[1337]	eval-rmse:0.32722	train-rmse:0.11668
[1338]	eval-rmse:0.32722	train-rmse:0.11664
[1339]	eval-rmse:0.32722	train-rmse:0.11662
[1340]	eval-rmse:0.32722	train-rmse:0.11656
[1341]	eval-rmse:0.32722	train-r

[1506]	eval-rmse:0.32718	train-rmse:0.10873
[1507]	eval-rmse:0.32718	train-rmse:0.10869
[1508]	eval-rmse:0.32718	train-rmse:0.10863
[1509]	eval-rmse:0.32718	train-rmse:0.10862
[1510]	eval-rmse:0.32718	train-rmse:0.10859
[1511]	eval-rmse:0.32718	train-rmse:0.10855
[1512]	eval-rmse:0.32718	train-rmse:0.10853
[1513]	eval-rmse:0.32718	train-rmse:0.10849
[1514]	eval-rmse:0.32718	train-rmse:0.10846
[1515]	eval-rmse:0.32718	train-rmse:0.10844
[1516]	eval-rmse:0.32718	train-rmse:0.10844
[1517]	eval-rmse:0.32718	train-rmse:0.10838
[1518]	eval-rmse:0.32718	train-rmse:0.10833
[1519]	eval-rmse:0.32718	train-rmse:0.10829
[1520]	eval-rmse:0.32718	train-rmse:0.10826
[1521]	eval-rmse:0.32718	train-rmse:0.10823
[1522]	eval-rmse:0.32718	train-rmse:0.10821
[1523]	eval-rmse:0.32718	train-rmse:0.10818
[1524]	eval-rmse:0.32718	train-rmse:0.10812
[1525]	eval-rmse:0.32718	train-rmse:0.10808
[1526]	eval-rmse:0.32718	train-rmse:0.10807
[1527]	eval-rmse:0.32717	train-rmse:0.10804
[1528]	eval-rmse:0.32717	train-r

[1693]	eval-rmse:0.32716	train-rmse:0.10124
[1694]	eval-rmse:0.32716	train-rmse:0.10121
[1695]	eval-rmse:0.32716	train-rmse:0.10115
[1696]	eval-rmse:0.32716	train-rmse:0.10110
[1697]	eval-rmse:0.32716	train-rmse:0.10105
[1698]	eval-rmse:0.32716	train-rmse:0.10102
[1699]	eval-rmse:0.32716	train-rmse:0.10098
[1700]	eval-rmse:0.32716	train-rmse:0.10095
[1701]	eval-rmse:0.32716	train-rmse:0.10093
[1702]	eval-rmse:0.32716	train-rmse:0.10088
[1703]	eval-rmse:0.32716	train-rmse:0.10085
[1704]	eval-rmse:0.32716	train-rmse:0.10081
[1705]	eval-rmse:0.32716	train-rmse:0.10077
[1706]	eval-rmse:0.32716	train-rmse:0.10070
[1707]	eval-rmse:0.32716	train-rmse:0.10066
[1708]	eval-rmse:0.32716	train-rmse:0.10063
[1709]	eval-rmse:0.32716	train-rmse:0.10061
[1710]	eval-rmse:0.32716	train-rmse:0.10057
[1711]	eval-rmse:0.32716	train-rmse:0.10054
[1712]	eval-rmse:0.32716	train-rmse:0.10051
[1713]	eval-rmse:0.32716	train-rmse:0.10043
[1714]	eval-rmse:0.32716	train-rmse:0.10036
[1715]	eval-rmse:0.32716	train-r

[1880]	eval-rmse:0.32715	train-rmse:0.09455
[1881]	eval-rmse:0.32715	train-rmse:0.09452
[1882]	eval-rmse:0.32715	train-rmse:0.09449
[1883]	eval-rmse:0.32715	train-rmse:0.09446
[1884]	eval-rmse:0.32715	train-rmse:0.09441
[1885]	eval-rmse:0.32715	train-rmse:0.09435
[1886]	eval-rmse:0.32714	train-rmse:0.09431
[1887]	eval-rmse:0.32714	train-rmse:0.09428
[1888]	eval-rmse:0.32714	train-rmse:0.09425
[1889]	eval-rmse:0.32714	train-rmse:0.09423
[1890]	eval-rmse:0.32714	train-rmse:0.09422
[1891]	eval-rmse:0.32714	train-rmse:0.09420
[1892]	eval-rmse:0.32714	train-rmse:0.09414
[1893]	eval-rmse:0.32714	train-rmse:0.09406
[1894]	eval-rmse:0.32714	train-rmse:0.09403
[1895]	eval-rmse:0.32714	train-rmse:0.09402
[1896]	eval-rmse:0.32714	train-rmse:0.09400
[1897]	eval-rmse:0.32714	train-rmse:0.09395
[1898]	eval-rmse:0.32714	train-rmse:0.09390
[1899]	eval-rmse:0.32714	train-rmse:0.09387
[1900]	eval-rmse:0.32714	train-rmse:0.09386
[1901]	eval-rmse:0.32714	train-rmse:0.09383
[1902]	eval-rmse:0.32714	train-r

In [142]:
#Test predictions
pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [143]:

#Use mean absolute error to get a basic estimate of the error
mae = (abs(pred - y_test)).mean()
mae

4.8215870279440365

In [144]:

#Take a look at feature importance
feature_scores = gbm.get_fscore()
feature_scores

{'latitude_difference': 503200,
 'trip_distance': 454740,
 'pickup_latitude': 755256,
 'pickup_weekday': 217697,
 'dropoff_longitude': 674020,
 'pickup_month': 241243,
 'passenger_count': 245093,
 'pickup_minute': 414737,
 'store_and_fwd_flag': 5590,
 'longitude_difference': 450291,
 'pickup_hour': 350519,
 'pickup_longitude': 827080,
 'pickup_day': 394199,
 'dropoff_latitude': 631465}

In [145]:
#This is not very telling, so let's scale the features
summ = 0
for key in feature_scores:
    summ = summ + feature_scores[key]

for key in feature_scores:
    feature_scores[key] = feature_scores[key] / summ

feature_scores

{'latitude_difference': 0.08162033890607336,
 'trip_distance': 0.0737600018166689,
 'pickup_latitude': 0.12250447273617912,
 'pickup_weekday': 0.03531101533949811,
 'dropoff_longitude': 0.10932778384235206,
 'pickup_month': 0.03913023731859669,
 'passenger_count': 0.03975471725657042,
 'pickup_minute': 0.0672714119572499,
 'store_and_fwd_flag': 0.0009067124294216018,
 'longitude_difference': 0.07303836253250134,
 'pickup_hour': 0.05685508659184802,
 'pickup_longitude': 0.134154510934887,
 'pickup_day': 0.06394009534267728,
 'dropoff_latitude': 0.10242525299547617}

In [146]:
filename = "xgb_model.sav"
pickle.dump(gbm, open(filename, 'wb'))

In [147]:
loaded_model = pickle.load(open(filename, 'rb'))



In [148]:
loaded_model

<xgboost.core.Booster at 0x23d87b70fd0>

# Genetic Algorithm

In [149]:
#Sample date
import datetime
date_list = [4, 6, 2016] #April 6, 2016

year = int(date_list[2])
month = int(date_list[1])
day = int(date_list[0])

my_date = datetime.date(year, month, day)

In [150]:
#Sample test locations

test_locations = {'L1': (40.819688, -73.915091),
                  'L2': (40.815421, -73.941761),
                  'L3': (40.764198, -73.910785),
                  'L4': (40.768790, -73.953285),
                  'L5': (40.734851, -73.952950),
                  'L6': (40.743613, -73.977998),
                  'L7': (40.745313, -73.993793),
                  'L8': (40.662713, -73.946101),
                  'L9': (40.703761, -73.886496),
                  'L10': (40.713620, -73.943076),
                  'L11': (40.725212, -73.809179)
             }

In [153]:

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="test/2")
addresses = []

for key in test_locations:
    location = geolocator.reverse(test_locations[key])
    addresses.append(location.address)

In [154]:

addresses

['424, East 155th Street, The Bronx, Bronx County, New York, 10455, United States of America',
 '137, West 136th Street, Harlem, Manhattan Community Board 10, Manhattan, New York County, New York, 10030, United States of America',
 '43-11, 28th Avenue, Queens, Queens County, New York, 11103, United States of America',
 '435, East 74th Street, Upper East Side, Manhattan Community Board 8, Manhattan, New York County, New York, 10021, United States of America',
 '211, Freeman Street, Brooklyn, Kings County, New York, 11222, United States of America',
 '232, East 32nd Street, Manhattan Community Board 6, Manhattan, New York County, New York, 10016, United States of America',
 '159, West 25th Street, Flatiron District, Manhattan Community Board 5, Manhattan, New York County, New York, 10001, United States of America',
 '458, Lefferts Avenue, Crown Heights, Brooklyn, Kings County, New York, 11225, United States of America',
 '70-38, 67th Place, Queens, Queens County, New York, 11385, United 

In [155]:
test_addresses = {'L1': '424 East 155th Street NY',
                  'L2': '137 West 136th Street NY',
                  'L3': '43-11 28th Avenue NY',
                  'L4': '435 East 74th Street NY',
                  'L5': '211 Freeman Street NY',
                  'L6': '232 East 32nd Street NY',
                  'L7': '159 West 25th Street NY',
                  'L8': '486 Brooklyn Avenue NY',
                  'L9': '70-38 67th Place NY',
                  'L10': '194 Devoe Street NY',
                  'L11': '158-46 76th Avenue NY'
             }

In [156]:

def create_guess(points):
    """
    Creates a possible path between all points, returning to the original.
    Input: List of point IDs
    """
    guess = points
    np.random.shuffle(guess)
    guess.append(guess[0])
    return list(guess)

create_guess(list(test_locations.keys()))

['L11', 'L10', 'L5', 'L3', 'L1', 'L9', 'L6', 'L4', 'L7', 'L2', 'L8', 'L11']

In [157]:
def create_generation(points, population=100):
    """
    Makes a list of guessed point orders given a list of point IDs.
    Input:
    points: list of point ids
    population: how many guesses to make
    """
    generation = [create_guess(points) for _ in range(population)]
    return generation

test_generation = create_generation(list(test_locations.keys()), population=10)
print(test_generation)

[['L9', 'L5', 'L3', 'L11', 'L6', 'L2', 'L8', 'L7', 'L4', 'L10', 'L1', 'L9'], ['L2', 'L5', 'L7', 'L4', 'L3', 'L9', 'L6', 'L1', 'L8', 'L10', 'L9', 'L11', 'L2'], ['L6', 'L9', 'L2', 'L7', 'L2', 'L5', 'L4', 'L9', 'L1', 'L8', 'L3', 'L11', 'L10', 'L6'], ['L1', 'L6', 'L8', 'L6', 'L9', 'L5', 'L9', 'L10', 'L11', 'L7', 'L4', 'L2', 'L2', 'L3', 'L1'], ['L1', 'L9', 'L2', 'L9', 'L1', 'L5', 'L3', 'L6', 'L6', 'L2', 'L10', 'L8', 'L4', 'L11', 'L7', 'L1'], ['L2', 'L10', 'L1', 'L7', 'L4', 'L6', 'L5', 'L11', 'L6', 'L9', 'L2', 'L1', 'L3', 'L1', 'L8', 'L9', 'L2'], ['L2', 'L2', 'L4', 'L5', 'L8', 'L6', 'L10', 'L11', 'L1', 'L7', 'L6', 'L1', 'L9', 'L3', 'L9', 'L1', 'L2', 'L2'], ['L4', 'L9', 'L2', 'L1', 'L2', 'L2', 'L1', 'L5', 'L2', 'L10', 'L11', 'L6', 'L3', 'L8', 'L6', 'L1', 'L7', 'L9', 'L4'], ['L2', 'L2', 'L9', 'L6', 'L4', 'L6', 'L8', 'L1', 'L10', 'L9', 'L7', 'L2', 'L3', 'L5', 'L4', 'L11', 'L2', 'L1', 'L1', 'L2'], ['L1', 'L10', 'L2', 'L3', 'L9', 'L4', 'L5', 'L2', 'L1', 'L1', 'L6', 'L6', 'L4', 'L7', 'L8', 'L11', 

In [158]:
def travel_time_between_points(point1_id, point2_id, hour, date, passenger_count = 1, 
                               store_and_fwd_flag = 0, pickup_minute = 0):
    """
    Given two points, this calculates travel between them based on a XGBoost predictive model
    """
    
    model_data = {'passenger_count': passenger_count,
                  'pickup_longitude' : point1_id[1],
                  'pickup_latitude' : point1_id[0],
                  'dropoff_longitude' : point2_id[1],
                  'dropoff_latitude' : point2_id[0],
                  'store_and_fwd_flag' : store_and_fwd_flag,
                  'pickup_month' : my_date.month,
                  'pickup_day' : my_date.day,
                  'pickup_weekday' : my_date.weekday(),
                  'pickup_hour': hour,
                  'pickup_minute' : pickup_minute,
                  'latitude_difference' : point2_id[0] - point1_id[0],
                  'longitude_difference' : point2_id[1] - point1_id[1],
                  'trip_distance' : 0.621371 * 6371 * (abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(point2_id[0] - point1_id[0]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(point2_id[0] - point1_id[0]) * np.pi / 180) / 2)))))) + \
                                     abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(point2_id[1] - point1_id[1]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(point2_id[1] - point1_id[1]) * np.pi / 180) / 2)))))))
                 }

    df = pd.DataFrame([model_data], columns=model_data.keys())
    
    pred = np.exp(loaded_model.predict(xgb.DMatrix(df))) - 1
    
    return pred[0]

In [159]:
coordinates = test_locations

In [160]:

def fitness_score(guess):
    """
    Loops through the points in the guesses order and calculates
    how much distance the path would take to complete a loop.
    Lower is better.
    """
    score = 0
    for ix, point_id in enumerate(guess[:-1]):
        score += travel_time_between_points(coordinates[point_id], coordinates[guess[ix+1]], 11, my_date)
    return score

def check_fitness(guesses):
    """
    Goes through every guess and calculates the fitness score. 
    Returns a list of tuples: (guess, fitness_score)
    """
    fitness_indicator = []
    for guess in guesses:
        fitness_indicator.append((guess, fitness_score(guess)))
    return fitness_indicator

print(check_fitness(test_generation))

[(['L9', 'L5', 'L3', 'L11', 'L6', 'L2', 'L8', 'L7', 'L4', 'L10', 'L1', 'L9'], 294.28173065185547), (['L2', 'L5', 'L7', 'L4', 'L3', 'L9', 'L6', 'L1', 'L8', 'L10', 'L9', 'L11', 'L2'], 293.47003173828125), (['L6', 'L9', 'L2', 'L7', 'L2', 'L5', 'L4', 'L9', 'L1', 'L8', 'L3', 'L11', 'L10', 'L6'], 358.0016927719116), (['L1', 'L6', 'L8', 'L6', 'L9', 'L5', 'L9', 'L10', 'L11', 'L7', 'L4', 'L2', 'L2', 'L3', 'L1'], 320.40600299835205), (['L1', 'L9', 'L2', 'L9', 'L1', 'L5', 'L3', 'L6', 'L6', 'L2', 'L10', 'L8', 'L4', 'L11', 'L7', 'L1'], 412.35965633392334), (['L2', 'L10', 'L1', 'L7', 'L4', 'L6', 'L5', 'L11', 'L6', 'L9', 'L2', 'L1', 'L3', 'L1', 'L8', 'L9', 'L2'], 392.54063606262207), (['L2', 'L2', 'L4', 'L5', 'L8', 'L6', 'L10', 'L11', 'L1', 'L7', 'L6', 'L1', 'L9', 'L3', 'L9', 'L1', 'L2', 'L2'], 355.9325952529907), (['L4', 'L9', 'L2', 'L1', 'L2', 'L2', 'L1', 'L5', 'L2', 'L10', 'L11', 'L6', 'L3', 'L8', 'L6', 'L1', 'L7', 'L9', 'L4'], 430.73903369903564), (['L2', 'L2', 'L9', 'L6', 'L4', 'L6', 'L8', 'L1',

In [161]:
def get_breeders_from_generation(guesses, take_best_N=10, take_random_N=5, verbose=False, mutation_rate=0.1):
    """
    This sets up the breeding group for the next generation. You have
    to be very careful how many breeders you take, otherwise your
    population can explode. These two, plus the "number of children per couple"
    in the make_children function must be tuned to avoid exponential growth or decline!
    """
    # First, get the top guesses from last time
    fit_scores = check_fitness(guesses)
    sorted_guesses = sorted(fit_scores, key=lambda x: x[1]) # sorts so lowest is first, which we want
    new_generation = [x[0] for x in sorted_guesses[:take_best_N]]
    best_guess = new_generation[0]
    
    if verbose:
        # If we want to see what the best current guess is!
        print(best_guess)
    
    # Second, get some random ones for genetic diversity
    for _ in range(take_random_N):
        ix = np.random.randint(len(guesses))
        new_generation.append(guesses[ix])
        
    # No mutations here since the order really matters.
    # If we wanted to, we could add a "swapping" mutation,
    # but in practice it doesn't seem to be necessary
    
    np.random.shuffle(new_generation)
    return new_generation, best_guess

def make_child(parent1, parent2):
    """ 
    Take some values from parent 1 and hold them in place, then merge in values
    from parent2, filling in from left to right with cities that aren't already in 
    the child. 
    """
    list_of_ids_for_parent1 = list(np.random.choice(parent1, replace=False, size=len(parent1)//2))
    child = [-99 for _ in parent1]
    
    for ix in range(0, len(list_of_ids_for_parent1)):
        child[ix] = parent1[ix]
    for ix, gene in enumerate(child):
        if gene == -99:
            for gene2 in parent2:
                if gene2 not in child:
                    child[ix] = gene2
                    break
    child[-1] = child[0]
    return child

def make_children(old_generation, children_per_couple=1):
    """
    Pairs parents together, and makes children for each pair. 
    If there are an odd number of parent possibilities, one 
    will be left out. 
    
    Pairing happens by pairing the first and last entries. 
    Then the second and second from last, and so on.
    """
    mid_point = len(old_generation)//2
    next_generation = [] 
    
    for ix, parent in enumerate(old_generation[:mid_point]):
        for _ in range(children_per_couple):
            next_generation.append(make_child(parent, old_generation[-ix-1]))
    return next_generation

In [166]:
current_generation = create_generation(list(test_locations.keys()),population=500)
print_every_n_generations = 5

for i in range(100):
    if not i % print_every_n_generations:
        print("Generation %i: "%i, end='')
        print(len(current_generation))
        is_verbose = True
    else:
        is_verbose = False
    breeders, best_guess = get_breeders_from_generation(current_generation, 
                                                        take_best_N=250, take_random_N=100, 
                                                        verbose=is_verbose)
    current_generation = make_children(breeders, children_per_couple=3)

Generation 0: 100
['L3', 'L10', 'L8', 'L1', 'L5', 'L2', 'L6', 'L11', 'L9', 'L7', 'L4', 'L3']


KeyError: -99

In [163]:
def evolve_to_solve(current_generation, max_generations, take_best_N, take_random_N,
                    mutation_rate, children_per_couple, print_every_n_generations, verbose=False):
    """
    Takes in a generation of guesses then evolves them over time using our breeding rules.
    Continue this for "max_generations" times.
    Inputs:
    current_generation: The first generation of guesses
    max_generations: how many generations to complete
    take_best_N: how many of the top performers get selected to breed
    take_random_N: how many random guesses get brought in to keep genetic diversity
    mutation_rate: How often to mutate (currently unused)
    children_per_couple: how many children per breeding pair
    print_every_n_geneartions: how often to print in verbose mode
    verbose: Show printouts of progress
    Returns:
    fitness_tracking: a list of the fitness score at each generations
    best_guess: the best_guess at the end of evolution
    """
    fitness_tracking = []
    for i in range(max_generations):
        if verbose and not i % print_every_n_generations and i > 0:
            print("Generation %i: "%i, end='')
            print(len(current_generation))
            print("Current Best Score: ", fitness_tracking[-1])
            is_verbose = True
        else:
            is_verbose = False
        breeders, best_guess = get_breeders_from_generation(current_generation, 
                                                            take_best_N=take_best_N, take_random_N=take_random_N, 
                                                            verbose=is_verbose, mutation_rate=mutation_rate)
        fitness_tracking.append(fitness_score(best_guess))
        current_generation = make_children(breeders, children_per_couple=children_per_couple)
    
    return fitness_tracking, best_guess

current_generation = create_generation(list(test_locations.keys()),population=500)
fitness_tracking, best_guess = evolve_to_solve(current_generation, 100, 150, 70, 0.5, 3, 5, verbose=True)

KeyError: -99