In [15]:
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import xgboost as xgb
import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv('filtered.csv')

In [17]:
df.head()

Unnamed: 0,trip_duration,trip_distance,Temp.,Humidity,Pressure,Wind Speed,Conditions_0,Conditions_3,Conditions_4,day_0,...,month_2,month_3,month_4,month_5,Wind Dir_0,Wind Dir_1,Wind Dir_2,Wind Dir_3,Wind Dir_4,Wind Dir_6
0,7.383333,-0.479681,0.414781,-0.579947,-0.232014,16.7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,18.566667,-0.32543,-1.31432,-0.872333,2.270896,7.4,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,12.183333,-0.28542,1.214812,-0.287562,1.176659,16.7,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,8.1,-0.154195,0.414781,-0.346039,-0.357788,16.7,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,7.05,-0.399343,0.414781,-0.346039,-1.213054,7.4,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [18]:
selected_features = ['trip_distance', 'Temp.', 'Humidity','Pressure', 'Wind Speed']

In [19]:
x = df[selected_features]
y = df['trip_duration']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)

In [21]:
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean()

In [22]:
params = {
    'booster':            'gbtree',
    'objective':          'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'silent':             1,
    'feval':              'rmsle'
}
nrounds = 100

In [23]:
dtrain = xgb.DMatrix(X_train, np.log(y_train+1))
dval = xgb.DMatrix(X_val, np.log(y_val+1))

#this is for tracking the error
watchlist = [(dval, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                evals = watchlist,
                verbose_eval = True
                )

Parameters: { "feval", "silent" } are not used.

[0]	eval-rmse:1.96308	train-rmse:1.96524
[1]	eval-rmse:1.86969	train-rmse:1.87188
[2]	eval-rmse:1.78127	train-rmse:1.78350
[3]	eval-rmse:1.69747	train-rmse:1.69978
[4]	eval-rmse:1.62350	train-rmse:1.62590
[5]	eval-rmse:1.55361	train-rmse:1.55611
[6]	eval-rmse:1.48190	train-rmse:1.48444
[7]	eval-rmse:1.41408	train-rmse:1.41655
[8]	eval-rmse:1.35499	train-rmse:1.35757
[9]	eval-rmse:1.29926	train-rmse:1.30195
[10]	eval-rmse:1.24693	train-rmse:1.24973
[11]	eval-rmse:1.19763	train-rmse:1.20054
[12]	eval-rmse:1.15142	train-rmse:1.15441
[13]	eval-rmse:1.10167	train-rmse:1.10441
[14]	eval-rmse:1.05477	train-rmse:1.05724
[15]	eval-rmse:1.01600	train-rmse:1.01855
[16]	eval-rmse:0.97971	train-rmse:0.98235
[17]	eval-rmse:0.93977	train-rmse:0.94208
[18]	eval-rmse:0.90219	train-rmse:0.90432
[19]	eval-rmse:0.87214	train-rmse:0.87432
[20]	eval-rmse:0.83864	train-rmse:0.84043
[21]	eval-rmse:0.80722	train-rmse:0.80849
[22]	eval-rmse:0.77780	train-rmse:0.7

In [24]:
pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [25]:
mae = (abs(pred - y_test)).mean()
mae

5.506066296132972

In [46]:
params = {
    'booster':            'gbtree',
    'fobj':               'reg:linear',
    'learning_rate':      0.001,
    'max_depth':          14,
    'num_leaves':        2^14 + 1,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'verbose_eval' :      1,
    'feval':              'rmsle'
}
nrounds = 800

In [47]:
dtrain = lgbm.Dataset(X_train, np.log(y_train+1))
dval = lgbm.Dataset(X_val, np.log(y_val+1), reference=dtrain)

#this is for tracking the error
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [48]:
bst = lgbm.train(params,
                dtrain,
                num_boost_round = nrounds,
                valid_sets = [dtrain, dval],
                valid_names = ['train', 'valid'],
                categorical_feature = [20, 24]
                )



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 215993, number of used features: 5
[LightGBM] [Info] Start training from score 2.447457


In [49]:
pred = np.exp(bst.predict(X_test)) - 1

In [50]:
mae = (abs(pred - y_test)).mean()
mae

6.992069983145163