In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import lightgbm as lgb
from TaxiDataFormatter.dataformatter_v2 import formatter
from sklearn.model_selection import train_test_split

# Create a 16x9 inch figure at 80 dpi with a white face and a black edge.
# NOTE(review): `fig` is not referenced by any other visible cell - confirm it is still needed.
fig=plt.figure(figsize=(16, 9), dpi= 80, facecolor='w', edgecolor='k')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Shell escape: list the files in the current working directory.
!ls

sample_submission.csv  TaxiDataFormatter  Taxi_v3.ipynb
sample_submission.zip  Taxi_v1.ipynb	  test.csv
submission_lgbm.csv    Taxi_v2.ipynb	  train.csv


# Load Data and Analyze

In [3]:
# Read the raw comma-separated train and test files into DataFrames.
train, test = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('train.csv', 'test.csv')
)

In [4]:
# Show the (rows, columns) shape of the raw training frame.
train.shape

(1458644, 11)

In [5]:
# Display the trailing rows of the raw training frame.
train.tail()


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
1458639,id2376096,2,2016-04-08 13:31:04,2016-04-08 13:44:02,4,-73.982201,40.745522,-73.994911,40.74017,N,778
1458640,id1049543,1,2016-01-10 07:35:15,2016-01-10 07:46:10,1,-74.000946,40.747379,-73.970184,40.796547,N,655
1458641,id2304944,2,2016-04-22 06:57:41,2016-04-22 07:10:25,1,-73.959129,40.768799,-74.004433,40.707371,N,764
1458642,id2714485,1,2016-01-05 15:56:26,2016-01-05 16:02:39,1,-73.982079,40.749062,-73.974632,40.757107,N,373
1458643,id1209952,1,2016-04-05 14:44:25,2016-04-05 14:47:43,1,-73.979538,40.78175,-73.972809,40.790585,N,198


In [6]:
# Keep the target as an (n, 1) column and move it to log space: Y = log(trip_duration + 1).
y = train.loc[:, ['trip_duration']].values
Y = np.log(y + 1)

# Remove the target and the drop-off timestamp from the feature frame.
# NOTE: this rebinds `train`, so the cell cannot be re-run without reloading the CSV first.
train = train.drop(labels=['trip_duration', 'dropoff_datetime'], axis='columns')
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id2875421,2,2016-03-14 17:24:55,1,-73.982155,40.767937,-73.96463,40.765602,N
1,id2377394,1,2016-06-12 00:43:35,1,-73.980415,40.738564,-73.999481,40.731152,N
2,id3858529,2,2016-01-19 11:35:24,1,-73.979027,40.763939,-74.005333,40.710087,N
3,id3504673,2,2016-04-06 19:32:31,1,-74.01004,40.719971,-74.012268,40.706718,N
4,id2181028,2,2016-03-26 13:30:55,1,-73.973053,40.793209,-73.972923,40.78252,N


In [7]:
# Build the training feature frame with the project formatter and time the call.
t1 = time.time()
mod_df1 = formatter(train).drop(['id'], axis=1)  # the row id is not used as a feature
t2 = time.time()
print("Time Taken: ",t2-t1)

Time Taken:  48.188042402267456


In [8]:
# Preview the formatted training features.
mod_df1.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dist,flag_n,vendor_id,p_hour,p_min,is_weekend
0,1,-73.982155,40.767937,0.930489,1,1,0.708333,0.4,0
1,1,-73.980415,40.738564,1.121109,1,0,0.0,0.716667,1
2,1,-73.979027,40.763939,3.964754,1,1,0.458333,0.583333,0
3,1,-74.01004,40.719971,0.922403,1,1,0.791667,0.533333,0
4,1,-73.973053,40.793209,0.73804,1,1,0.541667,0.5,1


In [9]:
# Build the test feature frame with the same formatter used for the training data
# and time the call.
t1 = time.time()
mod_df2 = formatter(test)
mod_df2 = mod_df2.drop(['id'],axis=1)  # the row id is not used as a feature
t2 = time.time()
# The former mid-cell `mod_df2.head()` was removed: only a cell's last expression is
# displayed, so it showed nothing and merely ran inside the timed region.
print("Time Taken: ",t2-t1)

Time Taken:  20.110716342926025


# Scale the features and build the model inputs

In [10]:
#find the normalization params: mean and population std (ddof=0) of each
#continuous feature, computed on the training frame only
plong_mu = mod_df1['pickup_longitude'].mean()
plong_std = mod_df1['pickup_longitude'].std(ddof=0)

plat_mu = mod_df1['pickup_latitude'].mean()
plat_std = mod_df1['pickup_latitude'].std(ddof=0)

dist_mu = mod_df1['dist'].mean()
dist_std = mod_df1['dist'].std(ddof=0)

In [11]:
#normalize train and test set with the statistics taken from the training frame
#NOTE: the columns are overwritten in place, so running this cell twice rescales
#already-scaled values.
for col_name, col_mu, col_std in (
    ('pickup_longitude', plong_mu, plong_std),
    ('pickup_latitude', plat_mu, plat_std),
    ('dist', dist_mu, dist_std),
):
    mod_df1[col_name] = (mod_df1[col_name] - col_mu)/col_std
    mod_df2[col_name] = (mod_df2[col_name] - col_mu)/col_std

In [12]:
# Preview the training features after standardization.
mod_df1.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dist,flag_n,vendor_id,p_hour,p_min,is_weekend
0,1,-0.122261,0.517494,-0.452072,1,1,0.708333,0.4,0
1,1,-0.097727,-0.375819,-0.380622,1,0,0.0,0.716667,1
2,1,-0.078143,0.39591,0.685258,1,1,0.458333,0.583333,0
3,1,-0.515558,-0.941274,-0.455103,1,1,0.791667,0.533333,0
4,1,0.006112,1.286091,-0.524207,1,1,0.541667,0.5,1


In [13]:
# NOTE(review): `feats` is not used by any visible cell; the model is trained on every
# column of mod_df1. Confirm whether a feature subset was intended.
feats = ['dist','p_hour', 'p_min', 'pickup_longitude', 'pickup_latitude']
X = mod_df1.values

# Select the test columns by name in the training column order, so that the positional
# features of X_test always line up with X (a missing column raises a KeyError instead
# of silently shifting features).
X_test = mod_df2[mod_df1.columns].values

In [14]:
# Sanity check: report the shapes of the train and test feature matrices.
print("Shape of Training Data: ",X.shape)
print("Shape of Test Data: ", X_test.shape)

Shape of Training Data:  (1458644, 9)
Shape of Test Data:  (625134, 9)


# Build the model

In [15]:
#define loss function, prediction is in form of y_hat = log(p+1)
def rmsle(y_true, y_pred):
    """Root mean squared error between two arrays of log-space values.

    Both arguments are expected to already be in log(x + 1) form, so the plain
    RMSE computed here equals the RMSLE of the underlying values.

    Parameters
    ----------
    y_true, y_pred : array-like
        Values of identical shape. Lists, tuples, pandas Series and numpy arrays
        are accepted; they are compared element by element in positional order.

    Returns
    -------
    numpy.float64
        sqrt(mean((y_true - y_pred) ** 2)).

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.
    """
    # Convert up front: plain lists have no `.shape`, and pandas Series would be
    # subtracted by index label rather than by position.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)

    # Guard against silent broadcasting, e.g. (n, 1) against (n,).
    assert y_true.shape == y_pred.shape, (
        "shape mismatch: %s vs %s" % (y_true.shape, y_pred.shape)
    )
    return np.sqrt(np.mean((y_true - y_pred)**2))

In [22]:
#set the hyper-parameters

print('Training and making predictions')
params = {
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'num_leaves': 50,
    'learning_rate': 0.5,
    'min_data_in_leaf': 20,
    # NOTE(review): per the LightGBM parameter docs, row subsampling is only applied
    # when bagging_freq (alias subsample_freq) is non-zero - confirm the intent.
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_lambda': 0.0,
    'reg_alpha': 0.0,
    'objective': 'regression',
    # The label is already log(trip_duration + 1), so RMSE on the label is the RMSLE
    # of the durations. 'rmsle' is not a LightGBM metric name.
    'metric': 'rmse',
    'n_jobs': -1,
    'verbose': 0, 
    }

n_estimators = 1000   # boosting rounds per model
n_iters = 5           # number of train/validation resplits (one model each)
preds_buf = []        # test-set predictions, one array per model
err_buf = []          # validation error, one value per model

Training and making predictions


In [23]:
# Flatten the target to a 1-D vector before it is split and used as the label.
Y = Y.ravel()
print(Y.shape)

(1458644,)


In [24]:
# Train one LightGBM model per random 95/5 train/validation split and collect the
# validation error and the test-set predictions of each model.
t1 = time.time()

# Start from empty buffers so that re-running this cell does not stack the results
# of a previous run on top of the new ones.
preds_buf = []
err_buf = []

for i in range(n_iters): 
    # The loop index doubles as the split seed, so every model sees a different split.
    x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.05, random_state=i)
    d_train = lgb.Dataset(x_train, label=y_train)
    d_valid = lgb.Dataset(x_valid, label=y_valid)
    watchlist = [d_valid]

    # NOTE(review): `verbose_eval` is accepted by older LightGBM releases only -
    # confirm against the installed version.
    model = lgb.train(
        params,
        d_train,
        num_boost_round=n_estimators,
        valid_sets=watchlist,
        verbose_eval=1,
    )

    # Errors are computed in log space, i.e. directly on the log(duration + 1) labels.
    val_preds = model.predict(x_valid)
    err = rmsle(y_valid, val_preds)
    err_buf.append(err)
    
    train_preds = model.predict(x_train)
    train_err = rmsle(y_train, train_preds)
    print("Validation RMSLE = ",err,"\tTraining RMSLE = ",train_err)
    print()
    
    # Map the test predictions back from log(duration + 1) to durations.
    test_preds = model.predict(X_test)
    test_preds = np.exp(test_preds)-1
    preds_buf.append(test_preds)


t2 = time.time()
print("Time Taken: ",t2-t1)

Validation RMSLE =  0.45435568936581894 	Training RMSLE =  0.3943337512102654

Validation RMSLE =  0.45575012897092343 	Training RMSLE =  0.393889195140695

Validation RMSLE =  0.4515911943888604 	Training RMSLE =  0.39426871379884504

Validation RMSLE =  0.46351648669924356 	Training RMSLE =  0.39370420030610537

Validation RMSLE =  0.45743002823019624 	Training RMSLE =  0.3943915647561121

Time Taken:  122.99295043945312


# Prepare submission for Test Data

In [25]:
# Prepare submission
# Ensemble every model collected in preds_buf instead of writing only the predictions
# of the last loop iteration. The average is taken in log(duration + 1) space, the
# space in which the models were trained and scored.
if len(preds_buf) == 0:
    raise RuntimeError("preds_buf is empty - run the training cell before building the submission")
ensemble_preds = np.expm1(np.mean(np.log1p(np.asarray(preds_buf)), axis=0))

subm = pd.DataFrame()
subm['id'] = test.id.values
subm['trip_duration'] = ensemble_preds
subm.to_csv('submission_lgbm1.csv', index=False)