In [3]:
import datetime
import lightgbm as lgb
import numpy as np
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import time 
random_seed = 0
random.seed(random_seed)
np.random.seed(random_seed)

In [4]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
ss = pd.read_csv('sample_submission.csv')

In [5]:
def haversine(x1,x2):
    from math import sin, cos, sqrt, atan2, radians

    # approximate radius of earth in km
    R = 6373.0

    lat1 = abs(x1[0])
    lon1 = abs(x1[1])
    lat2 = abs(x2[0])
    lon2 = abs(x2[1])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    dist = R * c
    
    return dist

In [6]:
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.sqrt(np.mean(np.power(np.log1p(y_true + 1) - np.log1p(y_pred + 1), 2)))

In [7]:
def extract_features(df):
    df['hdistance'] = df.apply(lambda r: haversine((r['pickup_latitude'],r['pickup_longitude']),(r['dropoff_latitude'], r['dropoff_longitude'])), axis=1)
    df['distance'] = np.sqrt(np.power(df['dropoff_longitude'] - df['pickup_longitude'], 2) + np.power(df['dropoff_latitude'] - df['pickup_latitude'], 2))
    df['log_distance'] = np.log(df['distance'])
    df['month'] = df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
    df['day'] = df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[2]))
    df['hour'] = df['pickup_datetime'].apply(lambda x: int(x.split(' ')[1].split(':')[0]))
    df['minutes'] = df['pickup_datetime'].apply(lambda x: int(x.split(' ')[1].split(':')[1]))
    df['is_weekend'] = ((df.pickup_datetime.astype('datetime64[ns]').dt.dayofweek) // 4 == 1).astype(float)
    df['weekday'] = df.pickup_datetime.astype('datetime64[ns]').dt.dayofweek
    df['is_holyday'] = df.apply(lambda row: 1 if (row['month']==1 and row['day']==1) or (row['month']==7 and row['day']==4) or (row['month']==11 and row['day']==11) or (row['month']==12 and row['day']==25) or (row['month']==1 and row['day'] >= 15 and row['day'] <= 21 and row['weekday'] == 0) or (row['month']==2 and row['day'] >= 15 and row['day'] <= 21 and row['weekday'] == 0) or (row['month']==5 and row['day'] >= 25 and row['day'] <= 31 and row['weekday'] == 0) or (row['month']==9 and row['day'] >= 1 and row['day'] <= 7 and row['weekday'] == 0) or (row['month']==10 and row['day'] >= 8 and row['day'] <= 14 and row['weekday'] == 0) or (row['month']==11 and row['day'] >= 22 and row['day'] <= 28 and row['weekday'] == 3) else 0, axis=1)
    df['is_day_before_holyday'] = df.apply(lambda row: 1 if (row['month']==12 and row['day']==31) or (row['month']==7 and row['day']==3) or (row['month']==11 and row['day']==10) or (row['month']==12 and row['day']==24) or (row['month']==1 and row['day'] >= 14 and row['day'] <= 20 and row['weekday'] == 6) or (row['month']==2 and row['day'] >= 14 and row['day'] <= 20 and row['weekday'] == 6) or (row['month']==5 and row['day'] >= 24 and row['day'] <= 30 and row['weekday'] == 6) or ((row['month']==9 and row['day'] >= 1 and row['day'] <= 6) or (row['month']==8 and row['day'] == 31) and row['weekday'] == 6) or (row['month']==10 and row['day'] >= 7 and row['day'] <= 13 and row['weekday'] == 6) or (row['month']==11 and row['day'] >= 21 and row['day'] <= 27 and row['weekday'] == 2) else 0, axis=1)
    df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map(lambda x: 0 if x =='N' else 1)
    df.drop('day', axis=1, inplace=True)

In [8]:
t1 = time.time()
# Extract features
print('Extracting train features')
extract_features(train)
print('Extracting test features')
extract_features(test)

train.head()
t2 = time.time()
print("Time Taken: ",t2-t1)

Extracting train features


  after removing the cwd from sys.path.


Extracting test features
Time Taken:  268.51502299308777


In [9]:
# Prepare data
X = np.array(train.drop(['id', 'pickup_datetime', 'dropoff_datetime', 'store_and_fwd_flag', 'trip_duration'], axis=1))
y = np.log(train['trip_duration'].values+1)
median_trip_duration = np.median(train['trip_duration'].values)

print('X.shape = ' + str(X.shape))
print('y.shape = ' + str(y.shape))

X_test = np.array(test.drop(['id', 'pickup_datetime', 'store_and_fwd_flag'], axis=1))

print('X_test.shape = ' + str(X_test.shape))

X.shape = (1458644, 16)
y.shape = (1458644,)
X_test.shape = (625134, 16)


In [13]:
t1 = time.time()
print('Training and making predictions')
params = {
    'boosting_type': 'gbdt',
    'max_depth': 7,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_lambda': 0.5,
    'reg_alpha': 1.5,
    'objective': 'regression',
    'metric': 'rmsle',
    'learning_rate': 0.1,
    'verbose': 0, 
    }
n_estimators = 100

n_iters = 10
preds_buf = []
err_buf = []
for i in range(n_iters): 
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=i)
    d_train = lgb.Dataset(x_train, label=y_train)
    d_valid = lgb.Dataset(x_valid, label=y_valid)
    watchlist = [d_valid]

    model = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)

    preds = model.predict(x_valid)
    preds = np.exp(preds)
    preds[preds < 0] = median_trip_duration
    err = rmsle(np.exp(y_valid), preds)
    err_buf.append(err)
    print('RMSLE = ' + str(err))
    
    preds = model.predict(X_test)
    preds = np.exp(preds)
    preds[preds < 0] = median_trip_duration
    preds_buf.append(preds)

print('Mean RMSLE = ' + str(np.mean(err_buf)) + ' +/- ' + str(np.std(err_buf)))
# Average predictions
preds = np.mean(preds_buf, axis=0)
t2 = time.time()
print("Time Taken: ",t2-t1)

Training and making predictions
RMSLE = 0.4197975070150431
RMSLE = 0.4170133398899779
RMSLE = 0.41514868377328495
RMSLE = 0.420055171748733
RMSLE = 0.4183415336156974
RMSLE = 0.41826054567877763
RMSLE = 0.41608759119849986
RMSLE = 0.4223613125232363
RMSLE = 0.4172726874972868
RMSLE = 0.413067528873883
Mean RMSLE = 0.417740590181442 +/- 0.0025220322904568188
Time Taken:  38.0350456237793


In [12]:
# Prepare submission
subm = pd.DataFrame()
subm['id'] = test.id.values
subm['trip_duration'] = preds
subm.to_csv('submission_lgbm.csv', index=False)