In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [4]:
features = ["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "passenger_count", "pickup_day", 
            "pickup_hour","pickup_day_of_week", "pickup_month", "pickup_year", "is_pickup_JFK", "is_dropoff_JFK", 
            "is_pickup_EWR", "is_dropoff_EWR", "is_pickup_la_guardia", "is_dropoff_la_guardia", "trip_distance"]
target = ["fare_amount"]

In [52]:
X = pd.read_csv("C:/Users/dhruv/Machine Learning/cleaned_train_data.csv", usecols = features)
y = pd.read_csv("C:/Users/dhruv/Machine Learning/cleaned_train_data.csv", usecols = target)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [7]:
#Creating a baseline prediction and calculating RMSE

avg_fare = round(np.mean(y_train),2)
baseline_pred=np.repeat(avg_fare, y_test.shape[0])
baseline_rmse=np.sqrt(mean_squared_error(baseline_pred, y_test))
print("Baseline RMSE of Validation data: ", baseline_rmse)

Baseline RMSE of Validation data:  9.600321642864396


In [8]:
#Creating a LinearRegression model and calculating the RMSE of the predictions

from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)
y_pred = np.round(lm.predict(X_test), 2)
lm_rmse=np.sqrt(mean_squared_error(y_pred, y_test))
lm_train_rmse = np.sqrt(mean_squared_error(lm.predict(X_train), y_train))
lm_variance=abs(lm_train_rmse - lm_rmse)
print("Test RMSE for Linear Regression is ", lm_rmse)
print("Train RMSE for Linear Regression is ", lm_train_rmse)
print("Variance for Linear Regression is ", lm_variance)

Test RMSE for Linear Regression is  4.955841583221551
Train RMSE for Linear Regression is  4.966887194386981
Variance for Linear Regression is  0.01104561116542957


In [10]:
#Creating a LightGBM Model for better prediction and a better RMSE value

train_data=lgb.Dataset(X_train,label=y_train)
param = {'num_leaves':31, 'num_trees':5000,'objective':'regression'}
param['metric'] = 'l2_root'
lgb_bst=lgb.train(param,train_data,100)
lgb_pred = lgb_bst.predict(X_test)
lgb_rmse=np.sqrt(mean_squared_error(lgb_pred, y_test))
print("RMSE for Light GBM is ",lgb_rmse)



RMSE for Light GBM is  3.5747546735970523


In [11]:
#Running LightGBM with CrossValidation and Comparing the RMSE value for the predictions

num_round=5000
cv_results = lgb.cv(param, train_data, num_boost_round=num_round, nfold=10,verbose_eval=20, early_stopping_rounds=20,stratified=False)
lgb_bst=lgb.train(param,train_data,len(cv_results['rmse-mean']))
lgb_pred = lgb_bst.predict(X_test)
lgb_rmse=np.sqrt(mean_squared_error(lgb_pred, y_test))
print("RMSE for Light GBM is ",lgb_rmse)



[20]	cv_agg's rmse: 4.29848 + 0.0991737
[40]	cv_agg's rmse: 3.98245 + 0.0997185
[60]	cv_agg's rmse: 3.8953 + 0.101485
[80]	cv_agg's rmse: 3.85209 + 0.100272
[100]	cv_agg's rmse: 3.81881 + 0.100226
[120]	cv_agg's rmse: 3.79493 + 0.100373
[140]	cv_agg's rmse: 3.77628 + 0.10098
[160]	cv_agg's rmse: 3.7603 + 0.100003
[180]	cv_agg's rmse: 3.74858 + 0.100658
[200]	cv_agg's rmse: 3.73797 + 0.100502
[220]	cv_agg's rmse: 3.72853 + 0.101448
[240]	cv_agg's rmse: 3.72028 + 0.100375
[260]	cv_agg's rmse: 3.7119 + 0.10068
[280]	cv_agg's rmse: 3.70528 + 0.10144
[300]	cv_agg's rmse: 3.69949 + 0.100944
[320]	cv_agg's rmse: 3.69379 + 0.101617
[340]	cv_agg's rmse: 3.6877 + 0.102821
[360]	cv_agg's rmse: 3.68307 + 0.103489
[380]	cv_agg's rmse: 3.67904 + 0.103507
[400]	cv_agg's rmse: 3.67511 + 0.103894
[420]	cv_agg's rmse: 3.67149 + 0.103958
[440]	cv_agg's rmse: 3.66822 + 0.103971
[460]	cv_agg's rmse: 3.66502 + 0.104768
[480]	cv_agg's rmse: 3.66187 + 0.105182
[500]	cv_agg's rmse: 3.65901 + 0.105249
[520]	cv_



RMSE for Light GBM is  3.5747546735970523


In [10]:
#Improving LightGBM Model by adding more tuning parameters 

param = {
    'objective' : 'regression',
    'boosting_type' : 'gbdt',
    'bagging_freq' : 20,
    'subsample_frequency' : 100,
    'num_leaves':100,
    'min_data_in_leaf': 50, 
    'max_depth':8,
    'colsample_bytree':0.5, 
    'lambda_l1':1,
    'lambda_l2':0   
}

cv_results = lgb.cv(param, train_data, num_boost_round=num_round, nfold=10,verbose_eval=20, early_stopping_rounds=20,stratified=False)
lgb_bst=lgb.train(param,train_data,len(cv_results['rmse-mean']))
lgb_pred = lgb_bst.predict(X_test)
lgb_rmse=np.sqrt(mean_squared_error(lgb_pred, y_test))
print("RMSE for Light GBM is ",lgb_rmse)

[20]	cv_agg's rmse: 4.37248 + 0.0963056
[40]	cv_agg's rmse: 3.99978 + 0.0993363
[60]	cv_agg's rmse: 3.89768 + 0.100983
[80]	cv_agg's rmse: 3.85548 + 0.101451
[100]	cv_agg's rmse: 3.82042 + 0.101236
[120]	cv_agg's rmse: 3.78958 + 0.102016
[140]	cv_agg's rmse: 3.76647 + 0.102975
[160]	cv_agg's rmse: 3.74707 + 0.103009
[180]	cv_agg's rmse: 3.7352 + 0.1035
[200]	cv_agg's rmse: 3.72391 + 0.102624
[220]	cv_agg's rmse: 3.71588 + 0.102788
[240]	cv_agg's rmse: 3.70665 + 0.10304
[260]	cv_agg's rmse: 3.69667 + 0.103073
[280]	cv_agg's rmse: 3.68687 + 0.103331
[300]	cv_agg's rmse: 3.68043 + 0.103441
[320]	cv_agg's rmse: 3.67184 + 0.104453
[340]	cv_agg's rmse: 3.66667 + 0.104766
[360]	cv_agg's rmse: 3.6632 + 0.104928
[380]	cv_agg's rmse: 3.65799 + 0.105286
[400]	cv_agg's rmse: 3.65509 + 0.105476
[420]	cv_agg's rmse: 3.65188 + 0.105649
[440]	cv_agg's rmse: 3.64933 + 0.105598
[460]	cv_agg's rmse: 3.64706 + 0.10568
[480]	cv_agg's rmse: 3.64513 + 0.105509
[500]	cv_agg's rmse: 3.64314 + 0.105349
[520]	cv

In [15]:
#Building LightGBM model with more tuning parameters 

param = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':50000
    }

lgb_bst=lgb.train(param,train_data,num_boost_round=10000,verbose_eval=500)
lgb_pred = lgb_bst.predict(X_test)
lgb_rmse=np.sqrt(mean_squared_error(lgb_pred, y_test))
print("RMSE for Light GBM is ",lgb_rmse)

RMSE for Light GBM is  3.473198503479004


In [16]:
#Using GridSearchCV to find the optimal tuning parameters for the LightGBM model

from sklearn.model_selection import GridSearchCV
grid_params = { 
    'max_depth':[5, 15, 30],
    'colsample_bytree':[0.3, 0.7], 
    'lambda_l1':[0, 1, 1.5],
    'lambda_l2':[0, 1]    
}

lgb_reg = lgb.LGBMRegressor(
    objective='regression',
    n_jobs = -1,
    verbose = 1, 
    boosting_type = 'gbdt',
    num_leaves = 31,
    bagging_freq = 20,
    max_depth = 5,
    colsample_bytree = 0.3,
    min_data_in_leaf = 30,
    lambda_l1 = 0,
    lambda_l2 = 0)

lgb_reg.get_params().keys()


grid = GridSearchCV(lgb_reg, grid_params)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)




{'colsample_bytree': 0.7, 'lambda_l1': 1, 'lambda_l2': 0, 'max_depth': 15}
0.8412823564351186


In [17]:
#Building LightGBM model with optimal tuning parameters found using GridSearchCV

param = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': 15,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.7,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':50000,
        'lambda_l1': 1,
        'lambda_l2': 0
    }

lgb_bst=lgb.train(param,train_data,num_boost_round=10000,verbose_eval=500)
lgb_pred = lgb_bst.predict(X_test)
lgb_rmse=np.sqrt(mean_squared_error(lgb_pred, y_test))
print("RMSE for Light GBM is ",lgb_rmse)



RMSE for Light GBM is  3.4710864718957914


In [18]:
lgb_pred

array([ 4.27342703,  8.4309459 , 30.63615465, ...,  4.73294701,
       17.3961679 ,  4.9889011 ])

In [53]:
test_data = pd.read_csv("C:/Users/dhruv/Machine Learning/cleaned_test_data.csv", usecols = features)

In [24]:
predictions = lgb_bst.predict(test_data)

In [27]:
predictions

array([ 9.33529451, 10.71883571,  3.99542776, ..., 53.31700581,
       17.72180397,  6.69142548])

In [54]:
test_data_keys = pd.read_csv("C:/Users/dhruv/Machine Learning/cleaned_test_data.csv", usecols = ["key"])

In [40]:
predictions

array([ 9.33529451, 10.71883571,  3.99542776, ..., 53.31700581,
       17.72180397,  6.69142548])

In [47]:
test_data_prediction = pd.DataFrame()

In [48]:
test_data_prediction['key']=test_data_keys['key']

In [49]:
test_data_prediction['fare_amount'] = predictions

In [51]:
test_data_prediction.to_csv("predictions.csv", index = False)