Imported Libraries

In [1]:
import numpy as np
import pandas as pd
import time

Mounted Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Installed Catboost and Lightboost packages

In [3]:
! pip install catboost
! pip install lightgbm

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 14 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


Reading the data

In [4]:
train_data = pd.read_csv("/content/drive/MyDrive/foml/ass4/train.csv",nrows=700000)
test_data = pd.read_csv("/content/drive/MyDrive/foml/ass4/test.csv")

print(train_data.shape,test_data.shape)


(700000, 8) (9914, 7)


Pre Processing steps

In [5]:
key = test_data['key']

train_data.drop(train_data[train_data["passenger_count"] > 5].index, axis=0, inplace=True)
train_data.drop(train_data[train_data["passenger_count"] == 0].index, axis=0, inplace=True)

train_data.dropna(inplace=True)

train_data.drop(train_data.index[(train_data.pickup_longitude < -75.0)|
                             (train_data.pickup_longitude > -72.0)|
                             (train_data.pickup_latitude < 40.0)|
                             (train_data.pickup_latitude > 42.0)], inplace=True)

train_data.drop(train_data.index[(train_data.dropoff_longitude < -75.0)|
                             (train_data.dropoff_longitude > -72.0)|
                             (train_data.dropoff_latitude < 40.0)|
                             (train_data.dropoff_latitude > 42.0)], inplace=True)

Method to get a CSV submission file

In [6]:
def get_submission_file(y_pred,filename="submission.csv"):
    submission = pd.DataFrame()
    submission['fare_amount'] = y_pred
    submission['key'] = key
    submission = submission.to_csv(filename,index=False)
    return submission

Using Label encoding to encode selected columns

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

res = pd.DataFrame()

to_encode = ['key','pickup_datetime']
for label in to_encode:
    train_data[label] = le.fit_transform(train_data[label])
    test_data[label] = le.fit_transform(test_data[label])


fares = train_data['fare_amount']
xfeatures = train_data.drop(columns=['fare_amount'])
yfeatures=test_data

Observing Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xfeatures,fares)
y_pred = model.predict(yfeatures)
get_submission_file(y_pred)

Observing Light Boost Regressor

In [9]:
from lightgbm import LGBMRegressor
model = LGBMRegressor(n_estimators=1250)
model.fit(xfeatures,fares)
lgbmy_pred = model.predict(yfeatures)
get_submission_file(lgbmy_pred,"submission_lgbm.csv")

Observing Cat boost regressor

In [10]:
from catboost import CatBoostRegressor
model = CatBoostRegressor(loss_function='RMSE')
model.fit(xfeatures,fares)
cbry_pred = model.predict(yfeatures)
get_submission_file(cbry_pred,"submission_cbr.csv")

Learning rate set to 0.114431
0:	learn: 9.1310184	total: 148ms	remaining: 2m 27s
1:	learn: 8.5816520	total: 243ms	remaining: 2m 1s
2:	learn: 8.1181673	total: 329ms	remaining: 1m 49s
3:	learn: 7.7218369	total: 415ms	remaining: 1m 43s
4:	learn: 7.3950567	total: 496ms	remaining: 1m 38s
5:	learn: 7.1042432	total: 575ms	remaining: 1m 35s
6:	learn: 6.8617345	total: 669ms	remaining: 1m 34s
7:	learn: 6.6564951	total: 758ms	remaining: 1m 33s
8:	learn: 6.4872475	total: 836ms	remaining: 1m 32s
9:	learn: 6.3051028	total: 930ms	remaining: 1m 32s
10:	learn: 6.1818433	total: 1.01s	remaining: 1m 30s
11:	learn: 6.0698534	total: 1.09s	remaining: 1m 29s
12:	learn: 5.9827237	total: 1.17s	remaining: 1m 28s
13:	learn: 5.8728343	total: 1.26s	remaining: 1m 28s
14:	learn: 5.7928556	total: 1.34s	remaining: 1m 28s
15:	learn: 5.7295813	total: 1.43s	remaining: 1m 27s
16:	learn: 5.6371879	total: 1.52s	remaining: 1m 27s
17:	learn: 5.5600815	total: 1.6s	remaining: 1m 27s
18:	learn: 5.4951035	total: 1.69s	remaining: 1

Taking the mean of the CBR and LGBM

In [11]:
mean_pred = [(ele1 + ele2) / 2 for ele1, ele2 in zip(lgbmy_pred, cbry_pred)]
get_submission_file(mean_pred,"submission_mean.csv")

XGBoost Regressor

In [12]:
import xgboost as xgb
xg_reg = xgb.XGBRegressor(n_estimators=200,learning_rate=0.05,max_depth=10)
xg_reg.fit(xfeatures,fares)
y_pred = xg_reg.predict(yfeatures)
get_submission_file(y_pred,"submission_xgb.csv")

