In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

In [None]:
import scipy
from scipy.stats import spearmanr

import sklearn

import matplotlib.pyplot as plt
import seaborn as sb

# Load the taxi fares CSV into a DataFrame and preview its first rows
data = pd.read_csv('/kaggle/input/nyctaxifares/NYCTaxiFares.csv')
data.head()

Exploring the features and shape of the dataset

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
data.shape

In [None]:
# Per-column dtype and non-null count summary
data.info()

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# Convert pickup_datetime to pandas datetimes so the .dt accessors can be used
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])

# pandas numbers weekdays 0-6 starting from Monday (0 = Monday, 6 = Sunday)
data['day_of_week'] = data['pickup_datetime'].dt.dayofweek

data.head()

In [None]:
# Store day_of_week as 1-7 instead of 0-6. The value is recomputed from
# pickup_datetime (rather than incrementing the existing column) so that
# re-running this cell does not shift the days a second time.
data['day_of_week'] = data['pickup_datetime'].dt.weekday + 1

# Extract hour, month and year from the pickup timestamp into their own columns.
# NOTE(review): these are taken in whatever timezone pickup_datetime carries —
# confirm that matches NYC local time before interpreting hour/day effects.
data['hour'] = data['pickup_datetime'].dt.hour
data['month'] = data['pickup_datetime'].dt.month
data['year'] = data['pickup_datetime'].dt.year
data.head()

In [None]:
# Show the distinct values present in each derived date column
for date_col in ('hour', 'day_of_week', 'month', 'year'):
    print(data[date_col].unique())

Calculating the distance between the pickup and dropoff coordinates (latitude, longitude) for all the observations
-> The geopy package needs to be installed separately:
    !pip install geopy

In [None]:
import geopy.distance

# Geodesic distance in kilometres between each trip's pickup and dropoff point.
# The four coordinate columns are zipped together instead of using iterrows(),
# which builds a Series per row and is much slower for the same result.
distance_kms = [
    geopy.distance.distance((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon)).km
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        data['pickup_latitude'],
        data['pickup_longitude'],
        data['dropoff_latitude'],
        data['dropoff_longitude'],
    )
]

data['distance_kms'] = distance_kms
data.head()

Dropping month and year, as all observations are from a single month and year. Also dropping the pickup_datetime, pickup_latitude, pickup_longitude, dropoff_latitude and dropoff_longitude columns, as the required information has already been extracted from them into separate columns

In [None]:
# Remove the timestamp, the constant month/year and the raw coordinates;
# the derived columns (day_of_week, hour, distance_kms) are kept
data = data.drop(
    columns=[
        'pickup_datetime',
        'month',
        'year',
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
)
data.head()

In [None]:
# Annotated heatmap of pairwise correlations, used to spot weakly correlated features
fig, ax = plt.subplots(figsize=(10, 10))
sb.heatmap(data.corr(), annot=True, ax=ax)

In [None]:
# Average fare for each day of the week. The result is left in day order:
# a line plot connects points in the order given, so sorting by fare first
# would make the line jump back and forth across the x-axis.
data.groupby("day_of_week")["fare_amount"].mean().plot()

plt.xlabel("Day of week")
plt.ylabel("Fare Amount Average")
plt.show()

In [None]:
# Mean fare amount for each fare class
ax = sb.barplot(x="fare_class", y="fare_amount", data=data)
ax.set_title("Fare Amount vs Fare Class", fontsize=14)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Target is fare_amount; every remaining column is used as a feature.
# NOTE(review): X still contains fare_class. If that column is derived from
# fare_amount it leaks the target into the features — confirm, and drop it if so.
y = data['fare_amount'].to_numpy()
X = data.drop('fare_amount', axis=1).to_numpy()

# Hold out 33% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least squares model on the training split.
# fit() returns the estimator itself, so `model` and `lin_reg` are the same object.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
model = lin_reg

# Predict fares for the held-out test split
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# R2 of the linear regression predictions on the test split
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error and its square root (RMSE) for the linear regression predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rootmeansq = np.sqrt(mse)
print(mse, rootmeansq, sep="\n")

In [None]:
import xgboost as xg

# Gradient-boosted regression with 10 trees, trained on the training split.
# 'reg:squarederror' is the current name of the squared-error objective
# ('reg:linear' is its deprecated alias), and random_state is the
# scikit-learn-style seed parameter of XGBRegressor.
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=10, random_state=123)
xgb_r.fit(X_train, y_train)

# Predict fares for the held-out test split
y_pred = xgb_r.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# R2 of the XGBoost predictions on the test split
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error and its square root (RMSE) for the XGBoost predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rootmeansq = np.sqrt(mse)
print(mse, rootmeansq, sep="\n")

The catboost package needs to be installed explicitly: !pip install catboost

In [None]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

model_CBR = CatBoostRegressor()

# Candidate values for each hyperparameter (3 x 3 x 3 = 27 combinations)
parameters = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50, 60],
}

# Exhaustive grid search with 2-fold cross-validation, using all available cores
grid = GridSearchCV(model_CBR, parameters, cv=2, n_jobs=-1)
grid.fit(X_train, y_train)

print("\n The best parameters across ALL searched params:\n", grid.best_params_)

In [None]:
# Train the final CatBoost model with the hyperparameters chosen by the grid
# search. They are read from grid.best_params_ rather than copied by hand, so
# this cell cannot drift out of sync with the search result.
model_CBR = CatBoostRegressor(**grid.best_params_)
model_CBR.fit(X_train, y_train)

# Predict fares for the held-out test split
y_pred = model_CBR.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# R2 of the CatBoost predictions on the test split
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error and its square root (RMSE) for the CatBoost predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rootmeansq = np.sqrt(mse)
print(mse, rootmeansq, sep="\n")

**The CatBoost regressor gave the best R2 score (0.8793) and the lowest RMSE (2.594) on the given dataset**