## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import warnings
warnings.filterwarnings('ignore')

## Data Analysis

In [2]:
train=pd.read_csv("/Users/rohithsiddi/Desktop/Participants_Dataset/train.csv")
test=pd.read_csv("/Users/rohithsiddi/Desktop/Participants_Dataset/test.csv")

In [3]:
train.head()

Unnamed: 0,trip_distance,rate_code,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,imp_surcharge,total_amount,pickup_location_id,dropoff_location_id,year,month,day,day_of_week,hour_of_day,trip_duration,calculated_total_amount
0,9.01,1,N,1,26.0,0.0,0.5,8.14,5.76,0.3,40.7,262,138,2018,3,7,2,6,2131.0,24.3
1,0.2,1,N,1,3.0,0.0,0.5,0.75,0.0,0.3,4.55,263,236,2018,2,25,6,10,2377.0,37.4
2,9.65,1,N,1,41.5,0.0,0.5,9.61,5.76,0.3,57.67,138,230,2018,1,29,0,8,1286.0,30.36
3,9.5,1,N,1,30.0,0.5,0.5,9.25,5.76,0.3,46.31,186,138,2018,9,25,1,20,2586.0,4.3
4,5.8,1,N,1,21.5,0.5,0.5,4.56,0.0,0.3,27.36,162,87,2018,8,20,0,21,1575.0,23.8


In [4]:
train['weekend'] = train['day_of_week'].apply(lambda x: 1 if x in [5,6] else 0)
test['weekend'] = test['day_of_week'].apply(lambda x: 1 if x in [5,6] else 0)

In [5]:
train.drop(["pickup_location_id","dropoff_location_id","year"],axis=1,inplace=True)
test.drop(["pickup_location_id","dropoff_location_id","year"],axis=1,inplace=True)

In [6]:
train.shape

(35000, 18)

In [7]:
train.isnull().sum()

trip_distance              0
rate_code                  0
store_and_fwd_flag         0
payment_type               0
fare_amount                0
extra                      0
mta_tax                    0
tip_amount                 0
tolls_amount               0
imp_surcharge              0
total_amount               0
month                      0
day                        0
day_of_week                0
hour_of_day                0
trip_duration              0
calculated_total_amount    0
weekend                    0
dtype: int64

In [8]:
train["store_and_fwd_flag"].value_counts()

N    34823
Y      177
Name: store_and_fwd_flag, dtype: int64

In [9]:
train=pd.get_dummies(train,columns=["store_and_fwd_flag","mta_tax","imp_surcharge"],drop_first=True)
test=pd.get_dummies(test,columns=["store_and_fwd_flag","mta_tax","imp_surcharge"],drop_first=True)

In [10]:
train["rate_code"]=train["rate_code"].apply(str)
train["payment_type"]=train["payment_type"].apply(str)
test["rate_code"]=test["rate_code"].apply(str)
test["payment_type"]=test["payment_type"].apply(str)

In [11]:
train=pd.get_dummies(train,columns=["rate_code","payment_type"],drop_first=True)
test=pd.get_dummies(test,columns=["rate_code","payment_type"],drop_first=True)

In [12]:
train.corr()["calculated_total_amount"].sort_values()

payment_type_2            -0.038343
rate_code_5               -0.013888
payment_type_3            -0.010936
imp_surcharge_0.3         -0.008944
payment_type_4            -0.007232
store_and_fwd_flag_Y      -0.005429
fare_amount               -0.004883
rate_code_3               -0.002708
day                       -0.001215
day_of_week                0.000730
tolls_amount               0.000835
month                      0.000863
hour_of_day                0.001093
trip_distance              0.001392
total_amount               0.001413
rate_code_4                0.001421
mta_tax_0.5                0.001764
extra                      0.004083
trip_duration              0.006518
weekend                    0.006671
tip_amount                 0.021426
rate_code_2                0.022800
calculated_total_amount    1.000000
Name: calculated_total_amount, dtype: float64

In [13]:
#train.info()

In [14]:
train["hour_of_day"].unique()

array([ 6, 10,  8, 20, 21,  7,  2,  3, 18, 12, 16, 15, 11, 17, 19, 22, 14,
        0,  4,  5,  9, 13, 23,  1])

In [15]:
train.loc[(train['hour_of_day'] >= 0) & (train["hour_of_day"] <= 6), 'hour_of_day'] = 1
train.loc[(train['hour_of_day'] > 6) & (train['hour_of_day'] <= 16), 'hour_of_day'] = 2
train.loc[(train['hour_of_day'] > 16) & (train['hour_of_day'] <= 24), 'hour_of_day'] = 3

In [16]:
test.loc[(test['hour_of_day'] >= 0) & (test["hour_of_day"] <= 6), 'hour_of_day'] = 1
test.loc[(test['hour_of_day'] > 6) & (test['hour_of_day'] <= 16), 'hour_of_day'] = 2
test.loc[(test['hour_of_day'] > 16) & (test['hour_of_day'] <= 24), 'hour_of_day'] = 3

In [17]:
train["hour_of_day"].unique()

array([1, 2, 3])

In [18]:
train=pd.get_dummies(train,columns=["hour_of_day"],drop_first=True)
test=pd.get_dummies(test,columns=["hour_of_day"],drop_first=True)

### Splitting data 

In [19]:
X=train.drop("calculated_total_amount",axis=1)
y=train["calculated_total_amount"]

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
train_X,test_X,train_y,test_y=train_test_split(X,y,test_size=0.1)

### Feature Scaling

In [22]:
train_X.columns

Index(['trip_distance', 'fare_amount', 'extra', 'tip_amount', 'tolls_amount',
       'total_amount', 'month', 'day', 'day_of_week', 'trip_duration',
       'weekend', 'store_and_fwd_flag_Y', 'mta_tax_0.5', 'imp_surcharge_0.3',
       'rate_code_2', 'rate_code_3', 'rate_code_4', 'rate_code_5',
       'payment_type_2', 'payment_type_3', 'payment_type_4', 'hour_of_day_2',
       'hour_of_day_3'],
      dtype='object')

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
ss=StandardScaler()

In [25]:
index_cols_scaling=['trip_distance', 'fare_amount', 'extra', 'tip_amount',
       'tolls_amount','total_amount','month', 'day',
       'day_of_week', 'trip_duration', 'store_and_fwd_flag_Y']

In [26]:
encoded_cols1=train_X[['rate_code_2',
       'rate_code_3', 'rate_code_4', 'rate_code_5', 'payment_type_2',
       'payment_type_3', 'payment_type_4','hour_of_day_2', 'hour_of_day_3',"weekend","mta_tax_0.5","imp_surcharge_0.3"]].copy()

encoded_cols2=test_X[['rate_code_2',
       'rate_code_3', 'rate_code_4', 'rate_code_5', 'payment_type_2',
       'payment_type_3', 'payment_type_4','hour_of_day_2', 'hour_of_day_3',"weekend","mta_tax_0.5","imp_surcharge_0.3"]].copy()

encoded_cols3=test[['rate_code_2',
       'rate_code_3', 'rate_code_4', 'rate_code_5', 'payment_type_2',
       'payment_type_3', 'payment_type_4','hour_of_day_2', 'hour_of_day_3',"weekend","mta_tax_0.5","imp_surcharge_0.3"]].copy()

In [27]:
train_X.drop(['rate_code_2',
       'rate_code_3', 'rate_code_4', 'rate_code_5', 'payment_type_2',
      'payment_type_3', 'payment_type_4','hour_of_day_2', 'hour_of_day_3',"weekend","mta_tax_0.5","imp_surcharge_0.3"],axis=1,inplace=True)

In [28]:
test_X.drop(['rate_code_2',
       'rate_code_3', 'rate_code_4', 'rate_code_5', 'payment_type_2',
      'payment_type_3', 'payment_type_4','hour_of_day_2', 'hour_of_day_3',"weekend","mta_tax_0.5","imp_surcharge_0.3"],axis=1,inplace=True)

In [29]:
test.drop(['rate_code_2',
       'rate_code_3', 'rate_code_4', 'rate_code_5', 'payment_type_2',
      'payment_type_3', 'payment_type_4','hour_of_day_2', 'hour_of_day_3',"weekend","mta_tax_0.5","imp_surcharge_0.3"],axis=1,inplace=True)

In [30]:
train_X=pd.DataFrame(ss.fit_transform(train_X))
train_X.columns=index_cols_scaling

In [31]:
test_X=pd.DataFrame(ss.transform(test_X))
test_X.columns=index_cols_scaling

In [32]:
test=pd.DataFrame(ss.transform(test))
test.columns=index_cols_scaling

In [33]:
train_X.index=encoded_cols1.index
test_X.index=encoded_cols2.index
test.index=encoded_cols3.index

In [34]:
train_X=pd.concat([train_X,encoded_cols1],axis=1)
test_X=pd.concat([test_X,encoded_cols2],axis=1)
test=pd.concat([test,encoded_cols3],axis=1)

## LinearRegression

In [35]:
from sklearn.linear_model import LinearRegression

In [36]:
lr=LinearRegression()

In [37]:
lr.fit(train_X,train_y)

In [38]:
predictions_lr=lr.predict(test_X)

In [39]:
from sklearn.metrics import mean_squared_error

In [40]:
rmse_lr=np.sqrt(mean_squared_error(predictions_lr,test_y))

In [41]:
rmse_lr

18.376932505512006

## KNN

In [42]:
from sklearn.neighbors import KNeighborsRegressor

In [43]:
knn=KNeighborsRegressor()

In [44]:
k_values=list(range(1,7))

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
param_grid={'n_neighbors':k_values}

In [47]:
knn=GridSearchCV(knn,param_grid,cv=5,scoring="accuracy")

In [48]:
knn.fit(train_X,train_y)

In [49]:
knn.best_params_

{'n_neighbors': 1}

In [50]:
predictions_knn=knn.predict(test_X)

In [51]:
rmse_knn=np.sqrt(mean_squared_error(test_y,predictions_knn))

In [52]:
rmse_knn

25.906356936915255

## SVM


In [53]:
from sklearn.svm import SVR

In [54]:
svr=SVR()

In [55]:
param_grid={'kernel':['linear', 'rbf'],"C":[0.25,0.5,1,1.5,2]}

In [56]:
svr=GridSearchCV(svr,param_grid,cv=5)

In [57]:
svr.fit(train_X,train_y)

In [58]:
svr.best_params_

{'C': 1, 'kernel': 'linear'}

In [59]:
predictions_svr=svr.predict(test_X)

In [60]:
rmse_svr=np.sqrt(mean_squared_error(predictions_svr,test_y))

In [61]:
rmse_svr

18.60513990389267

### RandomForest

In [62]:
from sklearn.ensemble import RandomForestRegressor

In [63]:
rfr=RandomForestRegressor()

In [64]:
n_estimators=[150,200,250,275,300,350,400]
max_features= [1,2,3,4,5]
bootstrap = [True]

In [65]:
param_grid = {'n_estimators':n_estimators,
             'max_features':max_features,
             'bootstrap':bootstrap}  # Note, oob_score only makes sense when bootstrap=True!

In [66]:
rfr= GridSearchCV(rfr,param_grid,cv=5)

In [67]:
rfr.fit(train_X,train_y)

In [68]:
rfr.best_params_

{'bootstrap': True, 'max_features': 5, 'n_estimators': 350}

In [69]:
predictions_rfr=rfr.predict(test_X)

In [70]:
rmse_rfr=np.sqrt(mean_squared_error(predictions_rfr,test_y))

In [71]:
rmse_rfr

18.765431344362334

### Gradient Boosting

In [72]:
from sklearn.ensemble import GradientBoostingRegressor

In [73]:
gbr=GradientBoostingRegressor()

In [74]:
param_grid = {"n_estimators":[2,3,4,5,6,7,8,9,10,11,12],'max_depth':[1,2,3,4]}

In [75]:
gbr = GridSearchCV(gbr,param_grid,cv=5)

In [76]:
gbr.fit(train_X,train_y)

In [77]:
gbr.best_params_

{'max_depth': 2, 'n_estimators': 12}

In [78]:
predictions_gbr=gbr.predict(test_X)

In [79]:
rmse_gbr=np.sqrt(mean_squared_error(predictions_gbr,test_y))

In [80]:
rmse_gbr

18.35972222091564

## XGBoost

In [81]:
from xgboost import XGBRegressor

In [82]:
xgb=XGBRegressor()

In [83]:
param_grid={"n_estimators":[100,150,200,250,300,350,400],'max_depth': [2,3, 4, 5]}

In [84]:
xgb=GridSearchCV(xgb,param_grid,cv=5)

In [85]:
xgb.fit(train_X,train_y)

In [86]:
xgb.best_params_

{'max_depth': 2, 'n_estimators': 100}

In [87]:
predictions_xgb=xgb.predict(test_X)

In [88]:
rmse_xgb=np.sqrt(mean_squared_error(predictions_xgb,test_y))

In [89]:
rmse_xgb

18.452298375456405

## ANN

In [90]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError

hidden_units1 = 300
hidden_units2 = 600
hidden_units3 = 400
learning_rate = 0.01
def build_model_using_sequential():
    model = Sequential([
    Dense(hidden_units1, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(hidden_units2, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(hidden_units3, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal', activation='linear')
    ])
    return model
# build the model
model = build_model_using_sequential()
# loss function
msle = MeanSquaredLogarithmicError()
model.compile(
    loss=msle, 
    optimizer=Adam(learning_rate=learning_rate), 
    metrics=[msle]
)
# train the model
history = model.fit(
    train_X, 
    train_y, 
    epochs=50,
    batch_size=64,
    validation_split=0.2
)

Metal device set to: Apple M1 Pro
Epoch 1/50


2023-01-24 18:40:24.381822: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-24 18:40:24.381947: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-01-24 18:40:24.566505: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-24 18:40:24.953990: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-01-24 18:40:29.443383: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50


Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [91]:
predictions_ann=model.predict(test_X)



2023-01-24 18:42:44.416241: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




In [92]:
rmse_ann=np.sqrt(mean_squared_error(predictions_ann,test_y))

In [93]:
rmse_ann

18.900378364138056

### Final Prediction

In [94]:
final_predictions=gbr.predict(test)

In [95]:
final_predictions

In [96]:
submission = pd.DataFrame({"calculated_total_amount":final_predictions})

In [97]:
submission.shape

In [98]:
submission.to_csv('my_submission_file.csv', index=False)