## Training and Analyzing Performance of Standalone models

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [3]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [6]:
data_frame = pd.read_csv("./../data/interim/pre_processed_dataset_2023.csv")
data_frame.head()

Unnamed: 0,Airline_AirAsia,Airline_AkasaAir,Airline_AllianceAir,Airline_GO FIRST,Airline_Indigo,Airline_SpiceJet,Airline_StarAir,Airline_Vistara,Arrival_encoded,Class,Days_left,Departure_encoded,Destination_Bangalore,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai,Duration_in_hours,Fare,Journey_date,Journey_day_encoded,Journey_month,Source_Bangalore,Source_Chennai,Source_Delhi,Source_Hyderabad,Source_Kolkata,Source_Mumbai,Total_stops
0,0,0,0,0,0,1,0,0,2,0.0,1,2,0,0,0,0,0,1,2.0833,5335,16,1,1,0,0,1,0,0,0,0
1,0,0,0,0,1,0,0,0,4,0.0,1,2,0,0,0,0,0,1,2.3333,5899,16,1,1,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,4,0.0,1,2,0,0,0,0,0,1,2.1667,5801,16,1,1,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,2,0.0,1,2,0,0,0,0,0,1,2.0833,5794,16,1,1,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,2,0.0,1,2,0,0,0,0,0,1,2.1667,5955,16,1,1,0,0,1,0,0,0,0


In [16]:
# Making X and y data frame and train test split
from sklearn.model_selection import train_test_split
x = data_frame.drop('Fare', axis=1)
y = data_frame['Fare']

X_train, X_test, y_train, y_test = train_test_split(data_frame.drop('Fare', axis=1), data_frame['Fare'], test_size=0.2, random_state=42)

In [8]:
# Scaling to the values to machine language
from sklearn.preprocessing import MinMaxScaler

scalar = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.fit_transform(X_test)
X_train = scalar.fit_transform(X_train_scaled)
X_test = scalar.fit_transform(X_test_scaled)

In [9]:
# Results data frame to store performance of all models
performance_measure = {'Model Name':[], 'Mean_Absolute_Error_MAE':[] ,'Adj_R_Square':[] ,'Root_Mean_Squared_Error_RMSE':[] ,'Mean_Absolute_Percentage_Error_MAPE':[] ,'Mean_Squared_Error_MSE':[] ,'Root_Mean_Squared_Log_Error_RMSLE':[] ,'R2_score':[]}

results = pd.DataFrame(performance_measure)
results.head()

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score


In [10]:
# Building the stand alone models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [11]:
# Creating objects for the models
lr = LinearRegression()
dt = DecisionTreeRegressor()
br = BaggingRegressor()
rfr = RandomForestRegressor()
svr = SVR()
xgbr = xgb.XGBRegressor()
knn = KNeighborsRegressor(n_neighbors=5)
etr = ExtraTreesRegressor()
re = Ridge()
lo = linear_model.Lasso(alpha=0.1)
gbr = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,                                      min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

In [19]:
# Training the models and printing their performance
from sklearn import metrics

models_list = [lr, dt, br, rfr, svr, xgbr, knn, etr, re, lo, gbr]

for model in models_list:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print('Model Name: ', model)
    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

    # Calculating mean absolute percentage error
    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')

    # Calculating adjusted R squared value
    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)

    print("================")

    new_row = {
    "Model Name": model,
    "Mean_Absolute_Error_MAE": metrics.mean_absolute_error(y_test, y_pred),
    "Adj_R_Square": adjusted_r_squared,
    "Root_Mean_Squared_Error_RMSE": np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
    "Mean_Absolute_Percentage_Error_MAPE": result,
    "Mean_Squared_Error_MSE": metrics.mean_squared_error(y_test, y_pred),
    "Root_Mean_Squared_Log_Error_RMSLE": np.log(
        np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    ),
    "R2_score": metrics.r2_score(y_test, y_pred),
    }

    results = results.append(new_row, ignore_index=True)

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 6807.192
Mean Squared Error (MSE): 94958199.61
Root Mean Squared Error (RMSE): 9744.65
R2_score: 0.769891
Root Mean Squared Log Error (RMSLE): 9.184
Mean Absolute Percentage Error (MAPE): 42.39 %
Adj R Square:  0.769876
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 2504.367
Mean Squared Error (MSE): 33485687.223
Root Mean Squared Error (RMSE): 5786.682
R2_score: 0.918855
Root Mean Squared Log Error (RMSLE): 8.663
Mean Absolute Percentage Error (MAPE): 11.48 %
Adj R Square:  0.91885
Model Name:  BaggingRegressor()
Mean Absolute Error (MAE): 2220.092
Mean Squared Error (MSE): 20639181.004
Root Mean Squared Error (RMSE): 4543.037
R2_score: 0.949986
Root Mean Squared Log Error (RMSLE): 8.421
Mean Absolute Percentage Error (MAPE): 10.43 %
Adj R Square:  0.949983
Model Name:  RandomForestRegressor()
Mean Absolute Error (MAE): 2133.977
Mean Squared Error (MSE): 18876430.778
Root Mean Squared Error (RMSE): 4344.701
R2

In [None]:
result.head()

In [None]:
# Creating a clean data frame of the results
models = [
    "LinearRegression",
    "DecisionTreeRegressor",
    "RandomForestRegressor",
    "KNeighborsRegressor",
    "ExtraTreesRegressor",
    "GradientBoostingRegressor",
    "XGBRegressor",
    "BaggingRegressor",
    "Ridge Regression",
    "Lasso Regression",
]
results_formatted = pd.DataFrame({"Model_Name": models})
results_formatted["Adj_R_Square"] = result["Adj_R_Square"]
results_formatted["Mean_Absolute_Error_MAE"] = result["Mean_Absolute_Error_MAE"]
results_formatted["Root_Mean_Squared_Error_RMSE"] = result[
    "Root_Mean_Squared_Error_RMSE"
]
results_formatted["Mean_Absolute_Percentage_Error_MAPE"] = result[
    "Mean_Absolute_Percentage_Error_MAPE"
]
results_formatted["Mean_Squared_Error_MSE"] = result["Mean_Squared_Error_MSE"]
results_formatted["Root_Mean_Squared_Log_Error_RMSLE"] = result[
    "Root_Mean_Squared_Log_Error_RMSLE"
]
results_formatted["R2_score"] = result["R2_score"]
results_formatted = results_formatted.sort_values(
    by="Adj_R_Square", ascending=False
).reset_index(drop=True)
results_formatted
