# Voting Regression

In [28]:
# Importing all the Required Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import pickle

In [29]:
data_frame = pd.read_csv("./../data/interim/pre_processed_dataset_2023.csv")
data_frame.head()

Unnamed: 0,Airline_AirAsia,Airline_AkasaAir,Airline_AllianceAir,Airline_GO FIRST,Airline_Indigo,Airline_SpiceJet,Airline_StarAir,Airline_Vistara,Arrival_encoded,Class,...,Journey_date,Journey_day_encoded,Journey_month,Source_Bangalore,Source_Chennai,Source_Delhi,Source_Hyderabad,Source_Kolkata,Source_Mumbai,Total_stops
0,0,0,0,0,0,1,0,0,2,0.0,...,16,1,1,0,0,1,0,0,0,0
1,0,0,0,0,1,0,0,0,4,0.0,...,16,1,1,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,4,0.0,...,16,1,1,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,2,0.0,...,16,1,1,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,2,0.0,...,16,1,1,0,0,1,0,0,0,0


In [30]:
# Taking all columns except the target variable 'Fare'
X = data_frame.drop('Fare', axis=1)
print(X.head())

# Target variable 'Fare'
y = data_frame['Fare']
print(y.head())

   Airline_AirAsia  Airline_AkasaAir  Airline_AllianceAir  Airline_GO FIRST  \
0                0                 0                    0                 0   
1                0                 0                    0                 0   
2                0                 0                    0                 1   
3                0                 0                    0                 0   
4                0                 0                    0                 0   

   Airline_Indigo  Airline_SpiceJet  Airline_StarAir  Airline_Vistara  \
0               0                 1                0                0   
1               1                 0                0                0   
2               0                 0                0                0   
3               0                 1                0                0   
4               0                 0                0                0   

   Arrival_encoded  Class  ...  Journey_date  Journey_day_encoded  \
0                

In [31]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(data_frame.drop('Fare', axis=1), data_frame['Fare'], test_size=0.2, random_state=42)

In [32]:
# create the models
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()
lr = LinearRegression()

In [33]:
# create the voting regressor
voting_regressor = VotingRegressor([('rf', rf), ('gb', gb), ('lr', lr)])

# fit the model
voting_regressor.fit(X_train, y_train)

# Using the pickle file to get the trained model
# model = open('./../models/voting_regressor.pkl','rb')
# voting_regressor = pickle.load(model)

# predict the values
y_pred = voting_regressor.predict(X_test)

# Analyzing the performance of the model
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

Mean Absolute Error (MAE): 3824.298
Mean Squared Error (MSE): 35292208.117
Root Mean Squared Error (RMSE): 5940.725
R2_score: 0.914478
Root Mean Squared Log Error (RMSLE): 8.69


In [38]:
# Saving the model to pickle file
file = open('./../models/voting_regressor.pkl', 'wb')
pickle.dump(voting_regressor, file)

> **Checking how the individual models perform**

In [34]:
# Random Forest
print("RandomForest Regressor...")
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

RandomForest Regressor...
Mean Absolute Error (MAE): 2133.587
Mean Squared Error (MSE): 18896375.764
Root Mean Squared Error (RMSE): 4346.996
R2_score: 0.954209
Root Mean Squared Log Error (RMSLE): 8.377


In [35]:
# Gradient Boosting
print("Gradient Boosting...")
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

Gradient Boosting...
Mean Absolute Error (MAE): 4029.051
Mean Squared Error (MSE): 42650374.964
Root Mean Squared Error (RMSE): 6530.725
R2_score: 0.896647
Root Mean Squared Log Error (RMSLE): 8.784


In [36]:
# Linear Regression
print("Linear Regression...")
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

Linear Regression...
Mean Absolute Error (MAE): 6807.192
Mean Squared Error (MSE): 94958199.61
Root Mean Squared Error (RMSE): 9744.65
R2_score: 0.769891
Root Mean Squared Log Error (RMSLE): 9.184
