# Voting Regression

In [1]:
# Importing all the Required Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import pickle

In [2]:
data_frame = pd.read_csv("./../data/interim/pre_processed_dataset_2023.csv")
data_frame.head()

Unnamed: 0,AirAsia,AkasaAir,AllianceAir,Arrival_encoded,Class,Days_left,Departure_encoded,Destination_Bangalore,Destination_Chennai,Destination_Delhi,...,Source_Bangalore,Source_Chennai,Source_Delhi,Source_Hyderabad,Source_Kolkata,Source_Mumbai,SpiceJet,StarAir,Total_stops,Vistara
0,0,0,0,2,0.0,1,2,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0,0,0,4,0.0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,4,0.0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,2,0.0,1,2,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,0,0,2,0.0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [3]:
# Taking all columns except the target variable 'Fare'
X = data_frame.drop('Fare', axis=1)
print(X.head())

# Target variable 'Fare'
y = data_frame['Fare']
print(y.head())

   AirAsia  AkasaAir  AllianceAir  Arrival_encoded  Class  Days_left  \
0        0         0            0                2    0.0          1   
1        0         0            0                4    0.0          1   
2        0         0            0                4    0.0          1   
3        0         0            0                2    0.0          1   
4        0         0            0                2    0.0          1   

   Departure_encoded  Destination_Bangalore  Destination_Chennai  \
0                  2                      0                    0   
1                  2                      0                    0   
2                  2                      0                    0   
3                  2                      0                    0   
4                  2                      0                    0   

   Destination_Delhi  ...  Source_Bangalore  Source_Chennai  Source_Delhi  \
0                  0  ...                 0               0             1   
1   

In [4]:
# train-test split random sampling
X_train, X_test, y_train, y_test = train_test_split(data_frame.drop('Fare', axis=1), data_frame['Fare'], test_size=0.2, random_state=42)

# Assuming you have your feature matrix 'X' and target vector 'y'
# X: array-like or dataframe with features
# y: array-like or series with target labels

# Splitting the data into training and testing sets while maintaining stratification
X_train, X_test, y_train, y_test = train_test_split(data_frame.drop('Fare', axis=1), data_frame['Fare'], test_size=0.2, stratify=data_frame['Fare'], random_state=42)



In [5]:
# create the models
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()
lr = LinearRegression()

In [6]:
# create the voting regressor
voting_regressor = VotingRegressor([('rf', rf), ('gb', gb), ('lr', lr)])

# fit the model
voting_regressor.fit(X_train, y_train)

# Using the pickle file to get the trained model
# model = open('./../models/voting_regressor.pkl','rb')
# voting_regressor = pickle.load(model)

# predict the values
y_pred = voting_regressor.predict(X_test)

# Analyzing the performance of the model
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

Mean Absolute Error (MAE): 3824.226
Mean Squared Error (MSE): 35300021.191
Root Mean Squared Error (RMSE): 5941.382
R2_score: 0.914459
Root Mean Squared Log Error (RMSLE): 8.69


In [7]:
# Saving the model to pickle file
file = open('./../models/voting_regressor.pkl', 'wb')
pickle.dump(voting_regressor, file)

> **Checking how the individual models perform**

In [8]:
# Random Forest
print("RandomForest Regressor...")
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

RandomForest Regressor...
Mean Absolute Error (MAE): 2132.312
Mean Squared Error (MSE): 18841386.418
Root Mean Squared Error (RMSE): 4340.667
R2_score: 0.954342
Root Mean Squared Log Error (RMSLE): 8.376


In [9]:
# Gradient Boosting
print("Gradient Boosting...")
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

Gradient Boosting...
Mean Absolute Error (MAE): 4029.051
Mean Squared Error (MSE): 42650374.964
Root Mean Squared Error (RMSE): 6530.725
R2_score: 0.896647
Root Mean Squared Log Error (RMSLE): 8.784


In [10]:
# Linear Regression
print("Linear Regression...")
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

Linear Regression...
Mean Absolute Error (MAE): 6807.192
Mean Squared Error (MSE): 94958199.61
Root Mean Squared Error (RMSE): 9744.65
R2_score: 0.769891
Root Mean Squared Log Error (RMSLE): 9.184
