# **Creating Models**

In [58]:
import pandas as pd
import numpy as np

In [59]:
# Read the cleaned 2023 dataset from disk into `df` and preview its first rows

datapath = "./../cleaned_dataset_2023.csv"
df = pd.read_csv(filepath_or_buffer=datapath)
df.head(n=5)

Unnamed: 0,Date_of_journey,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare
0,16-01-2023,Monday,SpiceJet,SG-8169,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5335
1,16-01-2023,Monday,Indigo,6E-2519,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.3333,1,5899
2,16-01-2023,Monday,GO FIRST,G8-354,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.1667,1,5801
3,16-01-2023,Monday,SpiceJet,SG-8709,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5794
4,16-01-2023,Monday,Air India,AI-805,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.1667,1,5955


In [60]:
# Derive numeric calendar features from the "dd-mm-YYYY" journey date strings.
# The date column is parsed once and reused for both features.
journey_dates = pd.to_datetime(df["Date_of_journey"], format="%d-%m-%Y")
df["Journey_month"] = journey_dates.dt.month
# Overwrites the existing "Journey_day" column with the day of the month.
df["Journey_day"] = journey_dates.dt.day

In [61]:
# Encoding the training dataset
#
# Turns the loaded frame into an all-numeric feature table:
#   * nominal columns are one-hot encoded,
#   * Total_stops is mapped to an ordinal integer,
#   * the raw text/date columns are removed.
# The result is bound to both `df4` and `df`.

# One-hot encode each nominal column on its own so the per-column dummy frames
# (Airline, Class, ...) stay available under their original names.
# drop_first=True omits the first level of each column.
categorical_cols = ["Airline", "Class", "Source", "Destination", "Departure"]
Airline, Class, Source, Destination, Departure = (
    pd.get_dummies(df[[col]], drop_first=True) for col in categorical_cols
)

# Map the stop labels on the Total_stops column only (a frame-wide replace would
# also rewrite matching values in any other column) and fail loudly on a label
# that has no encoding instead of letting a stray string reach the model.
stops_encoding = {"non-stop": 0, "1-stop": 1, "2+-stop": 2}
total_stops = df["Total_stops"].map(stops_encoding)
unknown_stops = df.loc[total_stops.isna(), "Total_stops"].unique()
if len(unknown_stops) > 0:
    raise ValueError(f"Unexpected Total_stops labels: {list(unknown_stops)}")
total_stops = total_stops.astype("int64")

# Raw columns that must not reach the model: identifiers, the date string, and
# the text columns that were one-hot encoded above.
# NOTE(review): "Arrival" is dropped without being encoded, unlike "Departure" --
# confirm this is intentional.
raw_cols_to_drop = [
    "Flight_code", "Airline", "Class", "Source", "Destination",
    "Date_of_journey", "Journey_day", "Arrival", "Departure",
]

# Build the encoded frame without mutating the loaded one; the remaining
# original columns come first, followed by the dummy columns.
df4 = pd.concat(
    [
        df.drop(columns=raw_cols_to_drop).assign(Total_stops=total_stops),
        Airline, Class, Source, Destination, Departure,
    ],
    axis=1,
)
print(df4.head())
df = df4

   Total_stops  Duration_in_hours  Days_left  Fare  Journey_month  \
0            0             2.0833          1  5335              1   
1            0             2.3333          1  5899              1   
2            0             2.1667          1  5801              1   
3            0             2.0833          1  5794              1   
4            0             2.1667          1  5955              1   

   Airline_AirAsia  Airline_AkasaAir  Airline_AllianceAir  Airline_GO FIRST  \
0                0                 0                    0                 0   
1                0                 0                    0                 0   
2                0                 0                    0                 1   
3                0                 0                    0                 0   
4                0                 0                    0                 0   

   Airline_Indigo  ...  Source_Mumbai  Destination_Bangalore  \
0               0  ...              0         

In [62]:
# Target (dependent variable) goes in y; every other column (the independent
# variables / features) goes in x
y = df["Fare"]
x = df.drop(columns=["Fare"])

In [63]:
# Preview the encoded dataframe (it still contains the "Fare" target column)
df.head()

Unnamed: 0,Total_stops,Duration_in_hours,Days_left,Fare,Journey_month,Airline_AirAsia,Airline_AkasaAir,Airline_AllianceAir,Airline_GO FIRST,Airline_Indigo,...,Source_Mumbai,Destination_Bangalore,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai,Departure_6 AM - 12 PM,Departure_After 6 PM,Departure_Before 6 AM
0,0,2.0833,1,5335,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,0,2.3333,1,5899,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
2,0,2.1667,1,5801,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
3,0,2.0833,1,5794,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,0,2.1667,1,5955,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [64]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split identical between runs
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)
# Display the shapes of the four resulting pieces
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((316461, 30), (135627, 30), (316461,), (135627,))

In [66]:
# VOTING REGRESSOR MODEL

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [68]:
# Fitting the model into VotingRegressor
#
# Trains an ensemble of a random forest, gradient boosting and a linear
# regression on the training split, then reports the mean absolute error and
# the R^2 score on the test split.
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression

# Seed the stochastic learners so the metrics and the saved model are
# reproducible across runs (the train/test split is already seeded with 42).
# n_jobs=-1 lets the forest use all cores; it does not change the fitted model.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
gb = GradientBoostingRegressor(random_state=42)
lr = LinearRegression()

voting_regressor = VotingRegressor([('rf', rf), ('gb', gb), ('lr', lr)])
# Fitting the model
voting_regressor.fit(x_train, y_train)

# Predicting the values
y_pred = voting_regressor.predict(x_test)

# Calculating the mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# For a regressor, .score() returns the coefficient of determination (R^2),
# not a classification accuracy, so it is labelled accordingly.
r2 = voting_regressor.score(x_test, y_test)
accuracy = r2  # kept as an alias for any later cell that still reads `accuracy`
print("R^2 score: ", r2)

Mean Absolute Error: 3551.7804081958984
Accuracy:  0.9193341620444981


In [70]:
# Saving the model to a pickle file
import pickle

# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, so the complete pickle is on disk before any later cell reads it.
with open('voting_regressor_2023.pkl', 'wb') as file:
    pickle.dump(voting_regressor, file)

In [73]:
# Using the pickle file to get the trained model
# NOTE: pickle.load can execute arbitrary code -- only load a file that this
# notebook (or another trusted source) produced.
# `model` is the open file handle; the context manager closes it after loading.
with open('voting_regressor_2023.pkl', 'rb') as model:
    voting = pickle.load(model)

# Predict on the held-out features with the restored model
y_prediction = voting.predict(x_test)

In [None]:
# Preview the first rows of the held-out feature set used for the predictions above
x_test.head()