## Preprocessing the dataset

In [140]:
# Importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [141]:
# Loading the dataset
data_path = "./../data/processed/cleaned_dataset_2023.csv"
encoded_data = pd.read_csv(data_path)
encoded_data.head()

Unnamed: 0,Date_of_journey,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare
0,16-01-2023,Monday,SpiceJet,SG-8169,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5335
1,16-01-2023,Monday,Indigo,6E-2519,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.3333,1,5899
2,16-01-2023,Monday,GO FIRST,G8-354,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.1667,1,5801
3,16-01-2023,Monday,SpiceJet,SG-8709,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5794
4,16-01-2023,Monday,Air India,AI-805,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.1667,1,5955


In [142]:
print(encoded_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452088 entries, 0 to 452087
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date_of_journey    452088 non-null  object 
 1   Journey_day        452088 non-null  object 
 2   Airline            452088 non-null  object 
 3   Flight_code        452088 non-null  object 
 4   Class              452088 non-null  object 
 5   Source             452088 non-null  object 
 6   Departure          452088 non-null  object 
 7   Total_stops        452088 non-null  object 
 8   Arrival            452088 non-null  object 
 9   Destination        452088 non-null  object 
 10  Duration_in_hours  452088 non-null  float64
 11  Days_left          452088 non-null  int64  
 12  Fare               452088 non-null  int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 44.8+ MB
None


In [143]:
# Preprocess the training data

encoded_data.dropna(inplace=True)

# Extract the day and month of the journey from the "Date_of_Journey" column
encoded_data["Journey_date"] = pd.to_datetime(encoded_data["Date_of_journey"], format="%d-%m-%Y").dt.day
encoded_data["Journey_month"] = pd.to_datetime(encoded_data["Date_of_journey"], format="%d-%m-%Y").dt.month

# One-hot encoding the Journey_day feature
encoded_data['Journey_day_encoded'] = encoded_data['Journey_day'].map({'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,'Friday': 5, 'Saturday': 6, 'Sunday': 7})


In [144]:
encoded_data.head()
print(encoded_data["Journey_day_encoded"].unique())
print(encoded_data["Journey_month"].unique())

[1 2 3 4 5 6 7]
[1 2 3]


In [145]:
# Drop the "Date_of_Journey" column
encoded_data.drop(["Date_of_journey", "Journey_day"], axis=1, inplace=True)

In [146]:
# Drop the "Flight code" column
encoded_data.drop(["Flight_code"], axis=1, inplace=True)

In [147]:
# Ordinal encoding the Class feature
from sklearn.preprocessing import OrdinalEncoder

# Extract unique values of Class feature
class_values = encoded_data['Class'].unique()
print("Unique class values: ", class_values)

# Create OrdinalEncoder object
encoder = OrdinalEncoder(categories=[class_values])

# Fit and transform Class feature
encoded_data['Class'] = encoder.fit_transform(encoded_data[['Class']])


Unique class values:  ['Economy' 'Premium Economy' 'Business' 'First']


In [148]:
print(encoded_data['Class'].unique())
print(encoded_data.head())

[0. 1. 2. 3.]
     Airline  Class Source   Departure Total_stops      Arrival Destination  \
0   SpiceJet    0.0  Delhi  After 6 PM    non-stop   After 6 PM      Mumbai   
1     Indigo    0.0  Delhi  After 6 PM    non-stop  Before 6 AM      Mumbai   
2   GO FIRST    0.0  Delhi  After 6 PM    non-stop  Before 6 AM      Mumbai   
3   SpiceJet    0.0  Delhi  After 6 PM    non-stop   After 6 PM      Mumbai   
4  Air India    0.0  Delhi  After 6 PM    non-stop   After 6 PM      Mumbai   

   Duration_in_hours  Days_left  Fare  Journey_date  Journey_month  \
0             2.0833          1  5335            16              1   
1             2.3333          1  5899            16              1   
2             2.1667          1  5801            16              1   
3             2.0833          1  5794            16              1   
4             2.1667          1  5955            16              1   

   Journey_day_encoded  
0                    1  
1                    1  
2              

In [149]:
# # encoding source destination and airline using one hot encoding
# airline = pd.get_dummies(train_data["Airline"], drop_first=True)
# source = pd.get_dummies(train_data["Source"], drop_first=True)
# destination = pd.get_dummies(train_data["Destination"], drop_first=True)
# train_data.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace=True)

# train_data = pd.concat([train_data, airline, source, destination], axis=1)
# train_data.drop(["Airline", "Source", "Destination"], axis=1, inplace=True)

In [150]:
# Encode source and destination features using one hot encoding
Airline = encoded_data['Airline']
print("Unique airlines : ", encoded_data["Airline"].unique())
Airline = pd.get_dummies(Airline, drop_first= True)

print("Airlines column without sorting: \n", Airline.head())
print("Airlines column after sorting: \n", (Airline.sort_index(axis=1)).head)

# Adding Airlines to the encoded data frame
print("Encoded data frame after appending Airlines:\n", encoded_data.head())

Source = encoded_data[["Source"]]
Source = pd.get_dummies(Source, drop_first= True)

print("Source column without sorting: \n", Source.head())
print("Source column after sorting: \n", (Source.sort_index(axis=1)).head)
encoded_data.append(Source)
print("Encoded data frame after appending Source:\n", encoded_data.head())

Destination = encoded_data[["Destination"]]
Destination = pd.get_dummies(Destination, drop_first = True)
encoded_data = pd.concat([encoded_data, Airline, Source, Destination], axis = 1)
encoded_data.drop(["Airline", "Source", "Destination"], axis = 1, inplace = True)

encoded_data.append(Destination)
print("Encoded data frame after appending Destination:\n", encoded_data.head())

Unique airlines :  ['SpiceJet' 'Indigo' 'GO FIRST' 'Air India' 'AirAsia' 'Vistara' 'AkasaAir'
 'AllianceAir' 'StarAir']
Airlines column without sorting: 
    AirAsia  AkasaAir  AllianceAir  GO FIRST  Indigo  SpiceJet  StarAir  \
0        0         0            0         0       0         1        0   
1        0         0            0         0       1         0        0   
2        0         0            0         1       0         0        0   
3        0         0            0         0       0         1        0   
4        0         0            0         0       0         0        0   

   Vistara  
0        0  
1        0  
2        0  
3        0  
4        0  
Airlines column after sorting: 
 <bound method NDFrame.head of         AirAsia  AkasaAir  AllianceAir  GO FIRST  Indigo  SpiceJet  StarAir  \
0             0         0            0         0       0         1        0   
1             0         0            0         0       1         0        0   
2             0       

  encoded_data.append(Source)


Encoded data frame after appending Source:
      Airline  Class Source   Departure Total_stops      Arrival Destination  \
0   SpiceJet    0.0  Delhi  After 6 PM    non-stop   After 6 PM      Mumbai   
1     Indigo    0.0  Delhi  After 6 PM    non-stop  Before 6 AM      Mumbai   
2   GO FIRST    0.0  Delhi  After 6 PM    non-stop  Before 6 AM      Mumbai   
3   SpiceJet    0.0  Delhi  After 6 PM    non-stop   After 6 PM      Mumbai   
4  Air India    0.0  Delhi  After 6 PM    non-stop   After 6 PM      Mumbai   

   Duration_in_hours  Days_left  Fare  Journey_date  Journey_month  \
0             2.0833          1  5335            16              1   
1             2.3333          1  5899            16              1   
2             2.1667          1  5801            16              1   
3             2.0833          1  5794            16              1   
4             2.1667          1  5955            16              1   

   Journey_day_encoded  
0                    1  
1         

  encoded_data.append(Destination)


Encoded data frame after appending Destination:
    Class   Departure Total_stops      Arrival  Duration_in_hours  Days_left  \
0    0.0  After 6 PM    non-stop   After 6 PM             2.0833          1   
1    0.0  After 6 PM    non-stop  Before 6 AM             2.3333          1   
2    0.0  After 6 PM    non-stop  Before 6 AM             2.1667          1   
3    0.0  After 6 PM    non-stop   After 6 PM             2.0833          1   
4    0.0  After 6 PM    non-stop   After 6 PM             2.1667          1   

   Fare  Journey_date  Journey_month  Journey_day_encoded  ...  Source_Delhi  \
0  5335            16              1                    1  ...             1   
1  5899            16              1                    1  ...             1   
2  5801            16              1                    1  ...             1   
3  5794            16              1                    1  ...             1   
4  5955            16              1                    1  ...             1

In [151]:
# print(set(train_data['Departure']) - set(departure_categories))

In [154]:
# Encoding Departure and Arrival using ordinal encoder
# extract unique values of Departure
departure_categories = encoded_data['Departure'].unique()

# sort the categories in the desired order
departure_categories = sorted(departure_categories, key=lambda x: ('Before' in x, 'AM' in x, 'Noon' in x, 'After' in x))

# create a dictionary to map categories to numerical values
departure_dict = {cat: i+1 for i, cat in enumerate(departure_categories)}

# encode the Departure feature using the dictionary
encoded_data['Departure_encoded'] = encoded_data['Departure'].map(departure_dict)

# For Arrival
arrival_categories = encoded_data['Arrival'].unique()

# sort the categories in the desired order
arrival_categories = ['Before 6 AM', '6 AM - 12 Noon', '12 Noon - 6 PM', 'After 6 PM']

# create a dictionary to map categories to numerical values
arrival_dict = {cat: i+1 for i, cat in enumerate(arrival_categories)}

# encode the Departure feature using the dictionary
encoded_data['Arrival_encoded'] = encoded_data['Arrival'].map(departure_dict)

encoded_data.drop(["Arrival", "Departure"], axis = 1, inplace = True)

In [155]:
encoded_data.head()

Unnamed: 0,Class,Total_stops,Duration_in_hours,Days_left,Fare,Journey_date,Journey_month,Journey_day_encoded,AirAsia,AkasaAir,...,Source_Kolkata,Source_Mumbai,Destination_Bangalore,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai,Departure_encoded,Arrival_encoded
0,0.0,non-stop,2.0833,1,5335,16,1,1,0,0,...,0,0,0,0,0,0,0,1,2,2
1,0.0,non-stop,2.3333,1,5899,16,1,1,0,0,...,0,0,0,0,0,0,0,1,2,4
2,0.0,non-stop,2.1667,1,5801,16,1,1,0,0,...,0,0,0,0,0,0,0,1,2,4
3,0.0,non-stop,2.0833,1,5794,16,1,1,0,0,...,0,0,0,0,0,0,0,1,2,2
4,0.0,non-stop,2.1667,1,5955,16,1,1,0,0,...,0,0,0,0,0,0,0,1,2,2


In [156]:
# Encoding the Total_stops feature using ordinal encoding
encoded_data['Total_stops'] = encoded_data['Total_stops'].replace(['non-stop', '1-stop', '2+-stop'], [0, 1, 2])

In [157]:
encoded_data = encoded_data.sort_index(axis=1)
encoded_data.head()

Unnamed: 0,AirAsia,AkasaAir,AllianceAir,Arrival_encoded,Class,Days_left,Departure_encoded,Destination_Bangalore,Destination_Chennai,Destination_Delhi,...,Source_Bangalore,Source_Chennai,Source_Delhi,Source_Hyderabad,Source_Kolkata,Source_Mumbai,SpiceJet,StarAir,Total_stops,Vistara
0,0,0,0,2,0.0,1,2,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0,0,0,4,0.0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,4,0.0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,2,0.0,1,2,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,0,0,2,0.0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [158]:
# Saving the preprocessed data frame in .csv file
encoded_data.to_csv('./../data/interim/pre_processed_dataset_2023.csv', index=False)

In [159]:
encoded_data.head()

Unnamed: 0,AirAsia,AkasaAir,AllianceAir,Arrival_encoded,Class,Days_left,Departure_encoded,Destination_Bangalore,Destination_Chennai,Destination_Delhi,...,Source_Bangalore,Source_Chennai,Source_Delhi,Source_Hyderabad,Source_Kolkata,Source_Mumbai,SpiceJet,StarAir,Total_stops,Vistara
0,0,0,0,2,0.0,1,2,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0,0,0,4,0.0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,4,0.0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,2,0.0,1,2,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,0,0,2,0.0,1,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
