## Preprocessing the dataset

In [17]:
# Importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [18]:
# Loading the dataset
data_path = "./../data/processed/cleaned_dataset_2023.csv"
train_data = pd.read_csv(data_path)
train_data.head()

Unnamed: 0,Date_of_journey,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare
0,16-01-2023,Monday,SpiceJet,SG-8169,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5335
1,16-01-2023,Monday,Indigo,6E-2519,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.3333,1,5899
2,16-01-2023,Monday,GO FIRST,G8-354,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.1667,1,5801
3,16-01-2023,Monday,SpiceJet,SG-8709,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5794
4,16-01-2023,Monday,Air India,AI-805,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.1667,1,5955


In [19]:
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452088 entries, 0 to 452087
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date_of_journey    452088 non-null  object 
 1   Journey_day        452088 non-null  object 
 2   Airline            452088 non-null  object 
 3   Flight_code        452088 non-null  object 
 4   Class              452088 non-null  object 
 5   Source             452088 non-null  object 
 6   Departure          452088 non-null  object 
 7   Total_stops        452088 non-null  object 
 8   Arrival            452088 non-null  object 
 9   Destination        452088 non-null  object 
 10  Duration_in_hours  452088 non-null  float64
 11  Days_left          452088 non-null  int64  
 12  Fare               452088 non-null  int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 44.8+ MB
None


In [20]:
# Preprocess the training data

train_data.dropna(inplace=True)

# Extract the day and month of the journey from the "Date_of_Journey" column
train_data["Journey_date"] = pd.to_datetime(train_data["Date_of_journey"], format="%d-%m-%Y").dt.day
train_data["Journey_month"] = pd.to_datetime(train_data["Date_of_journey"], format="%d-%m-%Y").dt.month

# One-hot encoding the Journey_day feature
train_data['Journey_day_encoded'] = train_data['Journey_day'].map({'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,'Friday': 5, 'Saturday': 6, 'Sunday': 7})


In [21]:
train_data.head()

Unnamed: 0,Date_of_journey,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare,Journey_date,Journey_month,Journey_day_encoded
0,16-01-2023,Monday,SpiceJet,SG-8169,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5335,16,1,1
1,16-01-2023,Monday,Indigo,6E-2519,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.3333,1,5899,16,1,1
2,16-01-2023,Monday,GO FIRST,G8-354,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.1667,1,5801,16,1,1
3,16-01-2023,Monday,SpiceJet,SG-8709,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5794,16,1,1
4,16-01-2023,Monday,Air India,AI-805,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.1667,1,5955,16,1,1


In [22]:
# Drop the "Date_of_Journey" column
train_data.drop(["Date_of_journey", "Journey_day"], axis=1, inplace=True)

In [23]:
# Drop the "Flight code" column
train_data.drop(["Flight_code"], axis=1, inplace=True)

In [24]:
# Ordinal encoding the Class feature
from sklearn.preprocessing import OrdinalEncoder

# Extract unique values of Class feature
class_values = train_data['Class'].unique()

# Create OrdinalEncoder object
encoder = OrdinalEncoder(categories=[class_values])

# Fit and transform Class feature
train_data['Class'] = encoder.fit_transform(train_data[['Class']])


In [25]:
# # encoding source destination and airline using one hot encoding
# airline = pd.get_dummies(train_data["Airline"], drop_first=True)
# source = pd.get_dummies(train_data["Source"], drop_first=True)
# destination = pd.get_dummies(train_data["Destination"], drop_first=True)
# train_data.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace=True)

# train_data = pd.concat([train_data, airline, source, destination], axis=1)
# train_data.drop(["Airline", "Source", "Destination"], axis=1, inplace=True)

In [26]:
# Encode source and destination features using one hot encoding
Airline = train_data[["Airline"]]
Airline = pd.get_dummies(Airline, drop_first= True)

Source = train_data[["Source"]]
Source = pd.get_dummies(Source, drop_first= True)

Destination = train_data[["Destination"]]
Destination = pd.get_dummies(Destination, drop_first = True)
train_data = pd.concat([train_data, Airline, Source, Destination], axis = 1)
train_data.drop(["Airline", "Source", "Destination"], axis = 1, inplace = True)

In [27]:
# print(set(train_data['Departure']) - set(departure_categories))

In [28]:
# Encoding Departure and Arrival using ordinal encoder
# extract unique values of Departure
departure_categories = train_data['Departure'].unique()

# sort the categories in the desired order
departure_categories = sorted(departure_categories, key=lambda x: ('Before' in x, 'AM' in x, 'Noon' in x, 'After' in x))

# create a dictionary to map categories to numerical values
departure_dict = {cat: i+1 for i, cat in enumerate(departure_categories)}

# encode the Departure feature using the dictionary
train_data['Departure_encoded'] = train_data['Departure'].map(departure_dict)


# For Arrival
arrival_categories = train_data['Arrival'].unique()

# sort the categories in the desired order
arrival_categories = ['Before 6 AM', '6 AM - 12 Noon', '12 Noon - 6 PM', 'After 6 PM']

# create a dictionary to map categories to numerical values
arrival_dict = {cat: i+1 for i, cat in enumerate(arrival_categories)}

# encode the Departure feature using the dictionary
train_data['Arrival_encoded'] = train_data['Arrival'].map(departure_dict)

train_data.drop(["Arrival", "Departure"], axis = 1, inplace = True)

In [29]:
# Encoding the Total_stops feature using ordinal encoding
train_data['Total_stops'] = train_data['Total_stops'].replace(['non-stop', '1-stop', '2+-stop'], [0, 1, 2])

In [30]:
train_data = train_data.sort_index(axis=1)

In [31]:
# Saving the preprocessed data frame in .csv file
train_data.to_csv('./../data/interim/pre_processed_dataset_2023.csv', index=False)

In [32]:
train_data.head()

Unnamed: 0,Airline_AirAsia,Airline_AkasaAir,Airline_AllianceAir,Airline_GO FIRST,Airline_Indigo,Airline_SpiceJet,Airline_StarAir,Airline_Vistara,Arrival_encoded,Class,...,Journey_date,Journey_day_encoded,Journey_month,Source_Bangalore,Source_Chennai,Source_Delhi,Source_Hyderabad,Source_Kolkata,Source_Mumbai,Total_stops
0,0,0,0,0,0,1,0,0,2,0.0,...,16,1,1,0,0,1,0,0,0,0
1,0,0,0,0,1,0,0,0,4,0.0,...,16,1,1,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,4,0.0,...,16,1,1,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,2,0.0,...,16,1,1,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,2,0.0,...,16,1,1,0,0,1,0,0,0,0
