In [101]:
# Flight Price Deep Learning ANN (Artificial Neural Network)

# Sebastian Vinther

In [102]:
# Package installation an libraries

In [103]:
!pip install scikit-learn
!pip install gradio
!pip install shap
!pip install imblearn
!pip install torch



In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tqdm import tqdm_notebook
from imblearn.over_sampling import SMOTE
import torch
from sklearn.preprocessing import OneHotEncoder
import itertools
from sklearn.model_selection import train_test_split


In [105]:
# Loading the Dataset from the github repository
df = pd.read_csv('https://raw.githubusercontent.com/sebvinther/Flight-Price-Deep-Learning-ANN-Artificial-Neural-Network-/main/Clean_Dataset1.csv', index_col=0)

In [106]:
# Looking at the shape and the basics to get an understanding of the data.

df.shape

(300153, 11)

In [107]:
#SLicing: using a max of 20000 samples also to save computing power, RAM.

data = df.iloc[:20000]

In [108]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           20000 non-null  object 
 1   flight            20000 non-null  object 
 2   source_city       20000 non-null  object 
 3   departure_time    20000 non-null  object 
 4   stops             20000 non-null  object 
 5   arrival_time      20000 non-null  object 
 6   destination_city  20000 non-null  object 
 7   class             20000 non-null  object 
 8   duration          20000 non-null  float64
 9   days_left         20000 non-null  int64  
 10  price             20000 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 1.8+ MB


In [109]:
data.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [110]:
# describing the data
data.describe()

Unnamed: 0,duration,days_left,price
count,20000.0,20000.0,20000.0
mean,9.744345,25.84995,6125.02795
std,6.711598,13.553019,3647.292391
min,2.0,1.0,2281.0
25%,4.92,14.0,3855.0
50%,8.17,26.0,4896.0
75%,12.58,38.0,7424.0
max,36.92,49.0,31917.0


In [111]:
# checking for missing data
data.isnull().sum()

airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [112]:
# I want to work with the price of the flights and therefore i put y = price from the dataset
y = data['price']

In [113]:
# selecting the other columns as x.

selected_columns = ['airline', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class', 'duration', 'days_left']
x = data[selected_columns]

In [114]:
y

0        5953
1        5953
2        5956
3        5955
4        5955
         ... 
19995    4496
19996    4496
19997    4496
19998    4496
19999    4496
Name: price, Length: 20000, dtype: int64

In [115]:
x

Unnamed: 0,airline,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,SpiceJet,Evening,zero,Night,Mumbai,Economy,2.17,1
1,SpiceJet,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1
2,AirAsia,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1
3,Vistara,Morning,zero,Afternoon,Mumbai,Economy,2.25,1
4,Vistara,Morning,zero,Morning,Mumbai,Economy,2.33,1
...,...,...,...,...,...,...,...,...
19995,Vistara,Morning,one,Evening,Bangalore,Economy,6.08,49
19996,Vistara,Afternoon,one,Evening,Bangalore,Economy,6.42,49
19997,Vistara,Afternoon,one,Night,Bangalore,Economy,6.58,49
19998,Vistara,Afternoon,one,Night,Bangalore,Economy,7.33,49


In [116]:
# columns i want to encdode when working with x
columns_to_encode = ['airline', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

In [117]:
# now im extracting them to make the OHE
data_to_encode = x[columns_to_encode]

In [118]:
encoder = OneHotEncoder(drop='first', sparse=False)

In [119]:
# transforming
encoded_data = encoder.fit_transform(data_to_encode)



In [120]:
# New df created
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(columns_to_encode))

In [121]:
x_encoded = pd.concat([x, encoded_df], axis=1)

In [122]:
x_encoded = x_encoded.drop(columns=columns_to_encode)

print(x_encoded)

       duration  days_left  airline_Air_India  airline_GO_FIRST  \
0          2.17          1                0.0               0.0   
1          2.33          1                0.0               0.0   
2          2.17          1                0.0               0.0   
3          2.25          1                0.0               0.0   
4          2.33          1                0.0               0.0   
...         ...        ...                ...               ...   
19995      6.08         49                0.0               0.0   
19996      6.42         49                0.0               0.0   
19997      6.58         49                0.0               0.0   
19998      7.33         49                0.0               0.0   
19999      7.42         49                0.0               0.0   

       airline_Indigo  airline_SpiceJet  airline_Vistara  \
0                 0.0               1.0              0.0   
1                 0.0               1.0              0.0   
2              

In [123]:
# as mnetioned earlier our target is the price of the flight, which was why i put y = price
X = x_encoded

#splittting the data for test and tgraining (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

 # Display the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (16000, 20)
X_test shape: (4000, 20)
y_train shape: (16000,)
y_test shape: (4000,)


In [124]:
 # scaling the data for x
# standardscaler (sc)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# changing y into arrays for numpy, both test and train
y_train_array = y_train.to_numpy()
y_test_array = y_test.to_numpy()

# Initialize StandardScaler for y
sc_y = StandardScaler()

# Fit and transform y_train
y_train = sc_y.fit_transform(y_train_array.reshape(-1, 1))

# Transform y_test
y_test = sc_y.transform(y_test_array.reshape(-1, 1))

In [125]:
# converting the x and y into pytorch tensor with torch
tensor_data_X = torch.tensor(X_train, dtype=torch.float32)

In [126]:
# Doing the same for y now
tensor_data_Y = torch.tensor(y_train, dtype=torch.float32)

In [127]:
# Reshaping the tensor data
tensor_data_X.reshape(-1, 1).shape

torch.Size([320000, 1])

In [128]:
# takes around 3-4 minutes to run - explanation below

# Initializing Hyperparameters
#epochs = 5
#learning_rate = 2

# Initializing Parameters
#w = 50

#loss_set = {}

# 1. Creating a FeedForwardNetwork
# 1.1 Structure (Architecture) of NN
#model_net1 = torch.nn.Sequential(torch.nn.Linear(20,50),
                                 #torch.nn.ReLU(),
                                 #torch.nn.Dropout(0.2),

                                 #torch.nn.Linear(50,1),
                                 #torch.nn.ReLU(),

                                 #);

# 1.2 Loss Function
#loss_mse = torch.nn.MSELoss()

# 1.3 Optmization Approch
#optimizer = torch.optim.SGD(model_net1.parameters(), lr=learning_rate)

#w_his = []
#w_his.append(w)
# Loop over the number of epochs
#for epoch in tqdm_notebook(range(epochs), desc="Epochs"):
    #epoch_loss = 0.0

    # Loop over each sample in the dataset
    #for i in range(tensor_data_X.size(0)):

      # 2. Forward Pass
      #output = model_net1.forward(tensor_data_X[i].reshape(-1))

      # 3. FeedForward Evaluation
      #loss = loss_mse(output, tensor_data_Y[i].reshape(-1))
      #optimizer.zero_grad();

      # 4. Backward Pass / Gradient Calculation
      #loss.backward()

      # Store the loss for each epoch
      #epoch_loss += loss.item()

      # 5. Back Propagation / Update Weights
      #optimizer.step()

      # Store the weight value for each sample of data
      #w_his.append(float(model_net1[0].weight.data[0][0]))

      # Display the loss for the current sample
      #print(f"Epoch {epoch+1}, Sample {i+1}: Loss: {loss.item():.4f}")


    # Calculate and display average loss for the epoch
    #epoch_loss /= tensor_data_X.size(0)

    # Store the loss for each sample of data
    #loss_set[epoch] = epoch_loss
    #print(f"\nEpoch {epoch+1} Average Loss: {epoch_loss:.4f}\n{'-'*50}\n")

In [129]:
# For the first trial i put 5 epochs with a learningrate of 2, i will stay at 3 epochs for now to save time on running it.
#for the parameters i put the inputsize at 20 and neruons at 50 throughtout

In [130]:
#2. iteration with diffrent hyperparameters and w = 20 and 20 neurons again with relu as activation
epochs = 3
learning_rate = .01


# Initializing Parameters
w = 20

loss_set = {}

# 1. Creating a FeedForwardNetwork
# 1.1 Structure (Architecture) of NN
model_net2 = torch.nn.Sequential(torch.nn.Linear(20,20),
                                 torch.nn.ReLU(),
                                 torch.nn.Dropout(0.33),

                                 torch.nn.Linear(20,1),
                                 torch.nn.ReLU(),

                                 );

# 1.2 Loss Function
loss_mse = torch.nn.MSELoss()

# 1.3 Optmization Approch
optimizer = torch.optim.SGD(model_net2.parameters(), lr=learning_rate)

w_his = []
w_his.append(w)
# Loop over the number of epochs
for epoch in tqdm_notebook(range(epochs), desc="Epochs"):
    epoch_loss = 0.0

    # Loop over each sample in the dataset
    for i in range(tensor_data_X.size(0)):

      # 2. Forward Pass
      output = model_net2.forward(tensor_data_X[i].reshape(-1))

      # 3. FeedForward Evaluation
      loss = loss_mse(output, tensor_data_Y[i].reshape(-1))
      optimizer.zero_grad();

      # 4. Backward Pass / Gradient Calculation
      loss.backward()

      # Store the loss for each epoch
      epoch_loss += loss.item()

      # 5. Back Propagation / Update Weights
      optimizer.step()

      # Store the weight value for each sample of data
      w_his.append(float(model_net2[0].weight.data[0][0]))



    # Calculate and display average loss for the epoch
    epoch_loss /= tensor_data_X.size(0)

    # Store the loss for each sample of data
    loss_set[epoch] = epoch_loss
    print(f"\nEpoch {epoch+1} Average Loss: {epoch_loss:.4f}\n{'-'*50}\n")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(epochs), desc="Epochs"):


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]


Epoch 1 Average Loss: 0.6511
--------------------------------------------------


Epoch 2 Average Loss: 0.5876
--------------------------------------------------


Epoch 3 Average Loss: 0.5757
--------------------------------------------------



In [141]:
#hyperparameters with a lower learning rate again with the input set at 50 and 50 neurons
epochs = 3
learning_rate = .008


# Initializing Parameters
w = 50

loss_set = {}

# 1. Creating a FeedForwardNetwork
# 1.1 Structure (Architecture) of NN
model_net3 = torch.nn.Sequential(torch.nn.Linear(20,50),
                                 torch.nn.ReLU(),
                                 torch.nn.Dropout(0.33),

                                 torch.nn.Linear(50,1),
                                 torch.nn.ReLU(),

                                 );

# 1.2 Loss Function
loss_mse = torch.nn.MSELoss()

# 1.3 Optmization Approch
optimizer = torch.optim.SGD(model_net3.parameters(), lr=learning_rate)

w_his = []
w_his.append(w)
# Loop over the number of epochs
for epoch in tqdm_notebook(range(epochs), desc="Epochs"):
    epoch_loss = 0.0

    # Loop over each sample in the dataset
    for i in range(tensor_data_X.size(0)):

      # 2. Forward Pass
      output = model_net3.forward(tensor_data_X[i].reshape(-1))

      # 3. FeedForward Evaluation
      loss = loss_mse(output, tensor_data_Y[i].reshape(-1))
      optimizer.zero_grad();

      # 4. Backward Pass / Gradient Calculation
      loss.backward()

      # Store the loss for each epoch
      epoch_loss += loss.item()

      # 5. Back Propagation / Update Weights
      optimizer.step()

      # Store the weight value for each sample of data
      w_his.append(float(model_net3[0].weight.data[0][0]))


    # Calculate and display average loss for the epoch
    epoch_loss /= tensor_data_X.size(0)

    # Store the loss for each sample of data
    loss_set[epoch] = epoch_loss
    print(f"\nEpoch {epoch+1} Average Loss: {epoch_loss:.4f}\n{'-'*50}\n")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(epochs), desc="Epochs"):


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]


Epoch 1 Average Loss: 0.6286
--------------------------------------------------


Epoch 2 Average Loss: 0.5665
--------------------------------------------------


Epoch 3 Average Loss: 0.5519
--------------------------------------------------



In [142]:
#For 4th trial: epochs = 3 learning_rate = .01 input_size = 20 Neurons = 30 activation_input = 30
epochs = 3
learning_rate = .01


# Initializing Parameters
w = 50

loss_set = {}

# 1. Creating a FeedForwardNetwork
# 1.1 Structure (Architecture) of NN
model_net4 = torch.nn.Sequential(torch.nn.Linear(20,30),
                                 torch.nn.ReLU(),
                                 torch.nn.Dropout(0.33),

                                 torch.nn.Linear(30,1),
                                 torch.nn.ReLU(),

                                 );

# 1.2 Loss Function
loss_mse = torch.nn.MSELoss()

# 1.3 Optmization Approch
optimizer = torch.optim.SGD(model_net4.parameters(), lr=learning_rate)

w_his = []
w_his.append(w)
# Loop over the number of epochs
for epoch in tqdm_notebook(range(epochs), desc="Epochs"):
    epoch_loss = 0.0

    # Loop over each sample in the dataset
    for i in range(tensor_data_X.size(0)):

      # 2. Forward Pass
      output = model_net4.forward(tensor_data_X[i].reshape(-1))

      # 3. FeedForward Evaluation
      loss = loss_mse(output, tensor_data_Y[i].reshape(-1))
      optimizer.zero_grad();

      # 4. Backward Pass / Gradient Calculation
      loss.backward()

      # Store the loss for each epoch
      epoch_loss += loss.item()

      # 5. Back Propagation / Update Weights
      optimizer.step()

      # Store the weight value for each sample of data
      w_his.append(float(model_net4[0].weight.data[0][0]))


    # Calculate and display average loss for the epoch
    epoch_loss /= tensor_data_X.size(0)

    # Store the loss for each sample of data
    loss_set[epoch] = epoch_loss
    print(f"\nEpoch {epoch+1} Average Loss: {epoch_loss:.4f}\n{'-'*50}\n")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(epochs), desc="Epochs"):


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]


Epoch 1 Average Loss: 0.6375
--------------------------------------------------


Epoch 2 Average Loss: 0.5743
--------------------------------------------------


Epoch 3 Average Loss: 0.5589
--------------------------------------------------



In [133]:
#For 5th trial: epochs = 3 learning_rate = .01 input_size = 20 Neurons = 80 activation_input = 80
epochs = 3
learning_rate = .01


# Initializing Parameters
w = 50

loss_set = {}

# 1. Creating a FeedForwardNetwork
# 1.1 Structure (Architecture) of NN
model_net5 = torch.nn.Sequential(torch.nn.Linear(20,80),
                                 torch.nn.ReLU(),
                                 torch.nn.Dropout(0.33),

                                 torch.nn.Linear(80,1),
                                 torch.nn.ReLU(),

                                 );

# 1.2 Loss Function
loss_mse = torch.nn.MSELoss()

# 1.3 Optmization Approch
optimizer = torch.optim.SGD(model_net5.parameters(), lr=learning_rate)

w_his = []
w_his.append(w)
# Loop over the number of epochs
for epoch in tqdm_notebook(range(epochs), desc="Epochs"):
    epoch_loss = 0.0

    # Loop over each sample in the dataset
    for i in range(tensor_data_X.size(0)):

      # 2. Forward Pass
      output = model_net5.forward(tensor_data_X[i].reshape(-1))

      # 3. FeedForward Evaluation
      loss = loss_mse(output, tensor_data_Y[i].reshape(-1))
      optimizer.zero_grad();

      # 4. Backward Pass / Gradient Calculation
      loss.backward()

      # Store the loss for each epoch
      epoch_loss += loss.item()

      # 5. Back Propagation / Update Weights
      optimizer.step()

      # Store the weight value for each sample of data
      w_his.append(float(model_net5[0].weight.data[0][0]))



    # Calculate and display average loss for the epoch
    epoch_loss /= tensor_data_X.size(0)

    # Store the loss for each sample of data
    loss_set[epoch] = epoch_loss
    print(f"\nEpoch {epoch+1} Average Loss: {epoch_loss:.4f}\n{'-'*50}\n")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm_notebook(range(epochs), desc="Epochs"):


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]


Epoch 1 Average Loss: 0.6400
--------------------------------------------------


Epoch 2 Average Loss: 0.5634
--------------------------------------------------


Epoch 3 Average Loss: 0.5513
--------------------------------------------------



In [134]:
# AS the average loss was the lowest at 0.55 at the last trial/ iteration, i will move onwards from there

In [135]:
torch.save(model_net5, 'model_net5.pkl')

In [136]:
# loading the model
model_net5_trained = torch.load('model_net5.pkl')
model_net5_trained.eval()

Sequential(
  (0): Linear(in_features=20, out_features=80, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.33, inplace=False)
  (3): Linear(in_features=80, out_features=1, bias=True)
  (4): ReLU()
)

In [137]:
#showcasing the parameters
for name, param in model_net5_trained.named_parameters():
    print(f"Layer: {name}")
    print(f"Size: {param.size()}")
    print(f"Values: \n{param.data}\n")

Layer: 0.weight
Size: torch.Size([80, 20])
Values: 
tensor([[-0.1597, -0.3625,  0.1145,  ..., -0.0285, -0.1116,  0.2116],
        [ 0.0651, -0.3949,  0.1986,  ..., -0.1453,  0.1793, -0.0168],
        [ 0.0742, -0.1362,  0.0562,  ...,  0.1142,  0.0382,  0.0614],
        ...,
        [-0.3115, -0.6411,  0.1667,  ...,  0.0582, -0.0075,  0.0781],
        [-0.0818,  0.4785,  0.0760,  ..., -0.3339, -0.1194,  0.1884],
        [ 0.1139,  1.5297,  0.1090,  ..., -0.0266, -0.2224,  0.0612]])

Layer: 0.bias
Size: torch.Size([80])
Values: 
tensor([-7.0685e-01, -7.1429e-01, -4.2537e-01,  6.7942e-01, -5.7999e-01,
         1.7832e-01, -1.4638e-02, -1.1420e-01, -3.5644e-02,  1.2921e-01,
        -3.7303e-02, -6.2118e-03, -2.6404e-01, -2.3675e-01,  2.4405e-01,
         8.6814e-02,  3.2858e-01,  8.4890e-02, -7.5581e-01, -1.4255e-04,
         4.3927e-02, -2.0691e-01,  2.2005e-01,  3.1292e-02, -5.4177e-01,
        -1.1396e-01, -1.3670e+00,  3.2803e-02,  2.6044e-02,  5.8770e-02,
         1.7904e-01, -1.4847e

In [138]:
# Convert X_test to PyTorch tensor
tensor_test_X = torch.tensor(X_test, dtype=torch.float32)

# Convert y_test to PyTorch tensor
tensor_test_Y = torch.tensor(y_test, dtype=torch.float32)

In [139]:
with torch.no_grad():
    model_net5.eval()  # evaluation
    predictions = model_net5(tensor_test_X)

# Print the predictions
print(predictions)

tensor([[1.8074],
        [1.2761],
        [0.0000],
        ...,
        [1.4700],
        [0.0000],
        [0.0000]])


In [140]:
#lastly i will find the average loss for the test

test_loss = 0.0

# Loop over each sample in the test dataset
for i in range(tensor_test_X.size(0)):

    # 1. Forward Pass
    output = model_net5.forward(tensor_test_X[i].reshape(-1))

    # 2. Calculate Loss
    loss = loss_mse(output, tensor_test_Y[i].reshape(-1))

    # Aggregate test loss
    test_loss += loss.item()



# Calculate and display average test loss
test_loss /= tensor_test_X.size(0)
print(f"\nAverage Test Loss: {test_loss:.4f}")



Average Test Loss: 0.5664


Overall conclusion:

I could've used a heatmap at the beginning to showcase with diffrent parts of the data that could've had the most correlation with the price. This maybe would've helped or improved the results, showcsing whether the results are better with more variables or fewer variables with more correlation.