<hr style="border:2px solid gray">

# **STEP: 1/5** - Load and explore the dataset.

### Import libraries and define the data path

In [53]:
# import the necessary libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [54]:
# Update the DATA_PATH variable

import sys

# Choose where the dataset is read from, based on the runtime environment.
if 'google.colab' in sys.modules:
  # Colab is detected by the presence of the google.colab module;
  # in that case the data is read from the hosted copy of the course repo.
  DATA_PATH = 'https://raw.githubusercontent.com/bloominstituteoftechnology/ds_code_along_unit_2/main/data/flight/'
else:
  # If you're working locally:
  # (was '..../data/', which is not a parent-directory reference)
  # NOTE(review): assumes the data folder sits one level above the working
  # directory - adjust if your checkout is laid out differently.
  DATA_PATH = '../data/'

### Load the training dataset

In [55]:
# load your training set (we will be working with only this one dataset for this notebook)
# The file name is appended by plain string concatenation, so DATA_PATH must end with '/'.

df = pd.read_excel(DATA_PATH + 'Data_Train.xlsx')

### Explore the dataset

In [56]:
# print the first 5 rows

df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [57]:
# Determine the shape
df.shape

(10683, 11)

In [58]:
# explore the datatypes of all the columns

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [59]:
# how many null values are there in the dataset

df.isnull().sum()

Unnamed: 0,0
Airline,0
Date_of_Journey,0
Source,0
Destination,0
Route,1
Dep_Time,0
Arrival_Time,0
Duration,0
Total_Stops,1
Additional_Info,0


In [60]:
# Fetching those row(s) where there is/are missing values
df[df['Route'].isna() | df['Total_Stops'].isna()]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


In [61]:
# Drop row(s) containing missing values.
# This mutates `df` in place, and the index is not renumbered afterwards.
df.dropna(inplace=True)

In [62]:
df.isnull().sum()

Unnamed: 0,0
Airline,0
Date_of_Journey,0
Source,0
Destination,0
Route,0
Dep_Time,0
Arrival_Time,0
Duration,0
Total_Stops,0
Additional_Info,0


<hr style="border:2px solid gray">

# **STEP: 2/5** - Clean the two selected features


### Clean the 'Duration' column

In [63]:
# Duration is in a string format. It must be converted into a numerical format.

df['Duration']

Unnamed: 0,Duration
0,2h 50m
1,7h 25m
2,19h
3,5h 25m
4,4h 45m
...,...
10678,2h 30m
10679,2h 35m
10680,3h
10681,2h 40m


In [64]:
# converting duration into minutes

def convert_duration(duration):
  """Convert a duration string into a total number of minutes.

  Accepts space-separated components, each an integer followed by a unit
  suffix: 'h' for hours or 'm' for minutes, e.g. '2h 50m', '19h' or '5m'.

  Parameters
  ----------
  duration : str
      The textual duration to parse.

  Returns
  -------
  int
      The duration expressed in minutes.

  Raises
  ------
  ValueError
      If the string is empty, a component has an unknown unit suffix, or
      the numeric part of a component is not an integer.
  """
  parts = duration.split()
  if not parts:
    raise ValueError('Empty duration string')

  total_minutes = 0
  for part in parts:
    # The last character is the unit; everything before it is the amount.
    amount, unit = int(part[:-1]), part[-1]
    if unit == 'h':
      total_minutes += amount * 60
    elif unit == 'm':
      # A lone minutes component (e.g. '5m') must not be treated as hours.
      total_minutes += amount
    else:
      raise ValueError(f'Unrecognised duration component: {part!r}')
  return total_minutes

# Overwrite the column with the converted minute counts. This cell is not
# re-runnable: convert_duration calls .split() on each value, which the
# converted integers do not support.
df['Duration'] = df['Duration'].apply(convert_duration)

In [65]:
'''
In case you're wondering how a split function works, this is an illustration.
split() will split the string based on a delimiter.
By default, the space character is the delimiter if none is provided.'''

x = '2h 50m'
x.split()

['2h', '50m']

In [66]:
x.split()[1][:-1]

'50'

### Clean the 'Total_Stops' column.

In [67]:
# Total_Stops is in string format. It must be converted into a numerical format.
df['Total_Stops'].value_counts()

Unnamed: 0_level_0,count
Total_Stops,Unnamed: 1_level_1
1 stop,5625
non-stop,3491
2 stops,1520
3 stops,45
4 stops,1


In [68]:
# Turn the textual stop counts into integers in one chained expression:
#   1. keep only the first space-separated token ('2 stops' -> '2');
#   2. map the single non-numeric label, 'non-stop', to 0;
#   3. cast the whole column to int.

df['Total_Stops'] = (
    df['Total_Stops']
    .str.split(" ")
    .str[0]
    .replace('non-stop', 0)
    .astype(int)
)

In [69]:
df['Total_Stops'].value_counts()

Unnamed: 0_level_0,count
Total_Stops,Unnamed: 1_level_1
1,5625
0,3491
2,1520
3,45
4,1


<hr style="border:2px solid gray">

# **STEP: 3/5** - Split the Data & Determine the Baseline

In [71]:
# Split the data into Feature Matrix and Target Vector
# X selects the two cleaned columns with a list (giving a DataFrame);
# y selects the single Price column (giving a Series).

X = df[['Total_Stops', 'Duration']]
y = df['Price']

In [72]:
X.head() # 2D, DataFrame

Unnamed: 0,Total_Stops,Duration
0,0,170
1,2,445
2,2,1140
3,1,325
4,1,285


In [73]:
y.head() # 1D, Vector

Unnamed: 0,Price
0,3897
1,7662
2,13882
3,6218
4,13302


In [74]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [75]:
# Sanity-check the split: print the shape of each of the four pieces,
# with a row of asterisks separating the training and testing lines.
print("The size of training input is", X_train.shape)
print("The size of training output is", y_train.shape)
print(50 *'*')
print("The size of testing input is", X_test.shape)
print("The size of testing output is", y_test.shape)

The size of training input is (8545, 2)
The size of training output is (8545,)
**************************************************
The size of testing input is (2137, 2)
The size of testing output is (2137,)


In [76]:
# Baseline MAE: the error made by predicting the mean training price for every row.

mean_train_price = y_train.mean()
y_pred_baseline = [mean_train_price] * len(y_train)
print('BASELINE MAE', mean_absolute_error(y_train, y_pred_baseline))

BASELINE MAE 3649.7532864424425


<hr style="border:2px solid gray">

# **STEP: 4/5** - Build & Evaluate the model

In [77]:
# Instantiate the predictor
model = LinearRegression()

In [78]:
# Fitting the model on training data
model.fit(X_train, y_train)

In [79]:
# Mean absolute error of the fitted model on the training rows and on the held-out rows.

train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
print('Linear Regression TRAIN MAE', mean_absolute_error(y_train, train_predictions))
print('Linear Regression TEST MAE', mean_absolute_error(y_test, test_predictions))

Linear Regression TRAIN MAE 2516.653892454442
Linear Regression TEST MAE 2546.1121781801967


In [81]:
# Root mean squared error: the square root (** 0.5) of the mean squared error,
# reported for the training rows and for the held-out rows.

train_mse = mean_squared_error(y_train, model.predict(X_train))
test_mse = mean_squared_error(y_test, model.predict(X_test))
print('Linear Regression TRAIN RMSE', train_mse**0.5)
print('Linear Regression TEST RMSE', test_mse**0.5)

Linear Regression TRAIN RMSE 3635.0385724073353
Linear Regression TEST RMSE 3718.9564145578356


In [82]:
# R^2 (coefficient of determination) of the fitted model on the training rows and on the held-out rows.

train_r2 = r2_score(y_train, model.predict(X_train))
test_r2 = r2_score(y_test, model.predict(X_test))
print('Linear Regression TRAIN R2', train_r2)
print('Linear Regression TEST R2', test_r2)

Linear Regression TRAIN R2 0.37641421683041154
Linear Regression TEST R2 0.3585659639575898


<hr style="border:2px solid gray">

# **STEP: 5/5** - Determine the Linear Equation used by the Model

In [83]:
# Read the fitted parameters off the model: the slope coefficients and the intercept.
coef, intercept = model.coef_, model.intercept_
print(coef, intercept) # coefs are labelled in the order Total_Stops, Duration

[3.46752107e+03 1.21528482e+00] 5436.110336564698


In [84]:
# final equation
# coef[0] is paired with Total_Stops and coef[1] with Duration, matching the
# column order used when X was built.

print(f'Price={intercept} + {coef[0]}* Total_Stops + {coef[1]}* Duration')

Price=5436.110336564698 + 3467.5210668253176* Total_Stops + 1.2152848161280996* Duration
