#Flight Price Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_dataset = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Interview Topics/flight_price_prediction/Data/Data_Train.xlsx")
test_dataset = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Interview Topics/flight_price_prediction/Data/Test_set.xlsx")

In [3]:
test_dataset.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [4]:
train_dataset.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [5]:
# Combine Test and train dataset, although test has no price column
master_df = train_dataset.append(test_dataset, sort=False)
master_df.tail()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
2666,Air India,6/06/2019,Kolkata,Banglore,CCU → DEL → BLR,20:30,20:25 07 Jun,23h 55m,1 stop,No info,
2667,IndiGo,27/03/2019,Kolkata,Banglore,CCU → BLR,14:20,16:55,2h 35m,non-stop,No info,
2668,Jet Airways,6/03/2019,Delhi,Cochin,DEL → BOM → COK,21:50,04:25 07 Mar,6h 35m,1 stop,No info,
2669,Air India,6/03/2019,Delhi,Cochin,DEL → BOM → COK,04:00,19:15,15h 15m,1 stop,No info,
2670,Multiple carriers,15/06/2019,Delhi,Cochin,DEL → BOM → COK,04:55,19:15,14h 20m,1 stop,No info,


# Feature Engineering

In [6]:
master_df.dtypes

Airline             object
Date_of_Journey     object
Source              object
Destination         object
Route               object
Dep_Time            object
Arrival_Time        object
Duration            object
Total_Stops         object
Additional_Info     object
Price              float64
dtype: object

In [7]:
set(master_df["Airline"])

{'Air Asia',
 'Air India',
 'GoAir',
 'IndiGo',
 'Jet Airways',
 'Jet Airways Business',
 'Multiple carriers',
 'Multiple carriers Premium economy',
 'SpiceJet',
 'Trujet',
 'Vistara',
 'Vistara Premium economy'}

In [8]:
set(master_df["Source"])

{'Banglore', 'Chennai', 'Delhi', 'Kolkata', 'Mumbai'}

In [9]:
set(master_df["Destination"])

{'Banglore', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata', 'New Delhi'}

In [10]:
# Split Date of Journey in to Date, month, year
master_df['Date'] = master_df['Date_of_Journey'].str.split('/').str[0]
master_df['Month'] = master_df['Date_of_Journey'].str.split('/').str[1]
master_df['Year'] = master_df['Date_of_Journey'].str.split('/').str[2]

In [11]:
master_df['Date'] = master_df['Date'].astype(int)
master_df['Month'] = master_df['Month'].astype(int)
master_df['Year'] = master_df['Year'].astype(int)

# Dropping date of journey column
master_df = master_df.drop(['Date_of_Journey'], axis=1)
master_df.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0,1,3,2019


In [12]:
# Cleaning Arrival Time Column
master_df['Arrival_Time'] = master_df['Arrival_Time'].str.split(' ').str[0]
master_df.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10,2h 50m,non-stop,No info,3897.0,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19h,2 stops,No info,13882.0,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0,1,3,2019


In [13]:
# Total Stops
master_df[master_df['Total_Stops'].isnull()]

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
9039,Air India,Delhi,Cochin,,09:45,09:25,23h 40m,,No info,7480.0,6,5,2019


In [14]:
master_df['Total_Stops'] = master_df['Total_Stops'].fillna('1 stop')

In [15]:
master_df['Total_Stops'] = master_df['Total_Stops'].replace('non-stop', '0 stop')
master_df['Total_Stops'] = master_df['Total_Stops'].str.split(' ').str[0]
master_df['Total_Stops'] = master_df['Total_Stops'].astype(int)
master_df.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10,2h 50m,0,No info,3897.0,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662.0,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19h,2,No info,13882.0,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218.0,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302.0,1,3,2019


In [16]:
# splitting Departure and Arrival time into hours and minutes
master_df['Dep_Hour'] = master_df['Dep_Time'].str.split(':').str[0].astype(int)
master_df['Dep_Minute'] = master_df['Dep_Time'].str.split(':').str[1].astype(int)

master_df['Arr_Hour'] = master_df['Arrival_Time'].str.split(':').str[0].astype(int)
master_df['Arr_Minute'] = master_df['Arrival_Time'].str.split(':').str[1].astype(int)

master_df = master_df.drop(['Arrival_Time'], axis=1)
master_df = master_df.drop(['Dep_Time'], axis=1)
master_df.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Dep_Hour,Dep_Minute,Arr_Hour,Arr_Minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,0,No info,3897.0,24,3,2019,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2,No info,7662.0,1,5,2019,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2,No info,13882.0,9,6,2019,9,25,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1,No info,6218.0,12,5,2019,18,5,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1,No info,13302.0,1,3,2019,16,50,21,35


In [17]:
# Splitting Routes
# we have max five elements in routes
master_df['Route_1'] = master_df['Route'].str.split('→ ').str[0]
master_df['Route_2'] = master_df['Route'].str.split('→ ').str[1]
master_df['Route_3'] = master_df['Route'].str.split('→ ').str[2]
master_df['Route_4'] = master_df['Route'].str.split('→ ').str[3]
master_df['Route_5'] = master_df['Route'].str.split('→ ').str[4]

master_df = master_df.drop(['Route'], axis=1)

In [18]:
master_df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Dep_Hour,Dep_Minute,Arr_Hour,Arr_Minute,Route_1,Route_2,Route_3,Route_4,Route_5
0,IndiGo,Banglore,New Delhi,2h 50m,0,No info,3897.0,24,3,2019,22,20,1,10,BLR,DEL,,,
1,Air India,Kolkata,Banglore,7h 25m,2,No info,7662.0,1,5,2019,5,50,13,15,CCU,IXR,BBI,BLR,
2,Jet Airways,Delhi,Cochin,19h,2,No info,13882.0,9,6,2019,9,25,4,25,DEL,LKO,BOM,COK,
3,IndiGo,Kolkata,Banglore,5h 25m,1,No info,6218.0,12,5,2019,18,5,23,30,CCU,NAG,BLR,,
4,IndiGo,Banglore,New Delhi,4h 45m,1,No info,13302.0,1,3,2019,16,50,21,35,BLR,NAG,DEL,,


In [19]:
# Handling NaN values of price
master_df['Price'].isnull().sum()
# These NaN values are because of added Test dataset

2671

In [20]:
master_df['Price'].fillna((master_df['Price'].mean()), inplace=True)

In [21]:
master_df["Route_1"].fillna("None", inplace=True)
master_df["Route_2"].fillna("None", inplace=True)
master_df["Route_3"].fillna("None", inplace=True)
master_df["Route_4"].fillna("None", inplace=True)
master_df["Route_5"].fillna("None", inplace=True)

In [22]:
master_df.isnull().sum()

Airline            0
Source             0
Destination        0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
Date               0
Month              0
Year               0
Dep_Hour           0
Dep_Minute         0
Arr_Hour           0
Arr_Minute         0
Route_1            0
Route_2            0
Route_3            0
Route_4            0
Route_5            0
dtype: int64

# Label Encoding of Categorical Variables

In [23]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [24]:
set(master_df['Additional_Info'])

{'1 Long layover',
 '1 Short layover',
 '2 Long layover',
 'Business class',
 'Change airports',
 'In-flight meal not included',
 'No Info',
 'No check-in baggage included',
 'No info',
 'Red-eye flight'}

In [25]:
master_df['Airline'] = encoder.fit_transform(master_df['Airline'])
master_df['Source'] = encoder.fit_transform(master_df['Source'])
master_df['Destination'] = encoder.fit_transform(master_df['Destination'])
master_df['Additional_Info'] = encoder.fit_transform(master_df['Additional_Info'])
master_df['Route_1'] = encoder.fit_transform(master_df['Route_1'])
master_df['Route_2'] = encoder.fit_transform(master_df['Route_2'])
master_df['Route_3'] = encoder.fit_transform(master_df['Route_3'])
master_df['Route_4'] = encoder.fit_transform(master_df['Route_4'])
master_df['Route_5'] = encoder.fit_transform(master_df['Route_5'])

In [26]:
master_df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Dep_Hour,Dep_Minute,Arr_Hour,Arr_Minute,Route_1,Route_2,Route_3,Route_4,Route_5
0,3,0,5,2h 50m,0,8,3897.0,24,3,2019,22,20,1,10,0,13,24,12,4
1,1,3,0,7h 25m,2,8,7662.0,1,5,2019,5,50,13,15,2,25,1,3,4
2,4,2,1,19h,2,8,13882.0,9,6,2019,9,25,4,25,3,32,4,5,4
3,3,3,0,5h 25m,1,8,6218.0,12,5,2019,18,5,23,30,2,34,3,12,4
4,3,0,5,4h 45m,1,8,13302.0,1,3,2019,16,50,21,35,0,34,8,12,4


In [27]:
master_df = master_df.drop(['Duration'], axis=1)

# Feature Selection

Using Lasso Regression

In [28]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [29]:
train_df = master_df[0:10683]
test_df = master_df[10683:]

In [30]:
X = train_df.drop(['Price'], axis=1)
y = train_df.Price

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [32]:
model = SelectFromModel(Lasso(alpha=0.005, random_state=0))
model.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [33]:
model.get_support()

array([ True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [34]:
selected_features = X_train.columns[(model.get_support())]
selected_features

Index(['Airline', 'Source', 'Destination', 'Total_Stops', 'Additional_Info',
       'Date', 'Month', 'Dep_Hour', 'Dep_Minute', 'Arr_Hour', 'Arr_Minute',
       'Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5'],
      dtype='object')

**So Year Column isn't selected, we need to drop that**

In [35]:
X_train = X_train.drop(['Year'], axis=1)
X_test = X_test.drop(['Year'], axis=1)

# Random Forest Regressor

In [36]:
from sklearn.model_selection import RandomizedSearchCV

In [37]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [38]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [39]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [40]:
# Random search of parameters, using 3 fold cross validation, 
# search across 50 different combinations
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [41]:
# rf_random.fit(X_train,y_train)

In [42]:
# y_pred=rf_random.predict(X_test)
# sns.distplot(y_test-y_pred)

In [44]:
pip freeze > requirements.txt