# FLIGHT PRICE PREDICTION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, precision_score, recall_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
data = pd.read_excel("Data_Train.xlsx")
data.head(10)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
5,SpiceJet,24/06/2019,Kolkata,Banglore,CCU → BLR,09:00,11:25,2h 25m,non-stop,No info,3873
6,Jet Airways,12/03/2019,Banglore,New Delhi,BLR → BOM → DEL,18:55,10:25 13 Mar,15h 30m,1 stop,In-flight meal not included,11087
7,Jet Airways,01/03/2019,Banglore,New Delhi,BLR → BOM → DEL,08:00,05:05 02 Mar,21h 5m,1 stop,No info,22270
8,Jet Airways,12/03/2019,Banglore,New Delhi,BLR → BOM → DEL,08:55,10:25 13 Mar,25h 30m,1 stop,In-flight meal not included,11087
9,Multiple carriers,27/05/2019,Delhi,Cochin,DEL → BOM → COK,11:25,19:15,7h 50m,1 stop,No info,8625


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [4]:
data.describe()

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [5]:
data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [6]:
# we can see that there is single row nan values in both "Route" and "Total_Stops" columns
# since we droping them won't affect the model perfomance,let's drop them

In [7]:
data.dropna(inplace=True)
data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

In [8]:
# Every column, except the "Price" column, which is our target is of object data type. we need to convert
# them into categorical values

In [9]:
# Label Encoding

In [10]:
data["Airline"] = data["Airline"].astype("category")
data["Source"] = data["Source"].astype("category")
data["Destination"] = data["Destination"].astype("category")
data["Route"] = data["Route"].astype("category")
data["Total_Stops"] = data["Total_Stops"].astype("category")
data["Additional_Info"] = data["Additional_Info"].astype("category")

In [11]:
data["Airline"] = data["Airline"].cat.codes
data["Source"] = data["Source"].cat.codes
data["Destination"] = data["Destination"].cat.codes
data["Route"] = data["Route"].cat.codes
data["Total_Stops"] = data["Total_Stops"].cat.codes
data["Additional_Info"] = data["Additional_Info"].cat.codes

In [12]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,3,24/03/2019,0,5,18,22:20,01:10 22 Mar,2h 50m,4,8,3897
1,1,1/05/2019,3,0,84,05:50,13:15,7h 25m,1,8,7662
2,4,9/06/2019,2,1,118,09:25,04:25 10 Jun,19h,1,8,13882
3,3,12/05/2019,3,0,91,18:05,23:30,5h 25m,0,8,6218
4,3,01/03/2019,0,5,29,16:50,21:35,4h 45m,0,8,13302


In [13]:
# We need to change the date and time formats next.

In [14]:
data[["Day", "Month", "Year"]] = data["Date_of_Journey"].str.split("/", expand=True)

In [15]:
data.head(10)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year
0,3,24/03/2019,0,5,18,22:20,01:10 22 Mar,2h 50m,4,8,3897,24,3,2019
1,1,1/05/2019,3,0,84,05:50,13:15,7h 25m,1,8,7662,1,5,2019
2,4,9/06/2019,2,1,118,09:25,04:25 10 Jun,19h,1,8,13882,9,6,2019
3,3,12/05/2019,3,0,91,18:05,23:30,5h 25m,0,8,6218,12,5,2019
4,3,01/03/2019,0,5,29,16:50,21:35,4h 45m,0,8,13302,1,3,2019
5,8,24/06/2019,3,0,64,09:00,11:25,2h 25m,4,8,3873,24,6,2019
6,4,12/03/2019,0,5,5,18:55,10:25 13 Mar,15h 30m,0,5,11087,12,3,2019
7,4,01/03/2019,0,5,5,08:00,05:05 02 Mar,21h 5m,0,8,22270,1,3,2019
8,4,12/03/2019,0,5,5,08:55,10:25 13 Mar,25h 30m,0,5,11087,12,3,2019
9,6,27/05/2019,2,1,104,11:25,19:15,7h 50m,0,8,8625,27,5,2019


In [16]:
data.drop(["Date_of_Journey"], axis=1, inplace=True)

In [17]:
data["Hour"] = pd.to_datetime(data.Dep_Time).dt.hour

In [18]:
data["Minute"] = pd.to_datetime(data.Dep_Time).dt.minute

In [19]:
data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Hour,Minute
0,3,0,5,18,22:20,01:10 22 Mar,2h 50m,4,8,3897,24,3,2019,22,20
1,1,3,0,84,05:50,13:15,7h 25m,1,8,7662,1,5,2019,5,50
2,4,2,1,118,09:25,04:25 10 Jun,19h,1,8,13882,9,6,2019,9,25
3,3,3,0,91,18:05,23:30,5h 25m,0,8,6218,12,5,2019,18,5
4,3,0,5,29,16:50,21:35,4h 45m,0,8,13302,1,3,2019,16,50


In [20]:
# Lets convert the departure time int minutes

In [21]:
data['DURATION']=  data['Duration'].str.replace("h", '*60').str.replace(' ','+').str.replace('m','*1').apply(eval)

In [22]:
data.head(15)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Hour,Minute,DURATION
0,3,0,5,18,22:20,01:10 22 Mar,2h 50m,4,8,3897,24,3,2019,22,20,170
1,1,3,0,84,05:50,13:15,7h 25m,1,8,7662,1,5,2019,5,50,445
2,4,2,1,118,09:25,04:25 10 Jun,19h,1,8,13882,9,6,2019,9,25,1140
3,3,3,0,91,18:05,23:30,5h 25m,0,8,6218,12,5,2019,18,5,325
4,3,0,5,29,16:50,21:35,4h 45m,0,8,13302,1,3,2019,16,50,285
5,8,3,0,64,09:00,11:25,2h 25m,4,8,3873,24,6,2019,9,0,145
6,4,0,5,5,18:55,10:25 13 Mar,15h 30m,0,5,11087,12,3,2019,18,55,930
7,4,0,5,5,08:00,05:05 02 Mar,21h 5m,0,8,22270,1,3,2019,8,0,1265
8,4,0,5,5,08:55,10:25 13 Mar,25h 30m,0,5,11087,12,3,2019,8,55,1530
9,6,2,1,104,11:25,19:15,7h 50m,0,8,8625,27,5,2019,11,25,470


In [23]:
data.drop("Duration", axis=1, inplace=True)

In [24]:
data.drop("Dep_Time", axis=1, inplace=True)

In [25]:
data.head(10)

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Total_Stops,Additional_Info,Price,Day,Month,Year,Hour,Minute,DURATION
0,3,0,5,18,01:10 22 Mar,4,8,3897,24,3,2019,22,20,170
1,1,3,0,84,13:15,1,8,7662,1,5,2019,5,50,445
2,4,2,1,118,04:25 10 Jun,1,8,13882,9,6,2019,9,25,1140
3,3,3,0,91,23:30,0,8,6218,12,5,2019,18,5,325
4,3,0,5,29,21:35,0,8,13302,1,3,2019,16,50,285
5,8,3,0,64,11:25,4,8,3873,24,6,2019,9,0,145
6,4,0,5,5,10:25 13 Mar,0,5,11087,12,3,2019,18,55,930
7,4,0,5,5,05:05 02 Mar,0,8,22270,1,3,2019,8,0,1265
8,4,0,5,5,10:25 13 Mar,0,5,11087,12,3,2019,8,55,1530
9,6,2,1,104,19:15,0,8,8625,27,5,2019,11,25,470


In [26]:
data["Arrival_Hour"] = pd.to_datetime(data.Arrival_Time).dt.hour

In [27]:
data["Arrival_Minute"] = pd.to_datetime(data.Arrival_Time).dt.minute

In [28]:
data.head(10)

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Total_Stops,Additional_Info,Price,Day,Month,Year,Hour,Minute,DURATION,Arrival_Hour,Arrival_Minute
0,3,0,5,18,01:10 22 Mar,4,8,3897,24,3,2019,22,20,170,1,10
1,1,3,0,84,13:15,1,8,7662,1,5,2019,5,50,445,13,15
2,4,2,1,118,04:25 10 Jun,1,8,13882,9,6,2019,9,25,1140,4,25
3,3,3,0,91,23:30,0,8,6218,12,5,2019,18,5,325,23,30
4,3,0,5,29,21:35,0,8,13302,1,3,2019,16,50,285,21,35
5,8,3,0,64,11:25,4,8,3873,24,6,2019,9,0,145,11,25
6,4,0,5,5,10:25 13 Mar,0,5,11087,12,3,2019,18,55,930,10,25
7,4,0,5,5,05:05 02 Mar,0,8,22270,1,3,2019,8,0,1265,5,5
8,4,0,5,5,10:25 13 Mar,0,5,11087,12,3,2019,8,55,1530,10,25
9,6,2,1,104,19:15,0,8,8625,27,5,2019,11,25,470,19,15


In [29]:
data.drop("Arrival_Time", axis=1, inplace=True)

In [30]:
data.head(10)

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Day,Month,Year,Hour,Minute,DURATION,Arrival_Hour,Arrival_Minute
0,3,0,5,18,4,8,3897,24,3,2019,22,20,170,1,10
1,1,3,0,84,1,8,7662,1,5,2019,5,50,445,13,15
2,4,2,1,118,1,8,13882,9,6,2019,9,25,1140,4,25
3,3,3,0,91,0,8,6218,12,5,2019,18,5,325,23,30
4,3,0,5,29,0,8,13302,1,3,2019,16,50,285,21,35
5,8,3,0,64,4,8,3873,24,6,2019,9,0,145,11,25
6,4,0,5,5,0,5,11087,12,3,2019,18,55,930,10,25
7,4,0,5,5,0,8,22270,1,3,2019,8,0,1265,5,5
8,4,0,5,5,0,5,11087,12,3,2019,8,55,1530,10,25
9,6,2,1,104,0,8,8625,27,5,2019,11,25,470,19,15


In [31]:
data.drop_duplicates(inplace=True)

In [32]:
data.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Day,Month,Year,Hour,Minute,DURATION,Arrival_Hour,Arrival_Minute
0,3,0,5,18,4,8,3897,24,3,2019,22,20,170,1,10
1,1,3,0,84,1,8,7662,1,5,2019,5,50,445,13,15
2,4,2,1,118,1,8,13882,9,6,2019,9,25,1140,4,25
3,3,3,0,91,0,8,6218,12,5,2019,18,5,325,23,30
4,3,0,5,29,0,8,13302,1,3,2019,16,50,285,21,35


In [33]:
data.isnull().sum()

Airline            0
Source             0
Destination        0
Route              0
Total_Stops        0
Additional_Info    0
Price              0
Day                0
Month              0
Year               0
Hour               0
Minute             0
DURATION           0
Arrival_Hour       0
Arrival_Minute     0
dtype: int64

In [34]:
data.shape

(10460, 15)

### We have done the data preparation. Lets move on to the model building part

In [35]:
x = data.loc[:, ["Airline","Source", "Destination", "Route", "Total_Stops", "Additional_Info","Day", "Month", "Year","Hour","Minute","DURATION","Arrival_Hour","Arrival_Minute"]]

In [36]:
x

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Day,Month,Year,Hour,Minute,DURATION,Arrival_Hour,Arrival_Minute
0,3,0,5,18,4,8,24,03,2019,22,20,170,1,10
1,1,3,0,84,1,8,1,05,2019,5,50,445,13,15
2,4,2,1,118,1,8,9,06,2019,9,25,1140,4,25
3,3,3,0,91,0,8,12,05,2019,18,5,325,23,30
4,3,0,5,29,0,8,01,03,2019,16,50,285,21,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,0,3,0,64,4,8,9,04,2019,19,55,150,22,25
10679,1,3,0,64,4,8,27,04,2019,20,45,155,23,20
10680,4,0,2,18,4,8,27,04,2019,8,20,180,11,20
10681,10,0,5,18,4,8,01,03,2019,11,30,160,14,10


In [37]:
sc = StandardScaler()
x = sc.fit_transform(x)

In [38]:
y = data.loc[:, "Price"]

In [39]:
y

0         3897
1         7662
2        13882
3         6218
4        13302
         ...  
10678     4107
10679     4145
10680     7229
10681    12648
10682    11753
Name: Price, Length: 10460, dtype: int64

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=0)

## Linear Regression

In [41]:
lr = LinearRegression()

In [42]:
lr.fit(x_train, y_train)

LinearRegression()

In [43]:
lr_predict = lr.predict(x_test)

In [44]:
lr.score(x_test, y_test)

0.40428085653144685

In [45]:
r2_score(y_test, lr_predict)

0.40428085653144685

## Random Forest Regresor

In [46]:
from sklearn.ensemble import RandomForestRegressor

In [47]:
rfr = RandomForestRegressor(n_estimators=1000, random_state=0)

In [48]:
rfr.fit(x_train, y_train)

RandomForestRegressor(n_estimators=1000, random_state=0)

In [49]:
rfr_predict = rfr.predict(x_test)

In [50]:
rfr.score(x_test, y_test)

0.8824298002227488

# Working on Test_set

In [51]:
# We have to perform similar actions for this dataframe too, in order to extract the features:

In [52]:
test_data = pd.read_excel("Test_set.xlsx")

In [53]:
test_data.head(10)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info
5,Jet Airways,12/06/2019,Delhi,Cochin,DEL → BOM → COK,18:15,12:35 13 Jun,18h 20m,1 stop,In-flight meal not included
6,Air India,12/03/2019,Banglore,New Delhi,BLR → TRV → DEL,07:30,22:35,15h 5m,1 stop,No info
7,IndiGo,1/05/2019,Kolkata,Banglore,CCU → HYD → BLR,15:15,20:30,5h 15m,1 stop,No info
8,IndiGo,15/03/2019,Kolkata,Banglore,CCU → BLR,10:10,12:55,2h 45m,non-stop,No info
9,Jet Airways,18/05/2019,Kolkata,Banglore,CCU → BOM → BLR,16:30,22:35,6h 5m,1 stop,No info


In [54]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2671 entries, 0 to 2670
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          2671 non-null   object
 1   Date_of_Journey  2671 non-null   object
 2   Source           2671 non-null   object
 3   Destination      2671 non-null   object
 4   Route            2671 non-null   object
 5   Dep_Time         2671 non-null   object
 6   Arrival_Time     2671 non-null   object
 7   Duration         2671 non-null   object
 8   Total_Stops      2671 non-null   object
 9   Additional_Info  2671 non-null   object
dtypes: object(10)
memory usage: 208.8+ KB


In [55]:
test_data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
dtype: int64

In [56]:
test_data["Airline"] = test_data["Airline"].astype("category")
test_data["Source"] = test_data["Source"].astype("category")
test_data["Destination"] = test_data["Destination"].astype("category")
test_data["Route"] = test_data["Route"].astype("category")
test_data["Total_Stops"] = test_data["Total_Stops"].astype("category")
test_data["Additional_Info"] = test_data["Additional_Info"].astype("category")

In [57]:
test_data["Airline"] = test_data["Airline"].cat.codes
test_data["Source"] = test_data["Source"].cat.codes
test_data["Destination"] = test_data["Destination"].cat.codes
test_data["Route"] = test_data["Route"].cat.codes
test_data["Total_Stops"] = test_data["Total_Stops"].cat.codes
test_data["Additional_Info"] = test_data["Additional_Info"].cat.codes

In [58]:
test_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,4,6/06/2019,2,1,76,17:30,04:25 07 Jun,10h 55m,0,5
1,3,12/05/2019,3,0,65,06:20,10:20,4h,0,5
2,4,21/05/2019,2,1,76,19:15,19:00 22 May,23h 45m,0,3
3,6,21/05/2019,2,1,76,08:00,21:00,13h,0,5
4,0,24/06/2019,0,2,16,23:55,02:45 25 Jun,2h 50m,4,5


In [59]:
test_data[["Day", "Month", "Year"]] = test_data["Date_of_Journey"].str.split("/", expand=True)

In [60]:
test_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Day,Month,Year
0,4,6/06/2019,2,1,76,17:30,04:25 07 Jun,10h 55m,0,5,6,6,2019
1,3,12/05/2019,3,0,65,06:20,10:20,4h,0,5,12,5,2019
2,4,21/05/2019,2,1,76,19:15,19:00 22 May,23h 45m,0,3,21,5,2019
3,6,21/05/2019,2,1,76,08:00,21:00,13h,0,5,21,5,2019
4,0,24/06/2019,0,2,16,23:55,02:45 25 Jun,2h 50m,4,5,24,6,2019


In [61]:
test_data.drop(["Date_of_Journey"], axis=1, inplace=True)

In [62]:
test_data["Hour"] = pd.to_datetime(test_data.Dep_Time).dt.hour
test_data["Minute"] = pd.to_datetime(test_data.Dep_Time).dt.minute
test_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Day,Month,Year,Hour,Minute
0,4,2,1,76,17:30,04:25 07 Jun,10h 55m,0,5,6,6,2019,17,30
1,3,3,0,65,06:20,10:20,4h,0,5,12,5,2019,6,20
2,4,2,1,76,19:15,19:00 22 May,23h 45m,0,3,21,5,2019,19,15
3,6,2,1,76,08:00,21:00,13h,0,5,21,5,2019,8,0
4,0,0,2,16,23:55,02:45 25 Jun,2h 50m,4,5,24,6,2019,23,55


In [63]:
test_data['DURATION']=  test_data['Duration'].str.replace("h", '*60').str.replace(' ','+').str.replace('m','*1').apply(eval)

In [64]:
test_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Day,Month,Year,Hour,Minute,DURATION
0,4,2,1,76,17:30,04:25 07 Jun,10h 55m,0,5,6,6,2019,17,30,655
1,3,3,0,65,06:20,10:20,4h,0,5,12,5,2019,6,20,240
2,4,2,1,76,19:15,19:00 22 May,23h 45m,0,3,21,5,2019,19,15,1425
3,6,2,1,76,08:00,21:00,13h,0,5,21,5,2019,8,0,780
4,0,0,2,16,23:55,02:45 25 Jun,2h 50m,4,5,24,6,2019,23,55,170


In [65]:
test_data["Arrival_Hour"] = pd.to_datetime(test_data.Arrival_Time).dt.hour
test_data["Arrival_Minute"] = pd.to_datetime(test_data.Arrival_Time).dt.minute
test_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Day,Month,Year,Hour,Minute,DURATION,Arrival_Hour,Arrival_Minute
0,4,2,1,76,17:30,04:25 07 Jun,10h 55m,0,5,6,6,2019,17,30,655,4,25
1,3,3,0,65,06:20,10:20,4h,0,5,12,5,2019,6,20,240,10,20
2,4,2,1,76,19:15,19:00 22 May,23h 45m,0,3,21,5,2019,19,15,1425,19,0
3,6,2,1,76,08:00,21:00,13h,0,5,21,5,2019,8,0,780,21,0
4,0,0,2,16,23:55,02:45 25 Jun,2h 50m,4,5,24,6,2019,23,55,170,2,45


In [66]:
test_data.drop("Arrival_Time", axis=1, inplace=True)

In [67]:
test_data.drop("Dep_Time", axis=1, inplace=True)
test_data.drop("Duration", axis=1, inplace=True)

In [68]:
test_data.drop_duplicates(inplace=True)

In [69]:
test_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Day,Month,Year,Hour,Minute,DURATION,Arrival_Hour,Arrival_Minute
0,4,2,1,76,0,5,6,6,2019,17,30,655,4,25
1,3,3,0,65,0,5,12,5,2019,6,20,240,10,20
2,4,2,1,76,0,3,21,5,2019,19,15,1425,19,0
3,6,2,1,76,0,5,21,5,2019,8,0,780,21,0
4,0,0,2,16,4,5,24,6,2019,23,55,170,2,45


In [70]:
X = data.loc[:, ["Airline","Source", "Destination", "Route", "Total_Stops", "Additional_Info","Day", "Month", "Year","Hour","Minute","DURATION","Arrival_Hour","Arrival_Minute"]]
X = sc.fit_transform(X)

In [71]:
Y_rfr = rfr.predict(X)
Y_lr = lr.predict(X)

In [72]:
Y_lr, Y_rfr

(array([ 6764.10209477,  8992.45775316, 10540.11836301, ...,
         4987.87129621,  8257.03362269,  8934.18514844]),
 array([ 5242.99467063,  7459.748     , 13880.264     , ...,
         7226.5       , 14595.574     , 12852.29533333]))

# FINAL RESULTS

In [73]:
# Lets put all the price predictions into a seperate dataframe 

In [74]:
predictions = pd.DataFrame(zip(Y_lr,Y_rfr), columns=["LinearRegression","RandomForestRegression"])

In [75]:
predictions.head()

Unnamed: 0,LinearRegression,RandomForestRegression
0,6764.102095,5242.994671
1,8992.457753,7459.748
2,10540.118363,13880.264
3,9549.81391,6232.333
4,12997.512788,12334.463


## INFERENCE:
### We can see that the model having higher accuracy is of RandomForestRegressor.