In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import re

In [2]:
train_data = pd.read_excel("Flt_Data_Train.xlsx")
train_data.shape

(10683, 11)

In [3]:
train_data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [4]:
test_data = pd.read_excel("Flt_Test_set.xlsx")
test_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [5]:
train_data.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [6]:
data = pd.concat([train_data,test_data])
print(data.shape)

(13354, 11)


In [7]:
data['date'] = data['Date_of_Journey'].str.split('/').str[0]
data['month'] = data['Date_of_Journey'].str.split('/').str[1]
data['year'] = data['Date_of_Journey'].str.split('/').str[2]

In [8]:
data['Arrival_Time'] = data['Arrival_Time'].str.split(' ').str[0]
data['Arrival_Hour'] = data['Arrival_Time'].str.split(':').str[0].astype(int)
data['Arrival_Min'] = data['Arrival_Time'].str.split(':').str[1].astype(int)

In [9]:
data['Total_Stops'].fillna('0 stop',inplace=True)
data['Total_Stops'].isnull().sum()

0

In [10]:
data['Total_Stops'].replace('non-stop','0 stop',inplace=True)
data['Stop'] = data['Total_Stops'].str.split(' ').str[0]
data['Stop']= data['Stop'].astype(int)

In [11]:
data['Dep_Hour'] = data['Dep_Time'].str.split(':').str[0].astype(int)
data['Dep_Min']= data['Dep_Time'].str.split(':').str[1].astype(int)

In [12]:
data['Route'] = data['Route'].fillna("None")
data['Route_1'] = data['Route'].str.split('→ ').str[0]
data['Route_2'] = data['Route'].str.split('→ ').str[1]
data['Route_3'] = data['Route'].str.split('→ ').str[2]
data['Route_4'] = data['Route'].str.split('→ ').str[3]
data['Route_5'] = data['Route'].str.split('→ ').str[4]

In [13]:
data['Route_1'].fillna("None",inplace=True)
data['Route_2'].fillna("None",inplace=True)
data['Route_3'].fillna("None",inplace=True)
data['Route_4'].fillna("None",inplace=True)
data['Route_5'].fillna("None",inplace=True)

In [14]:
data['Duration_Hr']= data['Duration'].str.split(' ').str[0]
data['Duration_Hr']= data['Duration_Hr'].str.strip('h')
data['Duration_Hr']= data['Duration_Hr'].apply(lambda x: '0' if 'm' in x else x)
data[data['Duration_Hr']=='*m']

Unnamed: 0,Additional_Info,Airline,Arrival_Time,Date_of_Journey,Dep_Time,Destination,Duration,Price,Route,Source,...,Arrival_Min,Stop,Dep_Hour,Dep_Min,Route_1,Route_2,Route_3,Route_4,Route_5,Duration_Hr


In [15]:
data['Duration_Min']= data['Duration'].str.split(' ').str[1]
data['Duration_Min']= data['Duration_Min'].str.strip('m')

In [16]:
data['Duration_Min'].isnull().sum()

1286

In [17]:
data['Duration_Min'].fillna(0,inplace=True)
data['Duration_Min'].isnull().sum()

0

In [18]:
data['Duration_Hr']= data['Duration_Hr'].astype(int)
data['Duration_Min']= data['Duration_Min'].astype(int)

In [19]:
data.head()

Unnamed: 0,Additional_Info,Airline,Arrival_Time,Date_of_Journey,Dep_Time,Destination,Duration,Price,Route,Source,...,Stop,Dep_Hour,Dep_Min,Route_1,Route_2,Route_3,Route_4,Route_5,Duration_Hr,Duration_Min
0,No info,IndiGo,01:10,24/03/2019,22:20,New Delhi,2h 50m,3897.0,BLR → DEL,Banglore,...,0,22,20,BLR,DEL,,,,2,50
1,No info,Air India,13:15,1/05/2019,05:50,Banglore,7h 25m,7662.0,CCU → IXR → BBI → BLR,Kolkata,...,2,5,50,CCU,IXR,BBI,BLR,,7,25
2,No info,Jet Airways,04:25,9/06/2019,09:25,Cochin,19h,13882.0,DEL → LKO → BOM → COK,Delhi,...,2,9,25,DEL,LKO,BOM,COK,,19,0
3,No info,IndiGo,23:30,12/05/2019,18:05,Banglore,5h 25m,6218.0,CCU → NAG → BLR,Kolkata,...,1,18,5,CCU,NAG,BLR,,,5,25
4,No info,IndiGo,21:35,01/03/2019,16:50,New Delhi,4h 45m,13302.0,BLR → NAG → DEL,Banglore,...,1,16,50,BLR,NAG,DEL,,,4,45


In [20]:
data['date']=data['date'].astype(int)
data['month']=data['month'].astype(int)
data['year']= data['year'].astype(int)

In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['Additional_Info']= le.fit_transform(data['Additional_Info'])
data['Airline']= le.fit_transform(data['Airline'])
data['Destination']=le.fit_transform(data['Destination'])
data['Source']= le.fit_transform(data['Source'])
data['Route_1']=le.fit_transform(data['Route_1'])
data['Route_2']=le.fit_transform(data['Route_2'])
data['Route_3']=le.fit_transform(data['Route_3'])
data['Route_4']=le.fit_transform(data['Route_4'])
data['Route_5']=le.fit_transform(data['Route_5'])

In [22]:
train= data[0:10683]
test= data[10683:]
test = test.drop('Price',axis=1)

In [23]:
X = train.drop(['Arrival_Time','Total_Stops','Date_of_Journey','Duration','Dep_Time','Price','Route'],axis=1)
y = train['Price']

In [24]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10683 entries, 0 to 10682
Data columns (total 19 columns):
Additional_Info    10683 non-null int32
Airline            10683 non-null int32
Destination        10683 non-null int32
Source             10683 non-null int32
date               10683 non-null int32
month              10683 non-null int32
year               10683 non-null int32
Arrival_Hour       10683 non-null int32
Arrival_Min        10683 non-null int32
Stop               10683 non-null int32
Dep_Hour           10683 non-null int32
Dep_Min            10683 non-null int32
Route_1            10683 non-null int32
Route_2            10683 non-null int32
Route_3            10683 non-null int32
Route_4            10683 non-null int32
Route_5            10683 non-null int32
Duration_Hr        10683 non-null int32
Duration_Min       10683 non-null int32
dtypes: int32(19)
memory usage: 876.3 KB


In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [26]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [27]:
from sklearn.model_selection import KFold, cross_val_score
kfolds = KFold(n_splits=10,random_state=42,shuffle=True)
cv_results = cross_val_score(lm,X,y,cv=kfolds,scoring='r2')
cv_results.mean()

0.4994881978034377

In [28]:
cv_results1= cross_val_score(lm,X,y,scoring='neg_mean_squared_error')
cv_results1.mean()

-10724297.398189923

In [29]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=4)
X_poly=poly.fit_transform(X)

In [30]:
cv_results = cross_val_score(lm,X_poly,y,cv=kfolds,scoring='r2')
cv_results.mean()

-2.041032580815341e+19