In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [3]:
model = joblib.load('gbht.sav')
model

GradientBoostingRegressor(max_depth=6, n_estimators=300)

In [6]:
model.n_features_in_

20

In [None]:
Airline                Jet Airways
Date_of_Journey         01/03/2019
Source                    Banglore
Destination              New Delhi
Route              BLR → BOM → DEL
Dep_Time                     08:00
Arrival_Time          05:05 02 Mar
Duration                    21h 5m
Total_Stops                 1 stop
Additional Info            No info
Price                        22271

In [21]:
df = {}
# Route not considered.
df['Airline'] =                        ['IndiGo']
df['Date_of_Journey']=                ['01/03/2019']
df['Source'] =                         ['Banglore']
df['Destination'] =                    ['New Delhi']
#Route              CCU → IXR → BBI → BLR
df['Dep_Time']  =                         ['16:50']
df['Arrival_Time'] =                       [ '21:35']
df['Duration'] =                         ['4h 45m']
df['Total_Stops'] =                     ['1 stop']
df['Airline range'] = 'init'
#df['Additional Info'] =                 ['No info'] Additional info selected features as drop down and if clicked 1 else 0.
#Price                                13303
df = pd.DataFrame(df)
df

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Airline range
0,IndiGo,01/03/2019,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,init


In [22]:
AdditionalInfo = ['Additional Info_1 Long layover', 'Additional Info_1 Short layover',
       'Additional Info_2 Long layover', 'Additional Info_Business class',
       'Additional Info_Change airports',
       'Additional Info_In-flight meal not included',
       'Additional Info_No check-in baggage included']
for i in AdditionalInfo:
    df[i] = 0
   

In [23]:
#Range
low = ['IndiGo','SpiceJet','GoAir','Air Asia','Trujet']
medium = ['Air India','Jet Airways','Multiple carriers', 'Vistara','Vistara Premium economy',
          'Multiple carriers Premium economy']
high = ['Jet Airways Business']



df.loc[df['Airline'].isin(low),'Airline range'] = 'low'
df.loc[df['Airline'].isin(medium),'Airline range'] = 'medium'
df.loc[df['Airline'].isin(high),'Airline range'] = 'high'

In [24]:
Airlines = joblib.load('dictionary_airlines.sav')
range = joblib.load('dictionary_AirlineRange.sav')
range

{'high': 0, 'low': 1, 'medium': 2}

In [25]:
df['Airline'] = df['Airline'].map(Airlines)
df['Airline range'] = df['Airline range'].map(range)
df

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Airline range,Additional Info_1 Long layover,Additional Info_1 Short layover,Additional Info_2 Long layover,Additional Info_Business class,Additional Info_Change airports,Additional Info_In-flight meal not included,Additional Info_No check-in baggage included
0,3,01/03/2019,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,1,0,0,0,0,0,0,0


In [26]:
# function for month extraction 
import re # regular expression for string manipulation
def getSeason(month):
    if (month in [12, 1, 2]):
        return "winter"
    elif (month in [3, 4, 5]):
        return "summer"
    elif (month in [6, 7, 8]):
        return "monsoon"
    else:
        return "spring"

# function for period extraction 
def getPeriodOfDay(x):
    x = int(x[:2])
    if (x > 4) and (x <= 8):
        return 'Early Morning'
    elif (x > 8) and (x <= 12 ):
        return 'Morning'
    elif (x > 12) and (x <= 16):
        return'Noon'
    elif (x > 16) and (x <= 20) :
        return 'Eve'
    elif (x > 20) and (x <= 24):
        return'Night'
    elif (x <= 4):
        return'Late Night'
# function for duration counting     
def getDuration(x):
    replacements = [
    ('h', ':'),
    ('m', ''),
    (' ', '')]
    for old, new in replacements:
        x = re.sub(old, new, x) # regular expression 
    splt = x.split(':')
    hours_to_min = int(splt[0])*60
    if len(splt) == 2 and splt[1].isdigit(): # to add the remaining minutes
        fin = hours_to_min + int(splt[1])
    else:
        fin = hours_to_min
    return fin
# function for duration counting     
def getDurationHours(x):
    replacements = [
    ('h', ':'),
    ('m', ''),
    (' ', '')]
    for old, new in replacements:
        x = re.sub(old, new, x)
    splt = x.split(':')
    hours_to_min = int(splt[0])*60
    if len(splt) == 2 and splt[1].isdigit():
        min = int(splt[1])
    else:
        min =0
    return float(splt[0])+round(min/60,2)

df['month'] = pd.DatetimeIndex(df['Date_of_Journey']).month # month
df['day'] = pd.DatetimeIndex(df['Date_of_Journey']).day # day
df['season'] = df['month'].apply(getSeason) # season
df['Dep_Time_Period'] = df['Dep_Time'].apply(getPeriodOfDay) # period of day (departure time)
df['Arrival_Time_Period'] =df['Arrival_Time'].apply(getPeriodOfDay) # period of day (arrival time)
df['Duration_Minutes'] = df['Duration'].apply(getDuration).astype(int) # duration of a flight
df['Duration_Hours'] =df['Duration'].apply(getDurationHours) # duration of a flight in hours
df['weekday'] = pd.DatetimeIndex(df['Date_of_Journey']).weekday #The day of the week with Monday=0, Sunday=6.


In [27]:
#Encoding
df["Total_Stops"] = df["Total_Stops"].map({'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4})  # total stops in a period of flight 
df["season_enc"] = df["season"].map({'spring': 0, 'summer': 1, 'monsoon': 2, 'winter': 3}) # season of flight 
df["Arrival_Time_Period_enc"] = df["Arrival_Time_Period"].map({'Early Morning': 0, 'Morning': 1,
                                                                           'Noon': 2, 'Eve': 3, 'Night': 4, 'Late Night': 5}) # name of day periods 
df["Dep_Time_Period_enc"] = df["Dep_Time_Period"].map({'Early Morning': 0, 'Morning': 1,
                                                                           'Noon': 2, 'Eve': 3, 'Night': 4, 'Late Night': 5}) # name of day periods 
df['Source'] = df['Source'].map({'Delhi':1, 'Kolkata':2, 'Banglore':3, 'Mumbai':4, 'Chennai':5})
df['Destination'] = df['Destination'].map({'Cochin':1, 'Banglore':2, 'Delhi':3, 'New Delhi':4, 'Hyderabad':5, 'Kolkata':6})

In [28]:
drop = ['Date_of_Journey','Dep_Time','Arrival_Time','Duration','Dep_Time_Period', 'Arrival_Time_Period','season']
df =df.drop(drop, axis =1)


In [29]:
df

Unnamed: 0,Airline,Source,Destination,Total_Stops,Airline range,Additional Info_1 Long layover,Additional Info_1 Short layover,Additional Info_2 Long layover,Additional Info_Business class,Additional Info_Change airports,Additional Info_In-flight meal not included,Additional Info_No check-in baggage included,month,day,Duration_Minutes,Duration_Hours,weekday,season_enc,Arrival_Time_Period_enc,Dep_Time_Period_enc
0,3,3,4,1,1,0,0,0,0,0,0,0,1,3,285,4.75,3,3,4,2


In [30]:
# scm = joblib.load('Minutescale')
# sch =joblib.load('Hourscale') 
# df['Duration_Minutes'] =sch.fit_transform(df[['Duration_Minutes']])
# df['Duration_Hours'] = sch.fit_transform(df[['Duration_Hours']])

In [31]:
scm.var_

NameError: name 'scm' is not defined

In [32]:
sch.var_

NameError: name 'sch' is not defined

In [33]:
# minutes sc1.mean_,sc1.var_
#(array([640.23403857]), 499.44841461740924)

df['Duration_Minutes'] = (df['Duration_Minutes'] - 640.23403857)/499.44841461740924

#(array([10.67044561]), 8.324281856204824)
df['Duration_Hours'] = (df['Duration_Hours'] -10.67044561)/ 8.324281856204824
#can be considered dropping as 2 are giving same info

In [None]:
df

In [34]:
ypred = model.predict(df)

In [35]:
param_1 = joblib.load('invBoxcox.sav')

In [36]:
ypred

array([53.88372831])

In [37]:
from scipy.special import inv_boxcox
pred_inv = inv_boxcox(ypred, param_1)


In [38]:
pred_inv

array([11633.81980074])

In [39]:
columns = ['Airline', 'Source', 'Destination', 'Total_Stops', 'Airline range',
       'Additional Info_1 Long layover', 'Additional Info_1 Short layover',
       'Additional Info_2 Long layover', 'Additional Info_Business class',
       'Additional Info_Change airports',
       'Additional Info_In-flight meal not included',
       'Additional Info_No check-in baggage included', 'month', 'day',
       'Duration_Minutes', 'Duration_Hours', 'weekday', 'season_enc',
       'Arrival_Time_Period_enc', 'Dep_Time_Period_enc']
df.columns

Index(['Airline', 'Source', 'Destination', 'Total_Stops', 'Airline range',
       'Additional Info_1 Long layover', 'Additional Info_1 Short layover',
       'Additional Info_2 Long layover', 'Additional Info_Business class',
       'Additional Info_Change airports',
       'Additional Info_In-flight meal not included',
       'Additional Info_No check-in baggage included', 'month', 'day',
       'Duration_Minutes', 'Duration_Hours', 'weekday', 'season_enc',
       'Arrival_Time_Period_enc', 'Dep_Time_Period_enc'],
      dtype='object')