In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency 
from sklearn import preprocessing 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [2]:
# Read the training workbook into a DataFrame.
train = pd.read_excel("Data_Train.xlsx")

In [3]:
# Preview the first two raw rows.
train.head(n=2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662


In [4]:
# Number of missing values in each column.
missing_per_column = train.isnull().sum()
print(missing_per_column)

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64


In [5]:
# Discard every row that has at least one missing value.
train = train.dropna(axis=0)

In [6]:
# Remove exact duplicate rows, keeping the first occurrence of each.
train = train.drop_duplicates(keep="first")

In [7]:
# Numeric columns to screen for outliers.
cnames = ["Price"]
# Drop rows that fall outside the 1.5 * IQR fences of each listed column.
for i in cnames:
    q75, q25 = np.percentile(train.loc[:, i], [75, 25])
    iqr = q75 - q25
    # Deliberately not named `min` / `max`: assigning to those names at cell
    # level would shadow the builtins for every later cell in the kernel.
    lower_bound = q25 - (iqr * 1.5)
    upper_bound = q75 + (iqr * 1.5)
    # Strict comparisons, so values exactly on a fence are kept.
    outlier_mask = (train[i] < lower_bound) | (train[i] > upper_bound)
    train = train.drop(train.index[outlier_mask])

In [8]:
# Parse the journey date once (day/month/year) and derive the calendar features.
journey_dates = pd.to_datetime(train.Date_of_Journey, format='%d/%m/%Y')
train['Journey_Day'] = journey_dates.dt.day
train['Journey_Month'] = journey_dates.dt.month
train['weekday'] = journey_dates.dt.weekday

In [9]:
#Transforming duration to minutes
def duration(test):
    """Convert a duration such as ``"2h 50m"`` into total minutes.

    Each whitespace-separated part is read by its unit suffix, so an
    hours-only value (``"19h"``) and a minutes-only value (``"5m"``) are both
    handled. Previously the first part was always treated as hours, which
    turned ``"5m"`` into 300.

    Parameters
    ----------
    test : str
        Duration text made of ``<int>h`` and/or ``<int>m`` parts.

    Returns
    -------
    str
        Total minutes as a decimal string. It is kept as a string because the
        notebook casts the column to int in a later cell.

    Raises
    ------
    ValueError
        If the text is empty, a part has an unknown unit, or a number cannot
        be parsed.
    """
    parts = test.strip().split()
    if not parts:
        raise ValueError("empty duration string")
    total_minutes = 0
    for part in parts:
        amount, unit = part[:-1], part[-1].lower()
        if unit == 'h':
            total_minutes += int(amount) * 60
        elif unit == 'm':
            total_minutes += int(amount)
        else:
            raise ValueError("unrecognised duration component: %r" % part)
    return str(total_minutes)
# Replace each duration string with its minute count.
train['Duration'] = train['Duration'].map(duration)

In [10]:
#Categorising departure and arrival time
def deparrtime(x):
    """Bucket an ``HH:MM...`` time string into a part-of-day label.

    Only the text before the first ``:`` is used, so trailing content after
    the minutes (e.g. ``"01:10 22 Mar"``) is ignored.

    Returns ``'Morning'`` for hours 5-10, ``'Afternoon'`` for 11-15,
    ``'Evening'`` for 16-20 and ``'Night'`` otherwise.
    """
    hour = int(x.strip().split(':')[0])
    if 5 <= hour < 11:
        return 'Morning'
    if 11 <= hour < 16:
        return 'Afternoon'
    if 16 <= hour < 21:
        return 'Evening'
    # Everything else (hour >= 21 or hour < 5) counts as night.
    return 'Night'
# Bucket both clock-time columns into part-of-day labels.
for time_col in ('Dep_Time', 'Arrival_Time'):
    train[time_col] = train[time_col].apply(deparrtime)

In [11]:
#Refining total stops column
def stops(x):
    """Reduce a stops description to its count, as a string.

    ``'non-stop'`` becomes ``'0'``. Any other value (e.g. ``'2 stops'``)
    yields the text before its first space. Surrounding whitespace is removed
    first. Previously the result of ``strip()`` was discarded, so a padded
    value such as ``' 2 stops'`` produced an empty string.

    Parameters
    ----------
    x : str
        Raw ``Total_Stops`` value.

    Returns
    -------
    str
        The number of stops as text.
    """
    x = x.strip()
    if x == 'non-stop':
        return str(0)
    return x.split(' ')[0]
# Reduce each stops description to its count.
train['Total_Stops'] = train['Total_Stops'].map(stops)

In [12]:
# Check the transformed columns on the first two rows.
train.head(n=2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,weekday
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,Night,Night,170,0,No info,3897,24,3,6
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,Morning,Afternoon,445,2,No info,7662,1,5,2


In [13]:
# Date of journey and Route are dropped because they don't add much value to the prediction.
train = train.drop(columns=["Date_of_Journey", "Route"])

In [14]:
# Confirm the two columns are gone.
train.head(n=2)

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,weekday
0,IndiGo,Banglore,New Delhi,Night,Night,170,0,No info,3897,24,3,6
1,Air India,Kolkata,Banglore,Morning,Afternoon,445,2,No info,7662,1,5,2


In [17]:
# Read the test workbook into a DataFrame.
test = pd.read_excel("Test_set.xlsx")

In [18]:
# Apply the same feature steps to the test frame as were applied to train.
test_journey_dates = pd.to_datetime(test.Date_of_Journey, format='%d/%m/%Y')
test['Journey_Day'] = test_journey_dates.dt.day
test['Journey_Month'] = test_journey_dates.dt.month
test['weekday'] = test_journey_dates.dt.weekday
test['Duration'] = test['Duration'].map(duration)
for time_col in ('Dep_Time', 'Arrival_Time'):
    test[time_col] = test[time_col].apply(deparrtime)
test['Total_Stops'] = test['Total_Stops'].map(stops)
test = test.drop(columns=["Date_of_Journey", "Route"])

In [24]:
# Show the stop-count frequencies of train and test, separated by a blank line.
train_stop_counts = train["Total_Stops"].value_counts()
test_stop_counts = test["Total_Stops"].value_counts()
print(train_stop_counts, test_stop_counts, sep="\n\n")

1    5550
0    3472
2    1302
3      43
4       1
Name: Total_Stops, dtype: int64

1    1431
0     849
2     379
3      11
4       1
Name: Total_Stops, dtype: int64


In [37]:
#Grouping classes with very low frequency with Airline
# One vectorised replace on the column instead of a per-row loop that wrote
# through chained indexing (train["Airline"].iloc[i] = ...). Chained
# assignment is not guaranteed to reach the frame and raises
# SettingWithCopyWarning.
# "Jet Airways Business" is included so this list matches the one applied to
# the test frame later in the notebook.
rare_airlines = [
    "Multiple carriers Premium economy",
    "Vistara Premium economy",
    "Trujet",
    "Jet Airways Business",
]
train["Airline"] = train["Airline"].replace(rare_airlines, "Others")

In [39]:
#Refining No info class in Additional info
# Merge the two spellings of the same label with one column-level replace.
# The per-row loop it replaces wrote through chained indexing, which is not
# guaranteed to modify the frame.
train['Additional_Info'] = train['Additional_Info'].replace('No info', 'No Info')

In [42]:
#Grouping classes with very low frequency with Additional_Info
# Vectorised replace instead of a per-row chained-assignment loop.
# 'Business class' is included so this list covers everything the test-frame
# grouping later in the notebook folds into "Others".
rare_additional_info = [
    'No check-in baggage included',
    '1 Long layover',
    'Change airports',
    'Red-eye flight',
    'Business class',
]
train["Additional_Info"] = train["Additional_Info"].replace(rare_additional_info, "Others")

In [44]:
# Print column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10368 entries, 0 to 10682
Data columns (total 12 columns):
Airline            10368 non-null object
Source             10368 non-null object
Destination        10368 non-null object
Dep_Time           10368 non-null object
Arrival_Time       10368 non-null object
Duration           10368 non-null object
Total_Stops        10368 non-null object
Additional_Info    10368 non-null object
Price              10368 non-null int64
Journey_Day        10368 non-null int64
Journey_Month      10368 non-null int64
weekday            10368 non-null int64
dtypes: int64(4), object(8)
memory usage: 1.0+ MB


In [45]:
# Duration becomes a true integer; the calendar fields become object dtype so
# the label-encoding step (which selects object columns) picks them up.
train["Duration"] = train["Duration"].astype(int)
for calendar_col in ("weekday", "Journey_Day", "Journey_Month"):
    train[calendar_col] = train[calendar_col].astype(object)

In [46]:
# Label-encode every object-dtype column of the training frame.
colnames = list(train.columns)
# Each column gets its own fitted encoder, stored under the column name.
# Reusing a single encoder overwrote its fitted classes on every column, so
# the category -> integer mapping used for training could not be recovered
# afterwards (e.g. through label_encoders[col].classes_ or .transform).
label_encoders = {}

for col in colnames:
    if train[col].dtype==object:
        label_encoder = preprocessing.LabelEncoder()
        train[col]= label_encoder.fit_transform(train[col])
        # Cast back to object, as before.
        train[col]=train[col].astype(object)
        label_encoders[col] = label_encoder

In [47]:
# Z-score the duration column using its own mean and standard deviation.
duration_mean = train["Duration"].mean()
duration_std = train["Duration"].std()
train["Duration"] = (train["Duration"] - duration_mean) / duration_std

In [48]:
# Separate the target from the features and convert both to NumPy arrays.
Y = train["Price"]
X = train.drop(columns=["Price"])
x, y = np.array(X), np.array(Y)

In [49]:
#Random Forest
# A fixed seed keeps the search result and the reported score repeatable.
estimator = RandomForestRegressor(random_state=42)
param_grid = {
            "n_estimators"      : [100,300,500],
            # 1.0 means "consider every feature", which is what "auto" meant
            # for forest regressors. The "auto" string is no longer accepted
            # by current scikit-learn releases, while a float fraction is
            # accepted by old and new versions alike.
            "max_features"      : [1.0, "sqrt", "log2"],
            "min_samples_split" : [2,4,8],
            "bootstrap": [True,False],
            }
rfmodel = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5)
rfmodel.fit(x, y)
# Passing the GridSearchCV object (not its best_estimator_) makes this a
# nested cross-validation: the whole grid search is refit inside each of the
# 5 outer folds, so this line is far more expensive than the fit above.
scores = cross_val_score(rfmodel, x, y, cv=5)
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))

Cross validation score: 89.99% (+/- 1.06%)


In [50]:
#Gradient Boosting and XGBoost
# Both searches explore the same grid.
boosting_grid = {'learning_rate':[0.01,0.05,0.1],'max_depth':[3,5,7],'n_estimators':[500]}
gbm = GradientBoostingRegressor()
xgb = XGBRegressor()
best_gbm = GridSearchCV(gbm, param_grid=boosting_grid, cv=5, n_jobs=-1)
best_xgb = GridSearchCV(xgb, param_grid=boosting_grid, cv=5, n_jobs=-1)
best_gbm.fit(x,y)
best_xgb.fit(x,y)
# Re-score each tuned estimator with 5-fold CV and report mean +/- 2 std.
score_reports = (
    ("GBM Cross validation score: {0:.2%} (+/- {1:.2%})", best_gbm),
    ("XGBoost Cross validation score: {0:.2%} (+/- {1:.2%})", best_xgb),
)
for report_template, fitted_search in score_reports:
    scores = cross_val_score(fitted_search.best_estimator_, x, y, cv=5)
    print(report_template.format(np.mean(scores), np.std(scores)*2))

GBM Cross validation score: 91.38% (+/- 1.02%)
XGBoost Cross validation score: 91.54% (+/- 0.99%)


In [97]:
#Doing similar pre-processing in test data

In [60]:
# Reload the raw test workbook; this replaces the earlier `test` frame.
test = pd.read_excel("Test_set.xlsx")

In [61]:
# Parse the journey date once (day/month/year) and derive the calendar features.
test_journey_dates = pd.to_datetime(test.Date_of_Journey, format='%d/%m/%Y')
test['Journey_Day'] = test_journey_dates.dt.day
test['Journey_Month'] = test_journey_dates.dt.month
test['weekday'] = test_journey_dates.dt.weekday

In [62]:
# Replace each duration string with its minute count.
test['Duration'] = test['Duration'].map(duration)

In [63]:
# Bucket both clock-time columns into part-of-day labels.
for time_col in ('Dep_Time', 'Arrival_Time'):
    test[time_col] = test[time_col].apply(deparrtime)

In [64]:
# Reduce each stops description to its count.
test['Total_Stops'] = test['Total_Stops'].map(stops)

In [65]:
# Drop the same two columns that were removed from the training frame.
test = test.drop(columns=["Date_of_Journey", "Route"])

In [67]:
# Fold the low-frequency airline labels into a single "Others" class.
# One vectorised replace on the column instead of a per-row loop that wrote
# through chained indexing (test["Airline"].iloc[i] = ...). Chained
# assignment is not guaranteed to reach the frame and raises
# SettingWithCopyWarning.
rare_airlines = [
    "Multiple carriers Premium economy",
    "Vistara Premium economy",
    "Trujet",
    "Jet Airways Business",
]
test["Airline"] = test["Airline"].replace(rare_airlines, "Others")

In [72]:
# Bring Additional_Info into the same shape as the training column.
# The training pipeline first merges the 'No info' spelling into 'No Info';
# that step was missing here, so it is applied before the grouping.
test["Additional_Info"] = test["Additional_Info"].replace('No info', 'No Info')
# Vectorised replace instead of a per-row chained-assignment loop.
# 'Red-eye flight' is included because the training pipeline groups it too.
rare_additional_info = [
    'No check-in baggage included',
    '1 Long layover',
    'Change airports',
    'Business class',
    'Red-eye flight',
]
test["Additional_Info"] = test["Additional_Info"].replace(rare_additional_info, "Others")

In [74]:
# Print column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2671 entries, 0 to 2670
Data columns (total 11 columns):
Airline            2671 non-null object
Source             2671 non-null object
Destination        2671 non-null object
Dep_Time           2671 non-null object
Arrival_Time       2671 non-null object
Duration           2671 non-null object
Total_Stops        2671 non-null object
Additional_Info    2671 non-null object
Journey_Day        2671 non-null int64
Journey_Month      2671 non-null int64
weekday            2671 non-null int64
dtypes: int64(3), object(8)
memory usage: 229.7+ KB


In [76]:
# Duration becomes a true integer; the calendar fields become object dtype so
# the label-encoding step (which selects object columns) picks them up.
test["Duration"] = test["Duration"].astype(int)
for calendar_col in ("weekday", "Journey_Day", "Journey_Month"):
    test[calendar_col] = test[calendar_col].astype(object)

In [77]:
# Label-encode every object-dtype column of the test frame.
# NOTE(review): the encoder is fitted on the test values themselves, so each
# integer code follows the test set's own sorted labels. The codes match the
# ones used for training only if both frames hold exactly the same categories
# in that column -- verify before trusting the predictions.
colnames = list(test.columns)
label_encoder = preprocessing.LabelEncoder()

for col in colnames:
    if test[col].dtype != object:
        continue
    encoded_values = label_encoder.fit_transform(test[col])
    test[col] = encoded_values
    # Cast back to object, as before.
    test[col] = test[col].astype(object)

In [78]:
# Z-score the duration column using the test frame's own mean and std.
# NOTE(review): these statistics come from the test data rather than from the
# training data the model was fitted on -- confirm this is intended.
test_duration = test["Duration"]
test["Duration"] = (test_duration - test_duration.mean()) / test_duration.std()

In [79]:
# Rebind `x` (the training feature matrix until now) to the test features.
x=np.array(test)

In [81]:
# Predict with the tuned XGBoost model; the fitted search object delegates
# predict() to its refit best estimator.
ypred = best_xgb.predict(x)

In [82]:
# Attach the predictions to the test frame as a new Price column.
test = test.assign(Price=ypred)

In [83]:
# Write the predicted prices to the submission CSV.
# NOTE(review): `header` is left at the pandas default; that default has not
# been the same across pandas versions for Series.to_csv -- consider passing
# header= explicitly so the file layout does not depend on the installed version.
test["Price"].to_csv("submissionflight.csv")

  """Entry point for launching an IPython kernel.
