In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge, HuberRegressor, LogisticRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so library notices (deprecations, solver convergence, pandas
# copy warnings) will not be shown by any later cell.
warnings.filterwarnings('ignore')

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Load the dataset from Final.csv. pandas is already imported as `pd` in the
# imports cell at the top of the notebook, so it is not re-imported here.
df = pd.read_csv('Final.csv')
df.shape

(9650, 21)

In [None]:
# Preview the first two rows to see the raw columns next to the derived ones.
df.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Dep_Hr,Dep_Min,Arr_Hr,Arr_Min,Duration_Hr,Duration_Min,Duration_bool
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No Info,3897,24,MAR,2019,22,20,1,10,2,50.0,170.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No Info,7662,1,MAY,2019,5,50,13,15,7,25.0,445.0


In [None]:
# List every column name available in the loaded frame.
df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price', 'Day', 'Month', 'Year', 'Dep_Hr', 'Dep_Min',
       'Arr_Hr', 'Arr_Min', 'Duration_Hr', 'Duration_Min', 'Duration_bool'],
      dtype='object')

In [None]:
# Keep only the target (`Price`) plus the columns that are used for modelling
# further down; everything else in `df` is left behind.
df1 = df.loc[:, [
    'Airline',
    'Source',
    'Destination',
    'Total_Stops',
    'Additional_Info',
    'Price',
    'Day',
    'Month',
    'Duration_bool',
]]
df1.shape

(9650, 9)

In [None]:
# Preview the reduced frame (first five rows).
df1.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day,Month,Duration_bool
0,IndiGo,Banglore,New Delhi,0,No Info,3897,24,MAR,170.0
1,Air India,Kolkata,Banglore,2,No Info,7662,1,MAY,445.0
2,IndiGo,Kolkata,Banglore,1,No Info,6218,12,MAY,325.0
3,IndiGo,Banglore,New Delhi,1,No Info,13302,1,MAR,285.0
4,SpiceJet,Kolkata,Banglore,0,No Info,3873,24,JUN,145.0


In [None]:
# `Duration_bool` is not a boolean: in the rows previewed above it holds the
# total trip length in minutes (2:50 -> 170.0, 7:25 -> 445.0), so give it a
# name that says what it is.
df1 = df1.rename(columns={'Duration_bool': 'Duration'})

In [None]:
# True if any cell in any column is missing; False means the frame is complete.
df1.isnull().any().any()

False

In [None]:
# Convert the three-letter upper-case month abbreviations to calendar month
# numbers (JAN -> 1 ... DEC -> 12). Any value that is not one of these
# abbreviations becomes NaN, as with any dict passed to `Series.map`.
df1['Month'] = df1['Month'].map({
    abbreviation: number
    for number, abbreviation in enumerate(
        ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
         'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'],
        start=1,
    )
})

In [None]:
# Encode the free-text `Additional_Info` values as integer codes, one distinct
# code per category.
# 'No check-in baggage included' gets its own code 2 so that it is not merged
# with 'In-flight meal not included' (code 1); sharing a code would make the
# two categories indistinguishable to the models and leave code 2 unused.
# Values that are missing from this mapping become NaN (`Series.map` with a dict).
df1['Additional_Info'] = df1['Additional_Info'].map({
    'No Info': 0,
    'In-flight meal not included': 1,
    'No check-in baggage included': 2,
    '1 Short layover': 3,
    '1 Long layover': 4,
    'Change airports': 5,
    'Business class': 6,
    'Red-eye flight': 7,
    '2 Long layover': 8,
})

In [None]:
# One-hot encode the three nominal columns; the resulting columns are named
# `<column>_<value>` (e.g. `Airline_IndiGo`).
dummies = pd.get_dummies(df1[['Airline', 'Source', 'Destination']])

In [None]:
# Place the indicator columns side by side with the existing features
# (column-wise concatenation, aligned on the row index).
df2 = pd.concat(objs=[df1, dummies], axis='columns')
df2.shape

(9650, 32)

In [None]:
# The original text columns are now represented by their indicator columns,
# so remove them from the modelling frame.
df2 = df2.drop(columns=['Airline', 'Source', 'Destination'])
df2.shape

(9650, 29)

In [None]:
# Preview the fully numeric modelling frame (first five rows).
df2.head()

Unnamed: 0,Total_Stops,Additional_Info,Price,Day,Month,Duration,Airline_Air Asia,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy,Source_Banglore,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,0,0,3897,24,3,170.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,2,0,7662,1,5,445.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,1,0,6218,12,5,325.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
3,1,0,13302,1,3,285.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
4,0,0,3873,24,6,145.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0


In [None]:
# List the final feature/target column names after encoding.
df2.columns

Index(['Total_Stops', 'Additional_Info', 'Price', 'Day', 'Month', 'Duration',
       'Airline_Air Asia', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Banglore', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
       'Source_Mumbai', 'Destination_Banglore', 'Destination_Cochin',
       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata',
       'Destination_New Delhi'],
      dtype='object')

In [None]:
# Show which integer codes are present after encoding; a NaN in this list
# would mean some `Additional_Info` value was missing from the mapping.
df2['Additional_Info'].unique()

array([0, 1, 3, 4, 5, 6, 7, 8])

In [None]:
# Separate the predictors (every column except `Price`) from the target.
X, y = df2.drop(columns='Price'), df2['Price']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((6755, 28), (2895, 28), (6755,), (2895,))

In [None]:
# Baseline regressors to compare, as [label, estimator] pairs. The labels
# already carry the ' : ' separator used when the scores are printed.
# ExtraTreeRegressor chooses its splits at random, so it is seeded with the
# same value as the train/test split to make its score repeatable.
models = [['Lasso : ', Lasso()],
          ['Ridge : ', Ridge()],
          ['ExtraTreeRegressor : ', ExtraTreeRegressor(random_state=42)],
          ['HuberRegressor : ', HuberRegressor()],
          ['XGBRegressor : ', XGBRegressor()]]

In [None]:
# Fit every baseline on the training split and print its RMSE on the held-out
# split (same units as `Price`; lower is better).
for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name, np.sqrt(mean_squared_error(y_test, predictions)))

Lasso :  2759.449381312224
Ridge :  2710.8476127741087
ExtraTreeRegressor :  2132.101732934933
HuberRegressor :  3143.2941295920737
XGBRegressor :  1932.6471139740543


In [None]:
# Estimators keyed by display name. Each entry stores the estimator under
# 'model'; only the XGBRegressor entry also carries a hyper-parameter grid
# under 'param'.
# NOTE(review): the Lasso and Ridge entries have no 'param' key — confirm that
# whatever consumes this dict tolerates that.
algorithms = {
    'XGBRegressor': {
        'model': XGBRegressor(),
        'param': {
            'learning_rate': [0.5, 0.8, 0.1, 0.20, 0.25, 0.30],
            'max_depth': [3, 5, 7, 9, 11, 13, 15],
            'gamma': [0.1, 0.2, 0.3, 0.4, 0.5],
            'min_child_weight': [1, 3, 5, 7, 9],
            'colsample_bytree': [0.5, 0.8, 0.1, 0.20, 0.25, 0.30],
        },
    },
    'Lasso': {'model': Lasso()},
    'Ridge': {'model': Ridge()},
}

In [None]:
from sklearn.metrics import r2_score

# Lasso with a light L1 penalty, fitted on the training split.
model_lasso = Lasso(alpha=0.01).fit(X_train, y_train)

# Predictions are kept for both splits; only the held-out R² is reported,
# expressed as a percentage.
pred_train_lasso = model_lasso.predict(X_train)
pred_test_lasso = model_lasso.predict(X_test)
print(r2_score(y_test, pred_test_lasso) * 100)

62.94320440091734


In [None]:
from sklearn.metrics import r2_score

# Ridge with a light L2 penalty, fitted on the training split.
model_ridge = Ridge(alpha=0.01)
model_ridge.fit(X_train, y_train)

# Report the training R² (as a percentage) from Ridge's own predictions.
# Scoring `pred_train_lasso` here would print the Lasso model's number under
# the Ridge heading.
pred_train_ridge = model_ridge.predict(X_train)
print(r2_score(y_train, pred_train_ridge)*100)

# Held-out predictions are computed and kept, but not scored in this cell.
pred_test_ridge = model_ridge.predict(X_test)

64.49027452997123


In [None]:
from sklearn.metrics import r2_score

# Gradient-boosted trees fitted on the training split.
# NOTE(review): `alpha` is presumably forwarded to XGBoost as its L1
# regularisation term — confirm this is the intended parameter.
model_XGBRegressor = XGBRegressor(alpha=0.01).fit(X_train, y_train)

# Predictions are kept for both splits; only the training R² is reported,
# expressed as a percentage.
pred_train_XGBRegressor = model_XGBRegressor.predict(X_train)
pred_test_XGBRegressor = model_XGBRegressor.predict(X_test)
print(r2_score(y_train, pred_train_XGBRegressor) * 100)

82.71250011767667


In [None]:
from sklearn.metrics import r2_score

# Huber regression with default settings, fitted on the training split.
# NOTE(review): the features are not scaled before fitting, and warnings are
# silenced at the top of the notebook, so a solver convergence warning would
# not be visible here — worth checking.
model_HuberRegressor = HuberRegressor().fit(X_train, y_train)

# Predictions are kept for both splits; only the training R² is reported,
# expressed as a percentage.
pred_train_HuberRegressor = model_HuberRegressor.predict(X_train)
pred_test_HuberRegressor = model_HuberRegressor.predict(X_test)
print(r2_score(y_train, pred_train_HuberRegressor) * 100)

48.71631836806618


In [None]:
from sklearn.metrics import r2_score

# A single extremely-randomised tree. Its splits are drawn at random, so the
# estimator is seeded (same value as the train/test split) to make the
# reported score repeatable between runs.
model_ExtraTreeRegressor = ExtraTreeRegressor(random_state=42)
model_ExtraTreeRegressor.fit(X_train, y_train)

# Predictions are kept for both splits; only the held-out R² is reported,
# expressed as a percentage.
pred_train_ExtraTreeRegressor = model_ExtraTreeRegressor.predict(X_train)
pred_test_ExtraTreeRegressor = model_ExtraTreeRegressor.predict(X_test)
print(r2_score(y_test, pred_test_ExtraTreeRegressor)*100)

78.29283651546713
