In [153]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge, HuberRegressor, LogisticRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [154]:
df = pd.read_csv('Final.csv')
df.shape

(9650, 21)

In [155]:
df.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Dep_Hr,Dep_Min,Arr_Hr,Arr_Min,Duration_Hr,Duration_Min,Duration_bool
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No Info,3897,24,MAR,2019,22,20,1,10,2,50.0,170.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No Info,7662,1,MAY,2019,5,50,13,15,7,25.0,445.0


In [156]:
df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price', 'Day', 'Month', 'Year', 'Dep_Hr', 'Dep_Min',
       'Arr_Hr', 'Arr_Min', 'Duration_Hr', 'Duration_Min', 'Duration_bool'],
      dtype='object')

In [157]:
df1 = df[['Airline', 'Source', 'Destination', 'Total_Stops',
       'Additional_Info', 'Price', 'Day', 'Month', 'Duration_bool']]

df1.shape

(9650, 9)

In [158]:
df1.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day,Month,Duration_bool
0,IndiGo,Banglore,New Delhi,0,No Info,3897,24,MAR,170.0
1,Air India,Kolkata,Banglore,2,No Info,7662,1,MAY,445.0
2,IndiGo,Kolkata,Banglore,1,No Info,6218,12,MAY,325.0
3,IndiGo,Banglore,New Delhi,1,No Info,13302,1,MAR,285.0
4,SpiceJet,Kolkata,Banglore,0,No Info,3873,24,JUN,145.0


In [159]:
df1 = df1.rename(columns={'Duration_bool': 'Duration'})

In [160]:
df1.isnull().any().any()

False

In [161]:
df1['Month'] = df1['Month'].map({
    'JAN':1,
    'FEB':2,
    'MAR':3,
    'APR':4,
    'MAY':5,
    'JUN':6,
    'JUL':7,
    'AUG':8,
    'SEP':9,
    'OCT':10,
    'NOV':11,
    'DEC':12
})

In [162]:
df1['Additional_Info'] = df1['Additional_Info'].map({
    'No Info':0,
    'In-flight meal not included':1,
    'No check-in baggage included':2,
    '1 Short layover':3,
    '1 Long layover':4,
    'Change airports':5,
    'Business class':6,
    'Red-eye flight':7,
    '2 Long layover':8
})

In [163]:
dummies = pd.get_dummies(df1[['Airline', 'Source', 'Destination']])

In [164]:
df2 = pd.concat([df1,dummies], axis=1)
df2.shape

(9650, 32)

In [165]:
df2 = df2.drop(['Airline', 'Source', 'Destination'], axis=1)
df2.shape

(9650, 29)

In [166]:
df2.head()

Unnamed: 0,Total_Stops,Additional_Info,Price,Day,Month,Duration,Airline_Air Asia,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy,Source_Banglore,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,0,0,3897,24,3,170.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,2,0,7662,1,5,445.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,1,0,6218,12,5,325.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
3,1,0,13302,1,3,285.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
4,0,0,3873,24,6,145.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0


In [167]:
df2.columns

Index(['Total_Stops', 'Additional_Info', 'Price', 'Day', 'Month', 'Duration',
       'Airline_Air Asia', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Banglore', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
       'Source_Mumbai', 'Destination_Banglore', 'Destination_Cochin',
       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata',
       'Destination_New Delhi'],
      dtype='object')

In [168]:
df2['Additional_Info'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int64)

In [169]:
X = df2.drop('Price', axis=1)
y = df2['Price']

In [170]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6755, 28), (2895, 28), (6755,), (2895,))

In [171]:
models = [['LinearRegression : ', LinearRegression()],
          ['ElasticNet :', ElasticNet()],
          ['Lasso : ', Lasso()],
          ['Ridge : ', Ridge()],
          ['KNeighborsRegressor : ', KNeighborsRegressor()],
          ['DecisionTreeRegressor : ', DecisionTreeRegressor()],
          ['RandomForestRegressor : ', RandomForestRegressor()],
          ['SVR : ', SVR()],
          ['AdaBoostRegressor : ', AdaBoostRegressor()],
          ['GradientBoostingRegressor : ', GradientBoostingRegressor()],
          ['ExtraTreeRegressor : ', ExtraTreeRegressor()],
          ['HuberRegressor : ', HuberRegressor()],
          ['XGBRegressor : ', XGBRegressor()],
          ['BayesianRidge : ', BayesianRidge()]]

In [172]:
for name, model in models:
    model=model
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name, (np.sqrt(mean_squared_error(y_test, predictions))))
    print(name,'Accuracy :', ((model.score(X_train, y_train))))
    acc=float(model.score(X_train, y_train))
    print(acc)
    import pickle

    if acc>=0.90:
        name2=str(name).replace(':', '').replace(' ', '')
        filename = 'models/{name2}model.sav'.format(name2=name2)
        print('filename=',filename)
        pickle.dump(model, open(filename, 'wb'))

LinearRegression :  2782.75023661655
LinearRegression :  Accuracy : 0.6409100673537047
0.6409100673537047
ElasticNet : 3374.6297373093057
ElasticNet : Accuracy : 0.4378437770130367
0.4378437770130367
Lasso :  2765.0023861873046
Lasso :  Accuracy : 0.64073492854743
0.64073492854743
Ridge :  2723.8222534885554
Ridge :  Accuracy : 0.63684613447874
0.63684613447874
KNeighborsRegressor :  3240.107892536082
KNeighborsRegressor :  Accuracy : 0.6801568395111086
0.6801568395111086
DecisionTreeRegressor :  2022.1972133536337
DecisionTreeRegressor :  Accuracy : 0.9763124003537005
0.9763124003537005
filename= models/DecisionTreeRegressormodel.sav
RandomForestRegressor :  1662.0960094996121
RandomForestRegressor :  Accuracy : 0.9620419604111312
0.9620419604111312
filename= models/RandomForestRegressormodel.sav
SVR :  4246.460018470376
SVR :  Accuracy : 0.13173788701003786
0.13173788701003786
AdaBoostRegressor :  3195.6454760408087
AdaBoostRegressor :  Accuracy : 0.6223237516221944
0.622323751622194

In [173]:
algorithms = {
    'XGBRegressor' : {
        'model' : XGBRegressor(),
        'param' : {
            'learning_rate' : [0.5, 0.8, 0.1, 0.20, 0.25, 0.30],
            'max_depth' : [3, 5, 7, 9, 11, 13, 15],
            'gamma' : [0.1,0.2, 0.3, 0.4, 0.5],
            'min_child_weight' : [1, 3, 5, 7, 9],
            'colsample_bytree' : [0.5, 0.8, 0.1, 0.20, 0.25, 0.30]
        }
    },
    'RandomForestRegressor' : {
        'model' : RandomForestRegressor(),
        'param' : {
            'n_estimators' : [300, 500, 700, 1000, 2100],
            'max_depth' : [3, 5, 7, 9, 11, 13, 15],
            'max_features' : ["auto", "sqrt", "log2"],
            'min_samples_split' : [2, 4, 6, 8]
        }
    },
    'GradientBoostingRegressor' : {
        'model' : GradientBoostingRegressor(),
        'param' : {
            'learning_rate' : [0.5, 0.8, 0.1, 0.20, 0.25, 0.30],
            'n_estimators' : [300, 500, 700, 1000, 2100],
            'criterion' : ['friedman_mse', 'mse']
        }
    }
}

In [174]:
score = []

for name, mp in algorithms.items() :
    rs = RandomizedSearchCV(estimator = mp['model'], param_distributions = mp['param'], cv = 10, n_jobs=-1, verbose=3)
    rs.fit(X_train, y_train)
    score.append({
        'model': name,
        'score' : rs.best_score_,
        'params' : rs.best_params_
    })

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [175]:
final = pd.DataFrame(score, columns=['model', 'score', 'params'])
final

Unnamed: 0,model,score,params
0,XGBRegressor,0.867237,"{'min_child_weight': 1, 'max_depth': 11, 'lear..."
1,RandomForestRegressor,0.856095,"{'n_estimators': 500, 'min_samples_split': 2, ..."
2,GradientBoostingRegressor,0.868472,"{'n_estimators': 500, 'learning_rate': 0.5, 'c..."


In [176]:
final['params'][2]

{'n_estimators': 500, 'learning_rate': 0.5, 'criterion': 'mse'}

In [177]:
regressor = GradientBoostingRegressor(n_estimators = 500, learning_rate = 0.3, criterion = 'friedman_mse')
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)
print('RMSE : {}'.format(np.sqrt(mean_squared_error(y_test, prediction))))

RMSE : 1626.1088262398712


In [178]:
regressor.score(X_train, y_train), regressor.score(X_test, y_test)

(0.9169577616559885, 0.8726821846613881)

In [179]:
prediction[0]

4409.649881214336

In [180]:
df2['Price'][0]

3897

In [181]:
print('MAE:', mean_absolute_error(y_test, prediction))
print('MSE:', mean_squared_error(y_test, prediction))
print('RMSE:', np.sqrt(mean_squared_error(y_test, prediction)))

MAE: 947.9364198055224
MSE: 2644229.914775212
RMSE: 1626.1088262398712


In [182]:
import pickle
file = open('GradientBoostingRegressor.sav', 'wb')
pickle.dump(regressor, file)

In [183]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))

In [184]:
df1.columns

Index(['Airline', 'Source', 'Destination', 'Total_Stops', 'Additional_Info',
       'Price', 'Day', 'Month', 'Duration'],
      dtype='object')

In [185]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9650 entries, 0 to 9649
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          9650 non-null   object 
 1   Source           9650 non-null   object 
 2   Destination      9650 non-null   object 
 3   Total_Stops      9650 non-null   int64  
 4   Additional_Info  9650 non-null   int64  
 5   Price            9650 non-null   int64  
 6   Day              9650 non-null   int64  
 7   Month            9650 non-null   int64  
 8   Duration         9650 non-null   float64
dtypes: float64(1), int64(5), object(3)
memory usage: 678.6+ KB


In [186]:
df1['Total_Stops'].unique()

array([0, 2, 1, 3, 4], dtype=int64)

In [187]:
X_test.columns

Index(['Total_Stops', 'Additional_Info', 'Day', 'Month', 'Duration',
       'Airline_Air Asia', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Banglore', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
       'Source_Mumbai', 'Destination_Banglore', 'Destination_Cochin',
       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata',
       'Destination_New Delhi'],
      dtype='object')

In [188]:
test123=X_test.head(1)

In [189]:
test123.shape

(1, 28)

In [190]:
regressor.predict(test123)

array([4409.64988121])

In [191]:
for feature in df.columns:
    print('{} has total {} categories \n'.format(feature,len(df[feature].value_counts())))

Airline has total 12 categories 

Date_of_Journey has total 44 categories 

Source has total 5 categories 

Destination has total 6 categories 

Route has total 125 categories 

Dep_Time has total 214 categories 

Arrival_Time has total 220 categories 

Duration has total 338 categories 

Total_Stops has total 5 categories 

Additional_Info has total 9 categories 

Price has total 1778 categories 

Day has total 10 categories 

Month has total 4 categories 

Year has total 1 categories 

Dep_Hr has total 24 categories 

Dep_Min has total 12 categories 

Arr_Hr has total 24 categories 

Arr_Min has total 12 categories 

Duration_Hr has total 43 categories 

Duration_Min has total 11 categories 

Duration_bool has total 338 categories 



In [192]:
df['Airline'].unique()

array(['IndiGo', 'Air India', 'SpiceJet', 'Jet Airways',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Multiple carriers Premium economy',
       'Trujet', 'Jet Airways Business'], dtype=object)

In [193]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2895 entries, 3773 to 6027
Data columns (total 28 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Total_Stops                                2895 non-null   int64  
 1   Additional_Info                            2895 non-null   int64  
 2   Day                                        2895 non-null   int64  
 3   Month                                      2895 non-null   int64  
 4   Duration                                   2895 non-null   float64
 5   Airline_Air Asia                           2895 non-null   uint8  
 6   Airline_Air India                          2895 non-null   uint8  
 7   Airline_GoAir                              2895 non-null   uint8  
 8   Airline_IndiGo                             2895 non-null   uint8  
 9   Airline_Jet Airways                        2895 non-null   uint8  
 10  Airline_Jet Airways B

In [194]:
X_test.head(5)

Unnamed: 0,Total_Stops,Additional_Info,Day,Month,Duration,Airline_Air Asia,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy,Source_Banglore,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
3773,0,0,9,3,160.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
4053,0,0,21,5,140.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
4165,1,0,6,5,1005.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
3424,1,0,6,5,1655.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
5391,1,0,6,6,560.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0


In [195]:
for feature in X_train.columns:
    try:
        print('{} has total {} categories \n'.format(feature,len(df[feature].value_counts())))
    except:
        pass

Total_Stops has total 5 categories 

Additional_Info has total 9 categories 

Day has total 10 categories 

Month has total 4 categories 

Duration has total 338 categories 



In [196]:
df['Airline'].unique()


array(['IndiGo', 'Air India', 'SpiceJet', 'Jet Airways',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Multiple carriers Premium economy',
       'Trujet', 'Jet Airways Business'], dtype=object)

In [197]:
X_train['Total_Stops'].unique()


array([1, 2, 3, 0, 4], dtype=int64)

In [198]:
X_train['Additional_Info'].unique()

array([0, 1, 2, 5, 6, 4, 7, 8], dtype=int64)

In [199]:
df['Additional_Info'].unique()


array(['No Info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [200]:
df.head(3774	)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Dep_Hr,Dep_Min,Arr_Hr,Arr_Min,Duration_Hr,Duration_Min,Duration_bool
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No Info,3897,24,MAR,2019,22,20,1,10,2,50.0,170.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No Info,7662,1,MAY,2019,5,50,13,15,7,25.0,445.0
2,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5:25,1,No Info,6218,12,MAY,2019,18,5,23,30,5,25.0,325.0
3,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4:45,1,No Info,13302,1,MAR,2019,16,50,21,35,4,45.0,285.0
4,SpiceJet,24/06/2019,Kolkata,Banglore,CCU → BLR,09:00,11:25,2:25,0,No Info,3873,24,JUN,2019,9,0,11,25,2,25.0,145.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3769,Multiple carriers,15/06/2019,Delhi,Cochin,DEL → BOM → COK,11:25,19:15,7:50,1,No Info,8614,15,JUN,2019,11,25,19,15,7,50.0,470.0
3770,Multiple carriers,21/03/2019,Delhi,Cochin,DEL → BOM → COK,18:00,01:35,7:35,1,No Info,7363,21,MAR,2019,18,0,1,35,7,35.0,455.0
3771,Air India,15/05/2019,Delhi,Cochin,DEL → GOI → BOM → COK,22:00,19:15,21:15,2,No Info,11281,15,MAY,2019,22,0,19,15,21,15.0,1275.0
3772,GoAir,18/05/2019,Banglore,Delhi,BLR → DEL,11:40,14:30,2:50,0,No Info,4340,18,MAY,2019,11,40,14,30,2,50.0,170.0


In [201]:
X_test['Day'].max()

27

In [202]:
X_test.columns

Index(['Total_Stops', 'Additional_Info', 'Day', 'Month', 'Duration',
       'Airline_Air Asia', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Banglore', 'Source_Chennai', 'Source_Delhi', 'Source_Kolkata',
       'Source_Mumbai', 'Destination_Banglore', 'Destination_Cochin',
       'Destination_Delhi', 'Destination_Hyderabad', 'Destination_Kolkata',
       'Destination_New Delhi'],
      dtype='object')

In [203]:
df['Additional_Info'].unique()


array(['No Info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)