In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [3]:
##Reading the data
data = pd.read_csv('data/airlines_flights_data.csv')
data.head()

Unnamed: 0,index,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [4]:
data.shape

(300153, 12)

In [5]:
data.nunique()

index               300153
airline                  6
flight                1561
source_city              6
departure_time           6
stops                    3
arrival_time             6
destination_city         6
class                    2
duration               476
days_left               49
price                12157
dtype: int64

In [6]:
data.isna().sum()

index               0
airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [7]:
data.isnull().sum()

index               0
airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [8]:
data.duplicated().sum()

np.int64(0)

In [9]:
data['airline'].unique()

array(['SpiceJet', 'AirAsia', 'Vistara', 'GO_FIRST', 'Indigo',
       'Air_India'], dtype=object)

In [10]:
data['flight'].unique()

array(['SG-8709', 'SG-8157', 'I5-764', ..., '6E-7127', '6E-7259',
       'AI-433'], shape=(1561,), dtype=object)

In [11]:
#dropping flight
df = data.drop(['flight','index'],axis=1)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   source_city       300153 non-null  object 
 2   departure_time    300153 non-null  object 
 3   stops             300153 non-null  object 
 4   arrival_time      300153 non-null  object 
 5   destination_city  300153 non-null  object 
 6   class             300153 non-null  object 
 7   duration          300153 non-null  float64
 8   days_left         300153 non-null  int64  
 9   price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 22.9+ MB


In [18]:
num_features = [features for features in df.columns if df[features].dtype !='O']
print("Numerical features: ",num_features)
cat_features = [features for features in df.columns if df[features].dtype =='O']
print("Categorical features: ",cat_features)

Numerical features:  ['duration', 'days_left', 'price']
Categorical features:  ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']


In [19]:
df.head()

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [31]:
df['duration'] = df['duration'].fillna(df['duration'].median()).astype(str)

In [33]:
df['duration_hour'] = df['duration'].str.split(".").str[0].astype(int)
df['duration_minute'] = df['duration'].str.split(".").str[1].astype(int)

In [34]:
df.head()

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price,duration_hour,duration_minute
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953,2,17
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953,2,33
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956,2,17
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955,2,25
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955,2,33


In [35]:
df_1= df.drop('duration',axis=1)

In [36]:
df_1.head()

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,days_left,price,duration_hour,duration_minute
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,1,5953,2,17
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,1,5953,2,33
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,1,5956,2,17
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,1,5955,2,25
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,1,5955,2,33


In [37]:
num_features = [features for features in df_1.columns if df_1[features].dtype !='O']
print("Numerical features: ",num_features)
cat_features = [features for features in df_1.columns if df_1[features].dtype =='O']
print("Categorical features: ",cat_features)

Numerical features:  ['days_left', 'price', 'duration_hour', 'duration_minute']
Categorical features:  ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']


In [38]:
print(df_1['airline'].unique())
print(df_1['source_city'].unique())
print(df_1['departure_time'].unique())
print(df_1['stops'].unique())
print(df_1['arrival_time'].unique())
print(df_1['destination_city'].unique())
print(df_1['class'].unique())

['SpiceJet' 'AirAsia' 'Vistara' 'GO_FIRST' 'Indigo' 'Air_India']
['Delhi' 'Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai']
['Evening' 'Early_Morning' 'Morning' 'Afternoon' 'Night' 'Late_Night']
['zero' 'one' 'two_or_more']
['Night' 'Morning' 'Early_Morning' 'Afternoon' 'Evening' 'Late_Night']
['Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai' 'Delhi']
['Economy' 'Business']


In [39]:
#Assigining inputs and ioutput
x = df_1.drop('price',axis=1)
y = df_1['price']
x.shape, y.shape

((300153, 10), (300153,))

In [40]:
num_features = [features for features in x.columns if x[features].dtype !='O']
print("Numerical features: ",num_features)
cat_features = [features for features in x.columns if x[features].dtype =='O']
print("Categorical features: ",cat_features)

Numerical features:  ['days_left', 'duration_hour', 'duration_minute']
Categorical features:  ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']


In [41]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

one_hot = OneHotEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer([
    ('one hot',one_hot,cat_features),
    ('scaler',scaler,num_features)
],remainder='passthrough')

In [42]:
x_scaled =preprocessor.fit_transform(x)

In [43]:
#Spltting the data using train test split
x_train,x_test,y_train,y_test = train_test_split(x_scaled,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((240122, 38), (60031, 38), (240122,), (60031,))

In [45]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet,LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [46]:
#Training the model
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "XGBRegressor": XGBRegressor()
}

def Metrics(truth,predicted):
    r2 = r2_score(truth,predicted)
    n = x_train.shape[0]
    p = x_train.shape[1]
    adjusted_r2 = 1- ((1-r2)/(n-p-1))
    mse = mean_squared_error(truth,predicted)
    mae = mean_absolute_error(truth,predicted)
    rmse = np.sqrt(mse)
    return r2,adjusted_r2,mse,mae,rmse

In [47]:
for i in range(len(list(models.keys()))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    r2_train,adjusted_r2_train,mse_train,mae_train,rmse_train = Metrics(y_train,y_train_pred)
    r2_test,adjusted_r2_test,mse_test,mae_test,rmse_test = Metrics(y_test,y_test_pred)

    print(list(models.keys())[i])
    print("Performance metrics of Train")
    
    print("- R2 Score: {:.4f}".format(r2_train))
    print("- Adjusted R2 Score: {:.4f}".format(adjusted_r2_train))
    print("- Mean Squared Error: {:.4f}".format(mse_train))
    print("- Mean Absolute Error: {:.4f}".format(mae_train))
    print("- Root Mean Squared Error: {:.4f}".format(rmse_train))
    print("--------------------------------------")
    print("- Performance metrics of Test")
    
    print("- R2 Score: {:.4f}".format(r2_test))
    print("- Adjusted R2 Score: {:.4f}".format(adjusted_r2_test))
    print("- Mean Squared Error: {:.4f}".format(mse_test))
    print("- Mean Absolute Error: {:.4f}".format(mae_test))
    print("- Root Mean Squared Error: {:.4f}".format(rmse_test))

    print("="*32)

Linear Regression
Performance metrics of Train
- R2 Score: 0.9115
- Adjusted R2 Score: 1.0000
- Mean Squared Error: 45577089.6692
- Mean Absolute Error: 4574.1158
- Root Mean Squared Error: 6751.0806
--------------------------------------
- Performance metrics of Test
- R2 Score: 0.9114
- Adjusted R2 Score: 1.0000
- Mean Squared Error: 45695454.0392
- Mean Absolute Error: 4552.0222
- Root Mean Squared Error: 6759.8413
Lasso
Performance metrics of Train
- R2 Score: 0.9115
- Adjusted R2 Score: 1.0000
- Mean Squared Error: 45577637.3183
- Mean Absolute Error: 4572.2891
- Root Mean Squared Error: 6751.1212
--------------------------------------
- Performance metrics of Test
- R2 Score: 0.9114
- Adjusted R2 Score: 1.0000
- Mean Squared Error: 45696747.5603
- Mean Absolute Error: 4550.3007
- Root Mean Squared Error: 6759.9369
Ridge
Performance metrics of Train
- R2 Score: 0.9115
- Adjusted R2 Score: 1.0000
- Mean Squared Error: 45577094.2567
- Mean Absolute Error: 4574.0359
- Root Mean Squar