In [1]:
import os
import pandas as pd
import numpy as np
import opendatasets as od
import warnings
warnings.filterwarnings("ignore")

In [2]:
files = os.listdir('.')
if "flight-fare-prediction-mh" not in files:
    od.download("https://www.kaggle.com/datasets/nikhilmittal/flight-fare-prediction-mh")

dataset = pd.read_excel("flight-fare-prediction-mh/Data_Train.xlsx")

In [3]:
dataset.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


### Missing values

In [5]:
dataset.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

 - In Route and Total stops have each null values

In [6]:
nan_in_col  = dataset[dataset['Route'].isnull()]

nan_in_col

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


In [7]:
route_mode = dataset['Route'].mode()
route_mode

0    DEL → BOM → COK
Name: Route, dtype: object

In [8]:
total_stops_mode = dataset['Total_Stops'].mode()
total_stops_mode

0    1 stop
Name: Total_Stops, dtype: object

 - As per the dataset the source of the flight is Delhi and destination is Cochin so the Route must be Delhi to Cochin.
 - The most occured data in Route column is DEL → BOM → COK and it also matches the source and destination places so we can fill this data as null values
 - The most occured data in Total_Stops column is 1 stop and it also matches the source,route and destination places so we can fill this data as null values

##### Filling the null values

In [9]:
dataset = dataset.apply(lambda x: x.fillna(x.value_counts().index[0]))
dataset.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

#### taking care of duplicate data

In [10]:
dataset.duplicated().sum()

220

In [11]:
dataset[dataset.duplicated(keep = 'first')]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
683,Jet Airways,1/06/2019,Delhi,Cochin,DEL → NAG → BOM → COK,14:35,04:25 02 Jun,13h 50m,2 stops,No info,13376
1061,Air India,21/05/2019,Delhi,Cochin,DEL → GOI → BOM → COK,22:00,19:15 22 May,21h 15m,2 stops,No info,10231
1348,Air India,18/05/2019,Delhi,Cochin,DEL → HYD → BOM → COK,17:15,19:15 19 May,26h,2 stops,No info,12392
1418,Jet Airways,6/06/2019,Delhi,Cochin,DEL → JAI → BOM → COK,05:30,04:25 07 Jun,22h 55m,2 stops,In-flight meal not included,10368
1674,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,18:25,21:20,2h 55m,non-stop,No info,7303
...,...,...,...,...,...,...,...,...,...,...,...
10594,Jet Airways,27/06/2019,Delhi,Cochin,DEL → AMD → BOM → COK,23:05,12:35 28 Jun,13h 30m,2 stops,No info,12819
10616,Jet Airways,1/06/2019,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,12:35 02 Jun,26h 55m,2 stops,No info,13014
10634,Jet Airways,6/06/2019,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,12:35 07 Jun,26h 55m,2 stops,In-flight meal not included,11733
10672,Jet Airways,27/06/2019,Delhi,Cochin,DEL → AMD → BOM → COK,23:05,19:00 28 Jun,19h 55m,2 stops,In-flight meal not included,11150


there are more duplicate values are present in the dataset which excatly same as the other data rows so its not worth to keep the data 

##### droping the duplicate data

In [12]:
dataset.drop_duplicates(keep=False, inplace=True)

Arround 220 rows are droped with same data

#### Spliting Date of Journey into day month and year

In [13]:
dataset['Day_of_Journey'] = dataset['Date_of_Journey'].str.split('/').str[0].astype(int)
dataset['Month_of_Journey'] = dataset['Date_of_Journey'].str.split('/').str[1].astype(int)
dataset['Year_of_Journey'] = dataset['Date_of_Journey'].str.split('/').str[2].astype(int)

In [14]:
dataset.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,2019
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,2019
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6,2019
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5,2019
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3,2019


#### droping the Date of Journey

In [15]:
dataset.drop('Date_of_Journey', axis=1, inplace=True)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10268 entries, 0 to 10682
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Airline           10268 non-null  object
 1   Source            10268 non-null  object
 2   Destination       10268 non-null  object
 3   Route             10268 non-null  object
 4   Dep_Time          10268 non-null  object
 5   Arrival_Time      10268 non-null  object
 6   Duration          10268 non-null  object
 7   Total_Stops       10268 non-null  object
 8   Additional_Info   10268 non-null  object
 9   Price             10268 non-null  int64 
 10  Day_of_Journey    10268 non-null  int32 
 11  Month_of_Journey  10268 non-null  int32 
 12  Year_of_Journey   10268 non-null  int32 
dtypes: int32(3), int64(1), object(9)
memory usage: 1002.7+ KB


#### Spliting hours and minutes from the Dep_Time and droping Dep_Time

In [16]:
dataset['Dep_Time_Hour'] = dataset['Dep_Time'].str.split(':').str[0].astype(int)
dataset['Dep_Time_minute'] = dataset['Dep_Time'].str.split(':').str[1].astype(int)
dataset.drop('Dep_Time', axis=1, inplace=True)

In [17]:
dataset.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,2019,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,13:15,7h 25m,2 stops,No info,7662,1,5,2019,5,50


#### taking care of Arrival time

In arrival time column have the time with date if the flight arrived at mid night so column contains the next day date so removing it and creating separate column as Arrival day

 - if the travell ends with same day then we add 0 for 'Same day of DOJ' otherwise 1

In [18]:
dataset['Arrival_day'] = dataset['Arrival_Time'].apply(lambda x: 1 if len(x) > 5 else 0)

dataset['Arrival_Time'] = dataset['Arrival_Time'].apply(lambda x: x.split(' ')[0] if len(x)== 12 else x)

In [19]:
dataset['Arrival_Time_Hour'] = dataset['Arrival_Time'].str.split(':').str[0].astype(int)
dataset['Arrival_Time_minute'] = dataset['Arrival_Time'].str.split(':').str[1].astype(int)
dataset.drop('Arrival_Time', axis=1, inplace=True)

In [20]:
dataset.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,2019,22,20,1,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,1,5,2019,5,50,0,13,15


#### Droping the Route column

Beacause the counts of places in between the source and destination is already added in the total stops so its ok to remove the Route column

In [21]:
dataset.drop('Route', axis=1, inplace=True)
dataset.head(2)

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute
0,IndiGo,Banglore,New Delhi,2h 50m,non-stop,No info,3897,24,3,2019,22,20,1,1,10
1,Air India,Kolkata,Banglore,7h 25m,2 stops,No info,7662,1,5,2019,5,50,0,13,15


#### Converting Duration into minutes and droping duration column

In [22]:
dataset['Duration_min'] = (pd.to_timedelta(dataset['Duration']).dt.seconds // 60)
dataset.drop('Duration', axis=1, inplace=True)

In [23]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10268 entries, 0 to 10682
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Airline              10268 non-null  object
 1   Source               10268 non-null  object
 2   Destination          10268 non-null  object
 3   Total_Stops          10268 non-null  object
 4   Additional_Info      10268 non-null  object
 5   Price                10268 non-null  int64 
 6   Day_of_Journey       10268 non-null  int32 
 7   Month_of_Journey     10268 non-null  int32 
 8   Year_of_Journey      10268 non-null  int32 
 9   Dep_Time_Hour        10268 non-null  int32 
 10  Dep_Time_minute      10268 non-null  int32 
 11  Arrival_day          10268 non-null  int64 
 12  Arrival_Time_Hour    10268 non-null  int32 
 13  Arrival_Time_minute  10268 non-null  int32 
 14  Duration_min         10268 non-null  int64 
dtypes: int32(7), int64(3), object(5)
memory usage: 1002.7

#### Taking care of Additional info

In [24]:
dataset['Additional_Info'].unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In the Additional Info column there are some values are same like No info and No Info so combining them

In [25]:
dataset['Additional_Info'] = dataset['Additional_Info'].apply(lambda x: 'No info' if x =='No Info' else x)

In [26]:
dataset['Additional_Info'].unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

#### checking the Total stops column 

In [27]:
dataset['Total_Stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

maping the values of Total stops into integer as 'non-stop' to 0, '2 stops' to 2, '1 stop' to 1, '3 stops' to 3 and '4 stops' to 4

In [28]:
dataset['Total_Stops'] = dataset['Total_Stops'].map({'non-stop': 0, '2 stops': 2, '1 stop': 1, '3 stops': 3, '4 stops': 4})

In [29]:
dataset['Total_Stops'].unique()

array([0, 2, 1, 3, 4], dtype=int64)

In [30]:
dataset.head(2)

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute,Duration_min
0,IndiGo,Banglore,New Delhi,0,No info,3897,24,3,2019,22,20,1,1,10,170
1,Air India,Kolkata,Banglore,2,No info,7662,1,5,2019,5,50,0,13,15,445


Some of the coulumns have to converted by onehot encoding the columns are: ['Airline','Source','Destination','Additional_info']

#### Converting categorical columns to numeric using

 - Label encoding
 - One-Hot encoding
 - By using dummies
 - Frequency encoding

#### When to use Label encoding, One-Hot encoding ?
There are some cases where LabelEncoder or DictVectorizor are useful, but these are quite limited in my opinion due to ordinality.

LabelEncoder can turn [dog,cat,dog,mouse,cat] into [1,2,1,3,2], but then the imposed ordinality means that the average of dog and mouse is cat. Still there are algorithms like decision trees and random forests that can work with categorical variables just fine and LabelEncoder can be used to store values using less disk space.

One-Hot-Encoding has the advantage that the result is binary rather than ordinal and that everything sits in an orthogonal vector space. The disadvantage is that for high cardinality, the feature space can really blow up quickly and you start fighting with the curse of dimensionality. In these cases, I typically employ one-hot-encoding followed by PCA for dimensionality reduction. I find that the judicious combination of one-hot plus PCA can seldom be beat by other encoding schemes. PCA finds the linear overlap, so will naturally tend to group similar features into the same feature.

In [31]:
categorical_columns = [i for i in dataset.columns if dataset[i].dtype == 'O']
categorical_columns

['Airline', 'Source', 'Destination', 'Additional_Info']

#### 1. Label Encoding

In [32]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data = dataset.copy()
categorical_columns = [i for i in data.columns if data[i].dtype == 'O']
data[categorical_columns] = data[categorical_columns].apply(lambda col: encoder.fit_transform(col))
data.head(2)

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute,Duration_min
0,3,0,5,0,7,3897,24,3,2019,22,20,1,1,10,170
1,1,3,0,2,7,7662,1,5,2019,5,50,0,13,15,445


In [33]:
data.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute,Duration_min
0,3,0,5,0,7,3897,24,3,2019,22,20,1,1,10,170
1,1,3,0,2,7,7662,1,5,2019,5,50,0,13,15,445
2,4,2,1,2,7,13882,9,6,2019,9,25,1,4,25,1140
3,3,3,0,1,7,6218,12,5,2019,18,5,0,23,30,325
4,3,0,5,1,7,13302,1,3,2019,16,50,0,21,35,285


#### 2. One-Hot encoding

In [34]:
from sklearn.preprocessing import OneHotEncoder

one_hot_data = dataset.copy()
ohe = OneHotEncoder()
array_hot_encoded = ohe.fit_transform(one_hot_data[categorical_columns])

data_hot_encoded = pd.DataFrame(array_hot_encoded.toarray())
data_hot_encoded.index = one_hot_data.index
data_other_cols = one_hot_data.drop(columns=categorical_columns)

data_out = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [35]:
data_out

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute,Duration_min
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3897,24,3,2019,22,20,1,1,10,170
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7662,1,5,2019,5,50,0,13,15,445
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,13882,9,6,2019,9,25,1,4,25,1140
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6218,12,5,2019,18,5,0,23,30,325
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13302,1,3,2019,16,50,0,21,35,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4107,9,4,2019,19,55,0,22,25,150
10679,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4145,27,4,2019,20,45,0,23,20,155
10680,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,7229,27,4,2019,8,20,0,11,20,180
10681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12648,1,3,2019,11,30,0,14,10,160


One hot encoding is best with the Standardization and PCA for better ML model so here it is

In [36]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_std=scaler.fit_transform(array_hot_encoded.toarray())
df_std = pd.DataFrame(df_std)
data_std_out = pd.concat([df_std, data_other_cols], axis=1)

#### the standardized dataframe

In [37]:
data_std_out.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute,Duration_min
0,-0.179063,-0.436771,-0.138771,2.012626,-0.729908,-0.02418,-0.36309,-0.035604,-0.293038,-0.009869,...,3897.0,24.0,3.0,2019.0,22.0,20.0,1.0,1.0,10.0,170.0
1,-0.179063,2.289529,-0.138771,-0.496863,-0.729908,-0.02418,-0.36309,-0.035604,-0.293038,-0.009869,...,7662.0,1.0,5.0,2019.0,5.0,50.0,0.0,13.0,15.0,445.0
2,-0.179063,-0.436771,-0.138771,-0.496863,1.370035,-0.02418,-0.36309,-0.035604,-0.293038,-0.009869,...,13882.0,9.0,6.0,2019.0,9.0,25.0,1.0,4.0,25.0,1140.0
3,-0.179063,-0.436771,-0.138771,2.012626,-0.729908,-0.02418,-0.36309,-0.035604,-0.293038,-0.009869,...,6218.0,12.0,5.0,2019.0,18.0,5.0,0.0,23.0,30.0,325.0
4,-0.179063,-0.436771,-0.138771,2.012626,-0.729908,-0.02418,-0.36309,-0.035604,-0.293038,-0.009869,...,13302.0,1.0,3.0,2019.0,16.0,50.0,0.0,21.0,35.0,285.0


#### The PCA for one hot encoding and standardized data

In [38]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10, svd_solver='full', random_state=1001)
X_pca = pca.fit_transform(df_std)
print('Explained variance: %.4f' % pca.explained_variance_ratio_.sum())

print('Individual variance contributions:')
for j in range(10):
    print(pca.explained_variance_ratio_[j])

Explained variance: 0.6217
Individual variance contributions:
0.10784493013950167
0.08901658214428053
0.07781646985909317
0.07182784433118948
0.0667715496903763
0.050594859983221746
0.04855418853454099
0.04153628403378282
0.034387658359995
0.03339890486005541


In [39]:
pca_data = pd.DataFrame(X_pca)
data_pca_out = pd.concat([pca_data, data_other_cols], axis=1)

In [40]:
data_pca_out.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute,Duration_min
0,0.780019,2.127828,2.102504,-1.040555,0.031115,0.124425,-0.810705,0.816047,-1.988756,0.349655,...,3897.0,24.0,3.0,2019.0,22.0,20.0,1.0,1.0,10.0,170.0
1,1.549258,-0.276678,-1.856255,-1.752227,0.74302,0.240574,0.169236,1.387898,0.211542,-0.727457,...,7662.0,1.0,5.0,2019.0,5.0,50.0,0.0,13.0,15.0,445.0
2,-1.798203,-0.763935,0.13317,0.096577,-0.10422,-0.05098,-0.270555,-0.030176,0.044978,-0.020689,...,13882.0,9.0,6.0,2019.0,9.0,25.0,1.0,4.0,25.0,1140.0
3,1.596324,0.066056,-1.822319,-1.456525,0.375504,0.015019,-0.613888,-0.94011,-1.57951,-0.241041,...,6218.0,12.0,5.0,2019.0,18.0,5.0,0.0,23.0,30.0,325.0
4,0.780019,2.127828,2.102504,-1.040555,0.031115,0.124425,-0.810705,0.816047,-1.988756,0.349655,...,13302.0,1.0,3.0,2019.0,16.0,50.0,0.0,21.0,35.0,285.0


Almost all one hot encoding data is important by looking at the after PCA data because all the columns share equal information about the data

#### 3. By using dummies

In [41]:
for_dummie_data = dataset.copy()
dummie_data = pd.get_dummies(data=for_dummie_data, columns=categorical_columns)
dummie_data.head(2)

Unnamed: 0,Total_Stops,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute,...,Destination_New Delhi,Additional_Info_1 Long layover,Additional_Info_1 Short layover,Additional_Info_2 Long layover,Additional_Info_Business class,Additional_Info_Change airports,Additional_Info_In-flight meal not included,Additional_Info_No check-in baggage included,Additional_Info_No info,Additional_Info_Red-eye flight
0,0,3897,24,3,2019,22,20,1,1,10,...,1,0,0,0,0,0,0,0,1,0
1,2,7662,1,5,2019,5,50,0,13,15,...,0,0,0,0,0,0,0,0,1,0


#### Frequency Encoding

In [42]:
freq_data_enc = dataset.copy()
for i in categorical_columns:
    encoding = freq_data_enc.groupby(i).size()
    encoding = encoding/len(freq_data_enc)
    freq_data_enc[i] = freq_data_enc[i].map(encoding)

freq_data_enc.head(10)

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hour,Dep_Time_minute,Arrival_day,Arrival_Time_Hour,Arrival_Time_minute,Duration_min
0,0.197994,0.21046,0.087261,0,0.783307,3897,24,3,2019,22,20,1,1,10,170
1,0.160206,0.277561,0.277561,2,0.783307,7662,1,5,2019,5,50,0,13,15,445
2,0.347585,0.406993,0.406993,2,0.783307,13882,9,6,2019,9,25,1,4,25,1140
3,0.197994,0.277561,0.277561,1,0.783307,6218,12,5,2019,18,5,0,23,30,325
4,0.197994,0.21046,0.087261,1,0.783307,13302,1,3,2019,16,50,0,21,35,285
5,0.079081,0.277561,0.277561,0,0.783307,3873,24,6,2019,9,0,0,11,25,145
6,0.347585,0.21046,0.087261,1,0.182704,11087,12,3,2019,18,55,1,10,25,930
7,0.347585,0.21046,0.087261,1,0.783307,22270,1,3,2019,8,0,1,5,5,1265
8,0.347585,0.21046,0.087261,1,0.182704,11087,12,3,2019,8,55,1,10,25,90
9,0.116478,0.406993,0.406993,1,0.783307,8625,27,5,2019,11,25,0,19,15,470
