## 1. Import Libraries

In [3]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import matplotlib.pyplot as plt

import warnings

## 2. Display Settings

In [4]:
pd.set_option("display.max_columns", None)

In [5]:
sklearn.set_config(transform_output="pandas")

In [6]:
warnings.filterwarnings("ignore")

## 3. Read the Data

In [7]:
path = r"/home/sourabh/Flights-Sagemaker-Project/data/train.csv"

train = pd.read_csv(path)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-05-06,Delhi,Cochin,07:30:00,21:00:00,810,1.0,No Info,7191
1,Indigo,2019-04-06,Banglore,Delhi,23:30:00,02:20:00,170,0.0,No Info,4591
2,Jet Airways,2019-05-21,Kolkata,Banglore,14:05:00,10:05:00,1200,1.0,No Info,14388
3,Multiple Carriers,2019-05-15,Delhi,Cochin,15:00:00,01:30:00,630,1.0,No Info,13727
4,Multiple Carriers,2019-06-15,Delhi,Cochin,13:00:00,19:15:00,375,1.0,No Info,16108
...,...,...,...,...,...,...,...,...,...,...
635,Jet Airways,2019-06-27,Delhi,Cochin,19:10:00,12:35:00,1045,2.0,No Info,12819
636,Jet Airways,2019-03-01,Banglore,New Delhi,08:55:00,16:10:00,435,1.0,No Info,26890
637,Jet Airways,2019-05-15,Kolkata,Banglore,18:55:00,09:20:00,865,1.0,In-flight meal not included,9663
638,Indigo,2019-06-03,Banglore,Delhi,08:30:00,11:20:00,170,0.0,No Info,4823


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          640 non-null    object 
 1   date_of_journey  640 non-null    object 
 2   source           640 non-null    object 
 3   destination      640 non-null    object 
 4   dep_time         640 non-null    object 
 5   arrival_time     640 non-null    object 
 6   duration         640 non-null    int64  
 7   total_stops      640 non-null    float64
 8   additional_info  640 non-null    object 
 9   price            640 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 50.1+ KB


In [9]:
X_train = train.drop(columns="price")
y_train = train.price.copy()

## 4. Transformation Operations

## 4.1 airline

In [10]:
X_train.airline

0                 Indigo
1                 Indigo
2            Jet Airways
3      Multiple Carriers
4      Multiple Carriers
             ...        
635          Jet Airways
636          Jet Airways
637          Jet Airways
638               Indigo
639          Jet Airways
Name: airline, Length: 640, dtype: object

In [11]:
air_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
	("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

air_transformer.fit_transform(X_train.loc[:, ["airline"]])#.airline.value_counts()

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Other
0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
635,0.0,0.0,1.0,0.0,0.0
636,0.0,0.0,1.0,0.0,0.0
637,0.0,0.0,1.0,0.0,0.0
638,0.0,1.0,0.0,0.0,0.0


## 4.2 date_of_journey

In [12]:
X_train.date_of_journey

0      2019-05-06
1      2019-04-06
2      2019-05-21
3      2019-05-15
4      2019-06-15
          ...    
635    2019-06-27
636    2019-03-01
637    2019-05-15
638    2019-06-03
639    2019-05-12
Name: date_of_journey, Length: 640, dtype: object

In [13]:
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
	("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
	("scaler", MinMaxScaler())
])

doj_transformer.fit_transform(X_train.loc[:, ["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.666667,0.588235,0.000000,0.559322
1,0.333333,0.294118,0.833333,0.305085
2,0.666667,0.705882,0.166667,0.686441
3,0.666667,0.647059,0.333333,0.635593
4,1.000000,0.882353,0.833333,0.898305
...,...,...,...,...
635,1.000000,1.000000,0.500000,1.000000
636,0.000000,0.000000,0.666667,0.000000
637,0.666667,0.647059,0.333333,0.635593
638,1.000000,0.823529,0.000000,0.796610


## 4.3 source & destination

In [14]:
X_train.source

0         Delhi
1      Banglore
2       Kolkata
3         Delhi
4         Delhi
         ...   
635       Delhi
636    Banglore
637     Kolkata
638    Banglore
639     Kolkata
Name: source, Length: 640, dtype: object

In [15]:
X_train.destination

0         Cochin
1          Delhi
2       Banglore
3         Cochin
4         Cochin
         ...    
635       Cochin
636    New Delhi
637     Banglore
638        Delhi
639     Banglore
Name: destination, Length: 640, dtype: object

In [16]:
location_subset = X_train.loc[:, ["source", "destination"]]
location_subset


Unnamed: 0,source,destination
0,Delhi,Cochin
1,Banglore,Delhi
2,Kolkata,Banglore
3,Delhi,Cochin
4,Delhi,Cochin
...,...,...
635,Delhi,Cochin
636,Banglore,New Delhi
637,Kolkata,Banglore
638,Banglore,Delhi


In [17]:
location_pipe1 = Pipeline(steps=[
	("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
	("encoder", MeanEncoder()),
	("scaler", PowerTransformer())
])

location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,1.034326,1.033284
1,-0.925083,-1.865078
2,-0.057908,-0.072615
3,1.034326,1.033284
4,1.034326,1.033284
...,...,...
635,1.034326,1.033284
636,-0.925083,-0.852428
637,-0.057908,-0.072615
638,-0.925083,-1.865078


In [18]:
np.union1d(
	X_train.source.unique(),
	X_train.destination.unique()
)

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [23]:
def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
		X
		.assign(**{
			f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
			for col in columns
		})
		.drop(columns=columns)
	)


FunctionTransformer(func=is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,1,0
1,0,1
2,1,0
3,1,0
4,1,0
...,...,...
635,1,0
636,0,1
637,1,0
638,0,1


In [24]:
location_transformer = FeatureUnion(transformer_list=[
	("part1", location_pipe1),
	("part2", FunctionTransformer(func=is_north))
])

location_transformer.fit_transform(location_subset, y_train)


Unnamed: 0,source,destination,source_is_north,destination_is_north
0,1.034326,1.033284,1,0
1,-0.925083,-1.865078,0,1
2,-0.057908,-0.072615,1,0
3,1.034326,1.033284,1,0
4,1.034326,1.033284,1,0
...,...,...,...,...
635,1.034326,1.033284,1,0
636,-0.925083,-0.852428,0,1
637,-0.057908,-0.072615,1,0
638,-0.925083,-1.865078,0,1


## 4.4 dep_time & arrival_time

In [25]:
X_train.dep_time

0      07:30:00
1      23:30:00
2      14:05:00
3      15:00:00
4      13:00:00
         ...   
635    19:10:00
636    08:55:00
637    18:55:00
638    08:30:00
639    18:55:00
Name: dep_time, Length: 640, dtype: object

In [26]:
X_train.arrival_time

0      21:00:00
1      02:20:00
2      10:05:00
3      01:30:00
4      19:15:00
         ...   
635    12:35:00
636    16:10:00
637    09:20:00
638    11:20:00
639    08:15:00
Name: arrival_time, Length: 640, dtype: object

In [27]:
time_subset = X_train.loc[:, ["dep_time", "arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,07:30:00,21:00:00
1,23:30:00,02:20:00
2,14:05:00,10:05:00
3,15:00:00,01:30:00
4,13:00:00,19:15:00
...,...,...
635,19:10:00,12:35:00
636,08:55:00,16:10:00
637,18:55:00,09:20:00
638,08:30:00,11:20:00


In [28]:
time_pipe1 = Pipeline(steps=[
	("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
	("scaler", MinMaxScaler())
])

time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.304348,0.545455,0.913043,0.000000
1,1.000000,0.545455,0.086957,0.363636
2,0.608696,0.090909,0.434783,0.090909
3,0.652174,0.000000,0.043478,0.545455
4,0.565217,0.000000,0.826087,0.272727
...,...,...,...,...
635,0.826087,0.181818,0.521739,0.636364
636,0.347826,1.000000,0.695652,0.181818
637,0.782609,1.000000,0.391304,0.363636
638,0.347826,0.545455,0.478261,0.363636


In [30]:
def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
		col: pd.to_datetime(X.loc[:, col]).dt.hour
		for col in columns
	})

    return (
		X_temp
		.assign(**{
			f"{col}_part_of_day": np.select(
				[X_temp.loc[:, col].between(morning, noon, inclusive="left"),
				 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
				 X_temp.loc[:, col].between(eve, night, inclusive="left")],
				["morning", "afternoon", "evening"],
				default="night"
			)
			for col in columns
		})
		.drop(columns=columns)
	)

FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,morning,night
1,night,night
2,afternoon,morning
3,afternoon,night
4,afternoon,evening
...,...,...
635,evening,afternoon
636,morning,evening
637,evening,morning
638,morning,morning


In [31]:
time_pipe2 = Pipeline(steps=[
	("part", FunctionTransformer(func=part_of_day)),
	("encoder", CountFrequencyEncoder()),
	("scaler", MinMaxScaler())
])

time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,1.000000,0.903226
1,0.116162,0.903226
2,0.000000,1.000000
3,0.000000,0.903226
4,0.000000,0.806452
...,...,...
635,0.176768,0.000000
636,1.000000,0.806452
637,0.176768,1.000000
638,1.000000,1.000000


In [32]:
time_transformer = FeatureUnion(transformer_list=[
	("part1", time_pipe1),
	("part2", time_pipe2)
])

time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.304348,0.545455,0.913043,0.000000,1.000000,0.903226
1,1.000000,0.545455,0.086957,0.363636,0.116162,0.903226
2,0.608696,0.090909,0.434783,0.090909,0.000000,1.000000
3,0.652174,0.000000,0.043478,0.545455,0.000000,0.903226
4,0.565217,0.000000,0.826087,0.272727,0.000000,0.806452
...,...,...,...,...,...,...
635,0.826087,0.181818,0.521739,0.636364,0.176768,0.000000
636,0.347826,1.000000,0.695652,0.181818,1.000000,0.806452
637,0.782609,1.000000,0.391304,0.363636,0.176768,1.000000
638,0.347826,0.545455,0.478261,0.363636,1.000000,1.000000


## 4.5 duration

In [33]:
X_train.duration

0       810
1       170
2      1200
3       630
4       375
       ... 
635    1045
636     435
637     865
638     170
639     800
Name: duration, Length: 640, dtype: int64

In [34]:
(
	X_train
	.duration
	.quantile([0.25, 0.5, 0.75])
	.values
	.reshape(-1, 1)
	# .shape
)

array([[175. ],
       [480. ],
       [927.5]])

In [37]:
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma

def fit(self, X, y=None):
		if not self.variables:
			self.variables = X.select_dtypes(include="number").columns.to_list()

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables
		}

		return self

def transform(self, X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns
			)
            objects.append(obj)
        return pd.concat(objects, axis=1)

In [59]:
RBFPercentileSimilarity(percentiles=[0.4, 0.8]).fit_transform(X_train)


AttributeError: 'RBFPercentileSimilarity' object has no attribute 'fit'

In [40]:
def duration_category(X, short=180, med=400):
	return (
		X
		.assign(duration_cat=np.select([X.duration.lt(short),
									    X.duration.between(short, med, inclusive="left")],
									   ["short", "medium"],
									   default="long"))
		.drop(columns="duration")
	)
def is_over(X, value=1000):
	return (
		X
		.assign(**{
			f"duration_over_{value}": X.duration.ge(value).astype(int)
		})
		.drop(columns="duration")
	)
duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])

duration_transformer.fit_transform(X_train.loc[:, ["duration"]])

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RBFPercentileSimilarity()' (type <class '__main__.RBFPercentileSimilarity'>) doesn't

## 4.6 total_stops

In [42]:
X_train.total_stops

0      1.0
1      0.0
2      1.0
3      1.0
4      1.0
      ... 
635    2.0
636    1.0
637    1.0
638    0.0
639    1.0
Name: total_stops, Length: 640, dtype: float64

In [43]:
def is_direct(X):
    	return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("", FunctionTransformer(func=is_direct))
])

total_stops_transformer.fit_transform(X_train.loc[:, ["total_stops"]])


Unnamed: 0,total_stops,is_direct_flight
0,1.0,0
1,0.0,1
2,1.0,0
3,1.0,0
4,1.0,0
...,...,...
635,2.0,0
636,1.0,0
637,1.0,0
638,0.0,1


## 4.7 additional_info

In [44]:
X_train.additional_info

0                          No Info
1                          No Info
2                          No Info
3                          No Info
4                          No Info
                  ...             
635                        No Info
636                        No Info
637    In-flight meal not included
638                        No Info
639    In-flight meal not included
Name: additional_info, Length: 640, dtype: object

In [45]:
info_pipe1 = Pipeline(steps=[
	("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

info_pipe1.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
635,0.0,1.0,0.0
636,0.0,1.0,0.0
637,1.0,0.0,0.0
638,0.0,1.0,0.0


In [46]:
def have_info(X):
	return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

In [47]:
info_union = FeatureUnion(transformer_list=[
	("part1", info_pipe1),
	("part2", FunctionTransformer(func=have_info))
])

In [48]:
info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

info_transformer.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other,additional_info
0,0.0,1.0,0.0,0
1,0.0,1.0,0.0,0
2,0.0,1.0,0.0,0
3,0.0,1.0,0.0,0
4,0.0,1.0,0.0,0
...,...,...,...,...
635,0.0,1.0,0.0,0
636,0.0,1.0,0.0,0
637,1.0,0.0,0.0,1
638,0.0,1.0,0.0,0


## 4.8 Column Transformer

In [51]:
column_transformer = ColumnTransformer(transformers=[
	("air", air_transformer, ["airline"]),
	("doj", doj_transformer, ["date_of_journey"]),
	("location", location_transformer, ["source", 'destination']),
	("time", time_transformer, ["dep_time", "arrival_time"]),
	("dur", duration_transformer, ["duration"]),
	("stops", total_stops_transformer, ["total_stops"]),
	("info", info_transformer, ["additional_info"])
], remainder="passthrough")

column_transformer.fit_transform(X_train, y_train)

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RBFPercentileSimilarity()' (type <class '__main__.RBFPercentileSimilarity'>) doesn't

## 5. Feature Selection


In [52]:
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
) 


## 6. Putting it all Together

In [53]:
preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])

preprocessor.fit_transform(X_train, y_train)

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RBFPercentileSimilarity()' (type <class '__main__.RBFPercentileSimilarity'>) doesn't

## 7. Visualizations

In [54]:
feature_performances = preprocessor.named_steps["selector"].feature_performance_
feature_performances

AttributeError: 'SelectBySingleFeaturePerformance' object has no attribute 'feature_performance_'

In [55]:
sorted_feat_imp = dict(sorted(feature_performances.items(), key=lambda val: val[1]))
sorted_feat_imp

NameError: name 'feature_performances' is not defined

In [57]:
THRESHOLD = 0.1

selected_bar = None
dropped_bar = None
colors = ["red" if score < THRESHOLD else "green" for score in sorted_feat_imp.values()]


fig, ax = plt.subplots(figsize=(15, 4)) 

for i, (feature, score) in enumerate(sorted_feat_imp.items()):
	params = dict(
		x=i,
		height=score,
		edgecolor="black",
		alpha=0.5
	)
	
	if score < THRESHOLD:
		bar = ax.bar(
			color="red",
			**params
		)
		if not dropped_bar:
			dropped_bar = bar[0]
	else:
		bar = ax.bar(
			color="green",
			**params
		)
		if not selected_bar:
			selected_bar = bar[0]

thresh_line = ax.axhline(
	y=0.1,
	color="black",
	linestyle="--"
)

ax.set_xticks(
	ticks=range(len(sorted_feat_imp)),
	labels=list(sorted_feat_imp.keys()),
	rotation=30,
	ha="right"
)

ax.set(
	xlabel="Feature",
	ylabel="Score",
	title="Feature Selection Scores"
)

ax.legend(
	handles=[selected_bar, dropped_bar, thresh_line],
	labels=["Selected", "Dropped", "Threshold"],
	loc="upper left"
)

plt.show()


NameError: name 'sorted_feat_imp' is not defined

-The dataset went upto 31 columns after Feature Engineering

-The Feature Selection algorithm selected 13 features out of that