## 1. Importing Libraries

In [120]:
import pandas as pd 
import numpy as np

import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder, MinMaxScaler, TargetEncoder,
    PowerTransformer, FunctionTransformer
)
from sklearn.compose import ColumnTransformer


from feature_engine.encoding import RareLabelEncoder, MeanEncoder, CountFrequencyEncoder
from feature_engine.datetime import DatetimeFeatures

import warnings

## 2. Display Setting 

In [4]:
pd.set_option('display.max_columns', None)
sklearn.set_config(transform_output='pandas')
warnings.filterwarnings('ignore')

## 3. Reading Data 

In [7]:
train = pd.read_csv('../Data/train.csv')
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,No info,17024
1,Vistara,2019-03-24,Kolkata,Banglore,07:10:00,18:45:00,695,1.0,No info,16932
2,Spicejet,2019-04-09,Banglore,Delhi,09:30:00,12:20:00,170,0.0,No info,4423
3,Indigo,2019-04-27,Banglore,Delhi,21:15:00,00:15:00,180,0.0,No info,3943
4,Air India,2019-06-12,Delhi,Cochin,09:45:00,09:25:00,1420,1.0,No info,7480


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6836 entries, 0 to 6835
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          6836 non-null   object 
 1   date_of_journey  6836 non-null   object 
 2   source           6836 non-null   object 
 3   destination      6836 non-null   object 
 4   dep_time         6836 non-null   object 
 5   arrival_time     6836 non-null   object 
 6   duration         6836 non-null   int64  
 7   total_stops      6835 non-null   float64
 8   additional_info  6836 non-null   object 
 9   price            6836 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 534.2+ KB


In [10]:
X_train = train.drop(columns='price')
y_train = train['price'].copy()

In [11]:
X_train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info
0,Jet Airways,2019-03-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,No info
1,Vistara,2019-03-24,Kolkata,Banglore,07:10:00,18:45:00,695,1.0,No info
2,Spicejet,2019-04-09,Banglore,Delhi,09:30:00,12:20:00,170,0.0,No info
3,Indigo,2019-04-27,Banglore,Delhi,21:15:00,00:15:00,180,0.0,No info
4,Air India,2019-06-12,Delhi,Cochin,09:45:00,09:25:00,1420,1.0,No info
...,...,...,...,...,...,...,...,...,...
6831,Indigo,2019-06-03,Banglore,Delhi,04:00:00,06:50:00,170,0.0,No info
6832,Indigo,2019-06-24,Delhi,Cochin,05:05:00,16:10:00,665,1.0,No info
6833,Air India,2019-05-09,Banglore,Delhi,10:00:00,12:45:00,165,0.0,No info
6834,Jet Airways,2019-05-27,Delhi,Cochin,07:05:00,12:35:00,330,1.0,In-flight meal not included


In [12]:
y_train

0       17024
1       16932
2        4423
3        3943
4        7480
        ...  
6831     3943
6832     6442
6833     5228
6834    12898
6835    22794
Name: price, Length: 6836, dtype: int64

 ## 4. Transformation Operations

### 4.1 Airline

In [16]:
X_train.airline.value_counts()

airline
Jet Airways          2453
Indigo               1341
Air India            1131
Multiple Carriers     754
Spicejet              525
Vistara               300
Air Asia              207
Go Air                124
Trujet                  1
Name: count, dtype: int64

**Steps**

- Imputation
- Group Rare Categories 
- One Hot encoding 

In [26]:
airline_pipeline = Pipeline(steps=[
    ('Imputer', SimpleImputer(strategy='most_frequent')),
    ('Grouper', RareLabelEncoder(tol = 0.07, n_categories=2, replace_with='Others')),
    ('Encoding', OneHotEncoder(sparse_output=False))
])


airline_pipeline.fit_transform(X_train.loc[:,['airline']])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Others,airline_Spicejet
0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
6831,0.0,1.0,0.0,0.0,0.0,0.0
6832,0.0,1.0,0.0,0.0,0.0,0.0
6833,1.0,0.0,0.0,0.0,0.0,0.0
6834,0.0,0.0,1.0,0.0,0.0,0.0


### 4.2 Date of journey

In [27]:
X_train.date_of_journey

0       2019-03-03
1       2019-03-24
2       2019-04-09
3       2019-04-27
4       2019-06-12
           ...    
6831    2019-06-03
6832    2019-06-24
6833    2019-05-09
6834    2019-05-27
6835    2019-03-03
Name: date_of_journey, Length: 6836, dtype: object

**Steps**
- Date Time features (extract features like month, day, day of the week etc.)
- min-max journey (0-1 transformation)

In [42]:
features_to_extract = ['month', 'week', 'day_of_week', 'day_of_month', 'day_of_year']

date_pipeline = Pipeline(steps=[
    ('features', DatetimeFeatures(
                                  features_to_extract= features_to_extract,
                                  yearfirst=True,
                                  format='mixed')),
    ('scale', MinMaxScaler())
])


date_pipeline.fit_transform(X_train.loc[:, ['date_of_journey']])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_month,date_of_journey_day_of_year
0,0.000000,0.000000,1.000000,0.076923,0.016949
1,0.000000,0.176471,1.000000,0.884615,0.194915
2,0.333333,0.352941,0.166667,0.307692,0.330508
3,0.333333,0.470588,0.833333,1.000000,0.483051
4,1.000000,0.882353,0.333333,0.423077,0.872881
...,...,...,...,...,...
6831,1.000000,0.823529,0.000000,0.076923,0.796610
6832,1.000000,1.000000,0.000000,0.884615,0.974576
6833,0.666667,0.588235,0.500000,0.307692,0.584746
6834,0.666667,0.764706,0.000000,1.000000,0.737288


### 4.3 Source & Destination

In [46]:
X_train['source']

0          Delhi
1        Kolkata
2       Banglore
3       Banglore
4          Delhi
          ...   
6831    Banglore
6832       Delhi
6833    Banglore
6834       Delhi
6835       Delhi
Name: source, Length: 6836, dtype: object

In [47]:
X_train['destination']

0         Cochin
1       Banglore
2          Delhi
3          Delhi
4         Cochin
          ...   
6831       Delhi
6832      Cochin
6833       Delhi
6834      Cochin
6835      Cochin
Name: destination, Length: 6836, dtype: object

**Steps**

- Group Rare Labels
- Mean Encoding   ( The MeanEncoder() replaces categories by the mean value of the target for each category. )
- Power Transformer ( Apply a power transform featurewise to make data more Gaussian-like. )

In [66]:
source_dest = X_train[['source', 'destination']]

source_dest_transformation = Pipeline(steps = [
    ('group', RareLabelEncoder(tol = 0.1, n_categories=2, replace_with='Others')),
    ('mean', MeanEncoder()),
    ('scaling', PowerTransformer())
])

source_dest_transformation.fit_transform(source_dest, y_train)

Unnamed: 0,source,destination
0,1.048486,1.045496
1,-0.197994,-0.228321
2,-0.934208,-1.814770
3,-0.934208,-1.814770
4,1.048486,1.045496
...,...,...
6831,-0.934208,-1.814770
6832,1.048486,1.045496
6833,-0.934208,-1.814770
6834,1.048486,1.045496


### 4.4 arrival & departure time

In [74]:
X_train['dep_time']

0       02:15:00
1       07:10:00
2       09:30:00
3       21:15:00
4       09:45:00
          ...   
6831    04:00:00
6832    05:05:00
6833    10:00:00
6834    07:05:00
6835    14:10:00
Name: dep_time, Length: 6836, dtype: object

In [75]:
X_train['arrival_time']

0       04:25:00
1       18:45:00
2       12:20:00
3       00:15:00
4       09:25:00
          ...   
6831    06:50:00
6832    16:10:00
6833    12:45:00
6834    12:35:00
6835    19:20:00
Name: arrival_time, Length: 6836, dtype: object

**Steps**
- Date time features
- min max scaling 


- part of the day 
- count encoding 
- min max scaling 

In [111]:
time_subset = X_train.loc[:, ['dep_time', 'arrival_time']]



dep_arrival1 = Pipeline(steps = [
    ('features', DatetimeFeatures(features_to_extract = ['hour', 'minute'])),
    ('scaling', MinMaxScaler())
])


dep_arrival1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.086957,0.272727,0.173913,0.454545
1,0.304348,0.181818,0.782609,0.818182
2,0.391304,0.545455,0.521739,0.363636
3,0.913043,0.272727,0.000000,0.272727
4,0.391304,0.818182,0.391304,0.454545
...,...,...,...,...
6831,0.173913,0.000000,0.260870,0.909091
6832,0.217391,0.090909,0.695652,0.181818
6833,0.434783,0.000000,0.521739,0.818182
6834,0.304348,0.090909,0.521739,0.636364


In [108]:
# Part of the day is custom transformation

def part_of_the_day(X, morning = 4, afternoon = 12, evening = 16, night = 20):
    columns = X.columns.to_list()
    
    X_temp = X.assign(**{
        col : pd.to_datetime(X[col]).dt.hour
        for col in columns
    })
    
    return (
        X_temp
        .assign(**{
            f'{col}_part_of_the_day' : np.select(
                [X_temp.loc[:, col].between(morning, afternoon, inclusive = 'left'),
                X_temp.loc[:, col].between(afternoon, evening, inclusive = 'left'),
                X_temp.loc[:, col].between(evening, night, inclusive = 'left'),
                ],
                ['Morning', 'Afternoon', 'Night'],
                default = 'night'
            )
            for col in columns
        })
        .drop(columns = columns)
    )
    

In [118]:
FunctionTransformer(func=part_of_the_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_the_day,arrival_time_part_of_the_day
0,night,Morning
1,Morning,Night
2,Morning,Afternoon
3,night,night
4,Morning,Morning
...,...,...
6831,Morning,Morning
6832,Morning,Night
6833,Morning,Afternoon
6834,Morning,Afternoon


In [119]:
# Count Encoding 
dep_arrival2 = Pipeline(steps = [
    ('part', FunctionTransformer(func=part_of_the_day)),
    ('encoder', CountFrequencyEncoder()),
    ('scaling', MinMaxScaler())
    
])

dep_arrival2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_the_day,arrival_time_part_of_the_day
0,0.130625,0.944106
1,1.000000,0.681911
2,1.000000,0.000000
3,0.130625,1.000000
4,1.000000,0.944106
...,...,...
6831,1.000000,0.944106
6832,1.000000,0.681911
6833,1.000000,0.000000
6834,1.000000,0.000000


In [128]:
# Feature union

time_transformer = FeatureUnion(transformer_list=[
    ('transform1', dep_arrival1),
    ('transform2', dep_arrival2)
])

time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_the_day,arrival_time_part_of_the_day
0,0.086957,0.272727,0.173913,0.454545,0.130625,0.944106
1,0.304348,0.181818,0.782609,0.818182,1.000000,0.681911
2,0.391304,0.545455,0.521739,0.363636,1.000000,0.000000
3,0.913043,0.272727,0.000000,0.272727,0.130625,1.000000
4,0.391304,0.818182,0.391304,0.454545,1.000000,0.944106
...,...,...,...,...,...,...
6831,0.173913,0.000000,0.260870,0.909091,1.000000,0.944106
6832,0.217391,0.090909,0.695652,0.181818,1.000000,0.681911
6833,0.434783,0.000000,0.521739,0.818182,1.000000,0.000000
6834,0.304348,0.090909,0.521739,0.636364,1.000000,0.000000


### 4.5 Duration

In [130]:
X_train['duration']

0       1570
1        695
2        170
3        180
4       1420
        ... 
6831     170
6832     665
6833     165
6834     330
6835     310
Name: duration, Length: 6836, dtype: int64

## 5. Column Transformations

In [129]:
column_transformations = ColumnTransformer([
    ('air', airline_pipeline, ['airline']),
    ('doj', date_pipeline, ['date_of_journey']),
    ('city', source_dest_transformation, ['source', 'destination']),
    ('time', time_transformer, ['dep_time', 'arrival_time']),
#     (),
],
remainder='passthrough')

column_transformations.fit_transform(X_train, y_train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_Others,air__airline_Spicejet,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_month,doj__date_of_journey_day_of_year,city__source,city__destination,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_the_day,time__arrival_time_part_of_the_day,remainder__duration,remainder__total_stops,remainder__additional_info
0,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,1.000000,0.076923,0.016949,1.048486,1.045496,0.086957,0.272727,0.173913,0.454545,0.130625,0.944106,1570,1.0,No info
1,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.176471,1.000000,0.884615,0.194915,-0.197994,-0.228321,0.304348,0.181818,0.782609,0.818182,1.000000,0.681911,695,1.0,No info
2,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.352941,0.166667,0.307692,0.330508,-0.934208,-1.814770,0.391304,0.545455,0.521739,0.363636,1.000000,0.000000,170,0.0,No info
3,0.0,1.0,0.0,0.0,0.0,0.0,0.333333,0.470588,0.833333,1.000000,0.483051,-0.934208,-1.814770,0.913043,0.272727,0.000000,0.272727,0.130625,1.000000,180,0.0,No info
4,1.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.882353,0.333333,0.423077,0.872881,1.048486,1.045496,0.391304,0.818182,0.391304,0.454545,1.000000,0.944106,1420,1.0,No info
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6831,0.0,1.0,0.0,0.0,0.0,0.0,1.000000,0.823529,0.000000,0.076923,0.796610,-0.934208,-1.814770,0.173913,0.000000,0.260870,0.909091,1.000000,0.944106,170,0.0,No info
6832,0.0,1.0,0.0,0.0,0.0,0.0,1.000000,1.000000,0.000000,0.884615,0.974576,1.048486,1.045496,0.217391,0.090909,0.695652,0.181818,1.000000,0.681911,665,1.0,No info
6833,1.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.588235,0.500000,0.307692,0.584746,-0.934208,-1.814770,0.434783,0.000000,0.521739,0.818182,1.000000,0.000000,165,0.0,No info
6834,0.0,0.0,1.0,0.0,0.0,0.0,0.666667,0.764706,0.000000,1.000000,0.737288,1.048486,1.045496,0.304348,0.090909,0.521739,0.636364,1.000000,0.000000,330,1.0,In-flight meal not included
