# 1. Import Libraries

In [69]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler,PowerTransformer,FunctionTransformer


from feature_engine.encoding import RareLabelEncoder, MeanEncoder, CountFrequencyEncoder
from feature_engine.datetime import DatetimeFeatures



import matplotlib.pyplot as plt

import warnings



# 2.Display Settings

#### ignore warnings.

In [2]:
warnings.filterwarnings("ignore")

#### display all columns

In [6]:
pd.set_option('display.max_columns', None)

#### transformed output as pandas dataframe and not numpy array.

In [5]:
sklearn.set_config(transform_output="pandas")

# 3. Read Training Dataset

In [9]:
path=r'C:\Users\user\Desktop\flight-predictor\data\train.csv'

train = pd.read_csv(path)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Multiple Carriers,2019-03-27,delhi,cochin,09:00:00,21:00:00,720,1.0,No info,12537
1,Indigo,2019-06-03,delhi,cochin,10:35:00,01:30:00,895,1.0,No info,5883
2,Spicejet,2019-04-03,kolkata,banglore,15:05:00,20:20:00,315,1.0,No info,4649
3,Air India,2019-03-06,mumbai,hyderabad,05:05:00,16:55:00,710,2.0,No info,16697
4,Jet Airways,2019-06-21,banglore,delhi,18:55:00,22:00:00,185,0.0,in-flight meal not included,7754
...,...,...,...,...,...,...,...,...,...,...
3195,Multiple Carriers,2019-03-21,delhi,cochin,09:00:00,15:30:00,390,1.0,No info,8307
3196,Air India,2019-04-09,delhi,cochin,14:05:00,17:55:00,230,0.0,No info,6724
3197,Jet Airways,2019-05-01,kolkata,banglore,09:35:00,23:35:00,840,1.0,No info,13067
3198,Jet Airways,2019-03-15,banglore,new delhi,21:25:00,05:05:00,460,1.0,No info,27210


Spilting X_train and y_train(target feature).

In [10]:
X_train = train.drop(columns='price')
y_train = train.price.copy()

In [11]:
X_train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info
0,Multiple Carriers,2019-03-27,delhi,cochin,09:00:00,21:00:00,720,1.0,No info
1,Indigo,2019-06-03,delhi,cochin,10:35:00,01:30:00,895,1.0,No info
2,Spicejet,2019-04-03,kolkata,banglore,15:05:00,20:20:00,315,1.0,No info
3,Air India,2019-03-06,mumbai,hyderabad,05:05:00,16:55:00,710,2.0,No info
4,Jet Airways,2019-06-21,banglore,delhi,18:55:00,22:00:00,185,0.0,in-flight meal not included
...,...,...,...,...,...,...,...,...,...
3195,Multiple Carriers,2019-03-21,delhi,cochin,09:00:00,15:30:00,390,1.0,No info
3196,Air India,2019-04-09,delhi,cochin,14:05:00,17:55:00,230,0.0,No info
3197,Jet Airways,2019-05-01,kolkata,banglore,09:35:00,23:35:00,840,1.0,No info
3198,Jet Airways,2019-03-15,banglore,new delhi,21:25:00,05:05:00,460,1.0,No info


In [15]:
y_train

0       12537
1        5883
2        4649
3       16697
4        7754
        ...  
3195     8307
3196     6724
3197    13067
3198    27210
3199    10844
Name: price, Length: 3200, dtype: int64

# 4. Transformation Operations

## 4.1 Airline

#### on the airline column we will perform:

1) Imputation for missing values

2) Group Rare Labels

3) One-hot Encoding

In [12]:
X_train.airline 

0       Multiple Carriers
1                  Indigo
2                Spicejet
3               Air India
4             Jet Airways
              ...        
3195    Multiple Carriers
3196            Air India
3197          Jet Airways
3198          Jet Airways
3199          Jet Airways
Name: airline, Length: 3200, dtype: object

In [13]:
air_transformer = Pipeline(steps = [
    ('imputation', SimpleImputer(strategy='most_frequent')),
    ('group_labels', RareLabelEncoder(n_categories=2, tol=0.1, replace_with='Others')),
    ('Ohe', OneHotEncoder(sparse_output=False , handle_unknown='ignore'))
])

air_transformer.fit_transform(X_train.loc[:, ["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_Others
0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
3195,0.0,0.0,0.0,1.0,0.0
3196,1.0,0.0,0.0,0.0,0.0
3197,0.0,0.0,1.0,0.0,0.0
3198,0.0,0.0,1.0,0.0,0.0


## 4.2 date_of_journey     

#### on this column we will perform the following transformations: 
1) Date-time feature extraction
2) Min Max Scaling

In [15]:
features_to_extract = ['month', 'week','day_of_month','day_of_week']

In [17]:

doj_transformer = Pipeline(steps = [
    ('dt_feature', DatetimeFeatures(features_to_extract=features_to_extract, yearfirst=True, format="mixed")),
    ('min-max', MinMaxScaler())
    
])

doj_transformer.fit_transform(X_train.loc[:, ["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_month,date_of_journey_day_of_week
0,0.000000,0.235294,1.000000,0.333333
1,1.000000,0.823529,0.076923,0.000000
2,0.333333,0.294118,0.076923,0.333333
3,0.000000,0.058824,0.192308,0.333333
4,1.000000,0.941176,0.769231,0.666667
...,...,...,...,...
3195,0.000000,0.176471,0.769231,0.500000
3196,0.333333,0.352941,0.307692,0.166667
3197,0.666667,0.529412,0.000000,0.333333
3198,0.000000,0.117647,0.538462,0.666667


## 4.3 source and destination     

#### since these 2 columns are similar we will deal with them together: 
1)  Group Rare Labels
2) Mean Encoding
3) Power Transformer
4) Is North City (New column)

In [18]:
location_subset = X_train.loc[:, ["source", "destination"]]
location_subset

Unnamed: 0,source,destination
0,delhi,cochin
1,delhi,cochin
2,kolkata,banglore
3,mumbai,hyderabad
4,banglore,delhi
...,...,...
3195,delhi,cochin
3196,delhi,cochin
3197,kolkata,banglore
3198,banglore,new delhi


In [19]:

location_transfomer = Pipeline(steps=[
    ('group-label', RareLabelEncoder(tol=0.1, n_categories=2, replace_with='others')),
    ('mean_encoding', MeanEncoder()),
    ('power_transformer',PowerTransformer())
]) 

location_transfomer.fit_transform(location_subset,y_train)

Unnamed: 0,source,destination
0,1.058972,1.056852
1,1.058972,1.056852
2,-0.184239,-0.195717
3,-1.887646,-0.815121
4,-0.911914,-1.822694
...,...,...
3195,1.058972,1.056852
3196,1.058972,1.056852
3197,-0.184239,-0.195717
3198,-0.911914,-0.815121


#### Creating a new column "is_north"

#### finding unique categories in source and destination 

In [33]:
np.union1d(
    X_train.source.unique(),
    X_train.destination.unique()
 
    )


array(['banglore', 'chennai', 'cochin', 'delhi', 'hyderabad', 'kolkata',
       'mumbai', 'new delhi'], dtype=object)

In [30]:
def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
    .drop(columns=columns)
    )


FunctionTransformer(func=is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
3195,0,0
3196,0,0
3197,0,0
3198,0,0


In [34]:
location_final_transformer = FeatureUnion(transformer_list = [
    ('intial', location_transfomer),
    ('final', FunctionTransformer(func=is_north))
])

location_final_transformer.fit_transform(location_subset,y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,1.058972,1.056852,0,0
1,1.058972,1.056852,0,0
2,-0.184239,-0.195717,0,0
3,-1.887646,-0.815121,0,0
4,-0.911914,-1.822694,0,0
...,...,...,...,...
3195,1.058972,1.056852,0,0
3196,1.058972,1.056852,0,0
3197,-0.184239,-0.195717,0,0
3198,-0.911914,-0.815121,0,0


## 4.4 dep_time & arrival_time

#### on these 2 columns we will perform: 

1) Date-time features
2) Min Max Scaling
3) Part of Day
4) Count Encoding
5) Min Max Scaling

In [39]:
X_train.dep_time

0       09:00:00
1       10:35:00
2       15:05:00
3       05:05:00
4       18:55:00
          ...   
3195    09:00:00
3196    14:05:00
3197    09:35:00
3198    21:25:00
3199    08:25:00
Name: dep_time, Length: 3200, dtype: object

In [40]:
X_train.arrival_time

0       21:00:00
1       01:30:00
2       20:20:00
3       16:55:00
4       22:00:00
          ...   
3195    15:30:00
3196    17:55:00
3197    23:35:00
3198    05:05:00
3199    20:45:00
Name: arrival_time, Length: 3200, dtype: object

#### after a quick look at these both columns we can see that seconds is always zero so no need to extract that.


In [41]:
time_subset = train.loc[:,['dep_time','arrival_time']]
time_subset

Unnamed: 0,dep_time,arrival_time
0,09:00:00,21:00:00
1,10:35:00,01:30:00
2,15:05:00,20:20:00
3,05:05:00,16:55:00
4,18:55:00,22:00:00
...,...,...
3195,09:00:00,15:30:00
3196,14:05:00,17:55:00
3197,09:35:00,23:35:00
3198,21:25:00,05:05:00


In [80]:
time_to_extract = ['hour','minute']

time_pipe_1 = Pipeline (steps = [
    ('dt_extraction', DatetimeFeatures(features_to_extract=time_to_extract)),
    ('scaler', MinMaxScaler())
    
])

time_pipe_1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.391304,0.000000,0.913043,0.000000
1,0.434783,0.636364,0.043478,0.545455
2,0.652174,0.090909,0.869565,0.363636
3,0.217391,0.090909,0.695652,1.000000
4,0.782609,1.000000,0.956522,0.000000
...,...,...,...,...
3195,0.391304,0.000000,0.652174,0.545455
3196,0.608696,0.090909,0.739130,1.000000
3197,0.391304,0.636364,1.000000,0.636364
3198,0.913043,0.454545,0.217391,0.090909


#### creating a new feature to define the part of day.

In [62]:
def part_of_day(data, morning=4 , noon=12 , eve=16 , night=20):
    columns = data.columns.to_list()
    X_temp = data.assign(**{
                    col: pd.to_datetime(data.loc[:, col]).dt.hour
                    for col in columns
    })
    
    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                X_temp.loc[:, col].between(eve, night, inclusive="left")],
                ["morning", "afternoon", "eve"],
                default = "night"
            )
            for col in columns
        })
       .drop(columns=columns)
        
    )
    
FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,morning,night
1,morning,night
2,afternoon,night
3,morning,eve
4,eve,night
...,...,...
3195,morning,afternoon
3196,afternoon,eve
3197,morning,night
3198,night,morning


In [78]:
time_pipe_2  = Pipeline(steps = [
    ('function',FunctionTransformer(func=part_of_day)),
    ('encoding', CountFrequencyEncoder()),
    ('scaler', MinMaxScaler())
    
])

time_pipe_2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,1.000000,1.000000
1,1.000000,1.000000
2,0.000000,1.000000
3,1.000000,0.584158
4,0.258834,1.000000
...,...,...
3195,1.000000,0.000000
3196,0.000000,0.584158
3197,1.000000,1.000000
3198,0.189929,0.863366


In [82]:
time_transformer = FeatureUnion(transformer_list = [
    ('pipe_1',time_pipe_1),
    ('pipe_2',time_pipe_2)
])

time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.391304,0.000000,0.913043,0.000000,1.000000,1.000000
1,0.434783,0.636364,0.043478,0.545455,1.000000,1.000000
2,0.652174,0.090909,0.869565,0.363636,0.000000,1.000000
3,0.217391,0.090909,0.695652,1.000000,1.000000,0.584158
4,0.782609,1.000000,0.956522,0.000000,0.258834,1.000000
...,...,...,...,...,...,...
3195,0.391304,0.000000,0.652174,0.545455,1.000000,0.000000
3196,0.608696,0.090909,0.739130,1.000000,0.000000,0.584158
3197,0.391304,0.636364,1.000000,0.636364,1.000000,1.000000
3198,0.913043,0.454545,0.217391,0.090909,0.189929,0.863366


# 5. Column Transformer

In [83]:
preprocessor = ColumnTransformer(transformers=[
    ('airline', air_transformer, ['airline']),
    ('doj', doj_transformer,['date_of_journey']),
    ('location', location_final_transformer, ['source','destination']),
    ('time',time_transformer,['dep_time','arrival_time'])
    
])

preprocessor.fit_transform(X_train, y_train, )

Unnamed: 0,airline__airline_Air India,airline__airline_Indigo,airline__airline_Jet Airways,airline__airline_Multiple Carriers,airline__airline_Others,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_month,doj__date_of_journey_day_of_week,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day
0,0.0,0.0,0.0,1.0,0.0,0.000000,0.235294,1.000000,0.333333,1.058972,1.056852,0,0,0.391304,0.000000,0.913043,0.000000,1.000000,1.000000
1,0.0,1.0,0.0,0.0,0.0,1.000000,0.823529,0.076923,0.000000,1.058972,1.056852,0,0,0.434783,0.636364,0.043478,0.545455,1.000000,1.000000
2,0.0,0.0,0.0,0.0,1.0,0.333333,0.294118,0.076923,0.333333,-0.184239,-0.195717,0,0,0.652174,0.090909,0.869565,0.363636,0.000000,1.000000
3,1.0,0.0,0.0,0.0,0.0,0.000000,0.058824,0.192308,0.333333,-1.887646,-0.815121,0,0,0.217391,0.090909,0.695652,1.000000,1.000000,0.584158
4,0.0,0.0,1.0,0.0,0.0,1.000000,0.941176,0.769231,0.666667,-0.911914,-1.822694,0,0,0.782609,1.000000,0.956522,0.000000,0.258834,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,0.0,0.0,0.0,1.0,0.0,0.000000,0.176471,0.769231,0.500000,1.058972,1.056852,0,0,0.391304,0.000000,0.652174,0.545455,1.000000,0.000000
3196,1.0,0.0,0.0,0.0,0.0,0.333333,0.352941,0.307692,0.166667,1.058972,1.056852,0,0,0.608696,0.090909,0.739130,1.000000,0.000000,0.584158
3197,0.0,0.0,1.0,0.0,0.0,0.666667,0.529412,0.000000,0.333333,-0.184239,-0.195717,0,0,0.391304,0.636364,1.000000,0.636364,1.000000,1.000000
3198,0.0,0.0,1.0,0.0,0.0,0.000000,0.117647,0.538462,0.666667,-0.911914,-0.815121,0,0,0.913043,0.454545,0.217391,0.090909,0.189929,0.863366
