## 1. Import Libraries

In [20]:
# One-time setup if feature_engine is not installed: conda install -c conda-forge feature_engine


In [62]:
import numpy as np
import pandas as pd
import sklearn 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer
    
)

from feature_engine.encoding import RareLabelEncoder
from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import MeanEncoder

import warnings

## 2. Display Settings

In [37]:
# Pandas truncates the column list when a DataFrame has many columns.
# Setting the limit to None forces it to display every column.

pd.options.display.max_columns = None

In [38]:
sklearn.set_config(transform_output="pandas")
# By default, scikit-learn transformers return NumPy arrays even when the input is a DataFrame.
# This setting makes the transformers return pandas DataFrames instead.

In [39]:
warnings.filterwarnings("ignore")
# Suppress all warnings so that frequently repeated ones do not clutter the notebook output.
# NOTE(review): this hides every warning category, including deprecation warnings — consider narrowing the filter.

## 3. Read the Data

In [40]:
# Important: all feature engineering, experimentation, and analysis are done on the training data only.

# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists; consider a configurable data directory.
path = r"D:\SAURABH\spring 2024\ML_Projects\AWS SageMaker FlightFarePredictor\data\train_set.csv"

train = pd.read_csv(path)

train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-06,Kolkata,Banglore,20:25:00,23:10:00,165,0.0,No Info,4804
1,Air India,2019-05-21,Delhi,Cochin,17:15:00,19:15:00,1560,2.0,No Info,11989
2,Jet Airways,2019-03-24,Kolkata,Banglore,21:10:00,16:20:00,1150,1.0,In-flight meal not included,10031
3,Indigo,2019-04-01,Delhi,Cochin,14:20:00,17:35:00,195,0.0,No Info,4729
4,Jet Airways,2019-03-15,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,No Info,8040
...,...,...,...,...,...,...,...,...,...,...
6690,Jet Airways,2019-05-09,Kolkata,Banglore,09:35:00,23:35:00,840,1.0,No Info,12121
6691,Jet Airways,2019-05-18,Kolkata,Banglore,09:35:00,19:10:00,575,1.0,No Info,13067
6692,Jet Airways,2019-05-24,Kolkata,Banglore,20:00:00,23:35:00,1655,1.0,In-flight meal not included,10844
6693,Multiple Carriers,2019-06-03,Delhi,Cochin,10:35:00,19:00:00,505,1.0,No Info,10877


In [41]:
# Summarize column dtypes and non-null counts of the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          6695 non-null   object 
 1   date_of_journey  6695 non-null   object 
 2   source           6695 non-null   object 
 3   destination      6695 non-null   object 
 4   dep_time         6695 non-null   object 
 5   arrival_time     6695 non-null   object 
 6   duration         6695 non-null   int64  
 7   total_stops      6695 non-null   float64
 8   additional_info  6695 non-null   object 
 9   price            6695 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 523.2+ KB


In [42]:
# Separate the input features (X_train) from the target column, price (y_train).

X_train = train.drop(columns=["price"])
y_train = train["price"].copy()

In [43]:
# List the names of the input features.
list(X_train.columns)

['airline',
 'date_of_journey',
 'source',
 'destination',
 'dep_time',
 'arrival_time',
 'duration',
 'total_stops',
 'additional_info']

## 4. Column wise Transformation Operations

### 4.1 airline

In [44]:
# Preview the raw airline column.
X_train["airline"]

0                  Indigo
1               Air India
2             Jet Airways
3                  Indigo
4             Jet Airways
              ...        
6690          Jet Airways
6691          Jet Airways
6692          Jet Airways
6693    Multiple Carriers
6694            Air India
Name: airline, Length: 6695, dtype: object

- **Airline is a categorical column.**
- **Transformation/Preprocessing for Categorical Columns:**
  - First, we will perform imputation if there are any missing values.
  - During EDA, we observed that some airline categories were very rare (less than 10%). We will group these rare categories into a single category named "Other."
  - Finally, we will apply one-hot encoding.

### Steps for Preprocessing the Airline Column

1. **Imputation:**
   - Check for any missing values in the `airline` column and impute them accordingly.

2. **Grouping Rare Categories:**
   - Identify the airline categories that appear less than 10% of the time in the dataset.
   - Group these rare categories into a new category named "Other."

3. **One-Hot Encoding:**
   - Apply one-hot encoding to the `airline` column to convert the categorical data into numerical format, which can be used in machine learning models.


In [45]:
# Preprocessing for the categorical `airline` column. The steps run in order:
#   imputer -> fill missing values with the most frequent category
#   grouper -> replace categories with a frequency below 10% by "other"
#   encoder -> one-hot encode into dense columns; unknown categories are ignored
airline_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("grouper", RareLabelEncoder(tol=0.1, replace_with="other", n_categories=2)),
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Fit on the single-column frame and preview the encoded result.
airline_transformer.fit_transform(X_train[["airline"]])
     
    

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_other
0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
6690,0.0,0.0,1.0,0.0,0.0
6691,0.0,0.0,1.0,0.0,0.0
6692,0.0,0.0,1.0,0.0,0.0
6693,0.0,0.0,0.0,1.0,0.0


**Code Explanation:**

We build `airline_transformer` as a `Pipeline` and pass it the `steps` argument.
- **What steps?** The transformation steps to apply to this column, given as a list of tuples.
- Each tuple has two elements: a name for the step and the transformer to apply in that step.

**First Transformer:** We want to apply `SimpleImputer` to impute the values. We used the most frequent values for imputation and named this transformer `imputer`.

**Second Transformer:** We wanted to group rare categories, so we named it `grouper` and used `RareLabelEncoder` with a tolerance of `0.1`. This means all categories that occur less than 10% will be grouped and called "other". We also set `n_categories=2` to ensure grouping occurs only if the variable has at least 2 categories.

**Third Transformer:** We will do one-hot encoding using `OneHotEncoder`. By default, scikit-learn returns the result as a sparse matrix, a memory-saving format for data in which most values are zero. We set `sparse_output=False` to get a dense result that we can inspect; this is also required here because the pandas output configured earlier with `set_config` does not support sparse data. We also set `handle_unknown="ignore"` so that unseen categories in the test or validation set are handled without errors. If an unseen category appears, no new column is created; that row is simply encoded as 0 in all of the existing one-hot columns.

Then, we test it with `fit_transform`.

**Output Observation:**

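- **Simple Imputer:** Any missing values would be imputed with the most frequent category (the `airline` column of this training set has no missing values, so nothing changes here).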
- **Grouper:** Rare categories are categorized into "other."
- **One-Hot Encoding:** For each category, we have one separate column.

**Conclusion:**

The use of a pipeline allows us to list all the transformations we need to do in sequence. We just wrote 3 transformations, but we could write 10 or 100s in sequence, and it will execute them.


### 4.2 date_of_journey

In [46]:
# Preview the raw journey-date column (stored as strings).
X_train["date_of_journey"]

0       2019-06-06
1       2019-05-21
2       2019-03-24
3       2019-04-01
4       2019-03-15
           ...    
6690    2019-05-09
6691    2019-05-18
6692    2019-05-24
6693    2019-06-03
6694    2019-06-15
Name: date_of_journey, Length: 6695, dtype: object

- **Date-Time Column Transformation:**

  - **Object Type with Date Information:** Since this column contains date information, we will apply specific transformations to extract meaningful features from it.

  - **Feature Extraction from Date:**
    - We will extract features such as date, month, day of the week, and whether it was a weekend.
    - We will avoid extracting the year as all values are from 2019.
    - Additionally, we will extract the week and day of the week.

  - **Min-Max Scaling:** 
    - After extracting the features, we will apply Min-Max Scaling to convert all values between 0 and 1.
  
  - **Available Features from Datetime Column:**
    - For a comprehensive list of extractable features from a datetime column, you can refer to [Feature-Engine's Datetime Features Documentation](https://feature-engine.trainindata.com/en/latest/api_doc/datetime/DatetimeFeatures.html#feature_engine.datetime.DatetimeFeatures).
    
  - **Selected Features:**
    - We will extract the month, week, day of the week, and day of the year from the date-time column.


In [47]:
# Calendar features to derive from the journey date.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

# Extract the date parts first, then rescale each of them to the [0, 1] range.
doj_transformer = Pipeline(
    steps=[
        ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True)),
        ("scaler", MinMaxScaler()),
    ]
)

# Fit on the single-column frame and preview the scaled features.
doj_transformer.fit_transform(X_train[["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,0.823529,0.500000,0.822034
1,0.666667,0.705882,0.166667,0.686441
2,0.000000,0.176471,1.000000,0.194915
3,0.333333,0.294118,0.000000,0.262712
4,0.000000,0.117647,0.666667,0.118644
...,...,...,...,...
6690,0.666667,0.588235,0.500000,0.584746
6691,0.666667,0.647059,0.833333,0.661017
6692,0.666667,0.705882,0.666667,0.711864
6693,1.000000,0.823529,0.000000,0.796610


- **Date of Journey Input:**
  - The features extracted from the date of journey are month, week, day of the week, and day of the year.
  
- **Observations on Extracted Feature Values:**
  - The values of each feature vary, with some being single-digit and others being double-digit.
  - While this discrepancy in scale is not an issue for tree-based models, we will still scale the features for consistency.
  
- **Scaling in the Pipeline:**
  - To ensure all features are on the same scale, we will apply scaling within the pipeline.
  - This will help maintain uniformity and improve the performance of non-tree-based models.


### 4.3 source & destination

In [51]:
# Preview the departure city column.
X_train["source"]

0       Kolkata
1         Delhi
2       Kolkata
3         Delhi
4        Mumbai
         ...   
6690    Kolkata
6691    Kolkata
6692    Kolkata
6693      Delhi
6694      Delhi
Name: source, Length: 6695, dtype: object

In [52]:
# Preview the arrival city column.
X_train["destination"]

0        Banglore
1          Cochin
2        Banglore
3          Cochin
4       Hyderabad
          ...    
6690     Banglore
6691     Banglore
6692     Banglore
6693       Cochin
6694       Cochin
Name: destination, Length: 6695, dtype: object

- **Grouping Rare Labels:**
  - We will group rare labels because there are a few values, like "Chennai," that are very rare.
  
- **Mean Encoding:**
  - Mean encoding involves calculating the average value of the target column for each category of a categorical variable. 
  - For example, if we have a categorical variable with values cat1, cat2, cat3, and our target variable is "price" (numerical), mean encoding will calculate the average price for each category and use these averages as the encoded values.

- **Power Transformer:**
  - A power transformer is applied to numerical variables to make the distribution of the data as symmetric as possible.
  - It transforms the input variable $x$ by estimating the optimal parameter $\lambda$ of a power transform (Yeo-Johnson by default in scikit-learn), so that the transformed variable has a distribution that is as symmetric (Gaussian-like) as possible.
  - Under the hood, the power transformer also applies a standard scaler to the data.


In [53]:
# These are the two columns we work on in this section.
location_subset = X_train[["source", "destination"]]
location_subset


Unnamed: 0,source,destination
0,Kolkata,Banglore
1,Delhi,Cochin
2,Kolkata,Banglore
3,Delhi,Cochin
4,Mumbai,Hyderabad
...,...,...
6690,Kolkata,Banglore
6691,Kolkata,Banglore
6692,Kolkata,Banglore
6693,Delhi,Cochin


In [59]:
# Location preprocessing: group rare cities into "other", replace each category
# with a target-based mean encoding, then apply a power transform to the result.
location_pipe1 = Pipeline(
    steps=[
        ("grouper", RareLabelEncoder(tol=0.1, replace_with="other", n_categories=2)),
        ("encoder", MeanEncoder()),
        ("scaler", PowerTransformer()),
    ]
)

# The mean encoder needs the target, so y_train is passed along when fitting.
location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,-0.154666,-0.173651
1,1.042355,1.040402
2,-0.154666,-0.173651
3,1.042355,1.040402
4,-1.853893,-0.834662
...,...,...
6690,-0.154666,-0.173651
6691,-0.154666,-0.173651
6692,-0.154666,-0.173651
6693,1.042355,1.040402


- **Mean Encoding Result:**
  - Each category in the categorical variable has been replaced by the mean value of the target variable ("price").
  - However, the range of these mean values can be quite large, which might affect the model performance.

- **Power Transformer:**
  - To address the issue of large range values, we will use a power transformer.
  - The power transformer will apply a transformation that makes the distribution of the data more symmetric.
  - It will also scale the values to a standard range, ensuring consistency and improving model performance.


In [None]:
# - From source and destination we can also derive a new feature that states whether each city is a northern city or not (just an example feature).
# - The next cells show how to create it.

In [60]:
# Collect every distinct city that appears in either location column,
# just to see which unique values we have to work with.
np.union1d(
    X_train["source"].unique(),
    X_train["destination"].unique(),
)

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [63]:
def is_north(X):
    """Flag, column by column, whether each city is in the "north" list.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold city names (here: source and destination).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``X``, where each value is 1 if
        the city is in the north list and 0 otherwise. ``X`` is not modified.
    """
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]

    # Build one 0/1 indicator series per input column, keeping the column name.
    north_flags = {}
    for column_name in X.columns.to_list():
        north_flags[column_name] = X[column_name].isin(north_cities).astype(int)

    # assign() returns a new frame with the existing columns overwritten.
    return X.assign(**north_flags)

# Wrap is_north in a FunctionTransformer so it can be used like a scikit-learn transformer.
# The transformer is only constructed here; it is not fitted or applied in this cell.
FunctionTransformer(func=is_north)

In [None]:
# - Cities in the north list become 1 and all other cities become 0.
# - We need a scikit-learn compatible transformer: a plain Python function cannot be used directly as a step inside a Pipeline or ColumnTransformer.
# - To make it work, we wrap the function in scikit-learn's FunctionTransformer.

## 5. Column Transformer

In [49]:
# Route each column to its dedicated pipeline; columns that have no transformer
# are kept unchanged because of remainder="passthrough".
column_transformer = ColumnTransformer(
    transformers=[
        ("air", airline_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
    ],
    remainder="passthrough",
)

# Fit on the full training frame and preview the combined output.
column_transformer.fit_transform(X_train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,remainder__source,remainder__destination,remainder__dep_time,remainder__arrival_time,remainder__duration,remainder__total_stops,remainder__additional_info
0,0.0,1.0,0.0,0.0,0.0,1.000000,0.823529,0.500000,0.822034,Kolkata,Banglore,20:25:00,23:10:00,165,0.0,No Info
1,1.0,0.0,0.0,0.0,0.0,0.666667,0.705882,0.166667,0.686441,Delhi,Cochin,17:15:00,19:15:00,1560,2.0,No Info
2,0.0,0.0,1.0,0.0,0.0,0.000000,0.176471,1.000000,0.194915,Kolkata,Banglore,21:10:00,16:20:00,1150,1.0,In-flight meal not included
3,0.0,1.0,0.0,0.0,0.0,0.333333,0.294118,0.000000,0.262712,Delhi,Cochin,14:20:00,17:35:00,195,0.0,No Info
4,0.0,0.0,1.0,0.0,0.0,0.000000,0.117647,0.666667,0.118644,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,No Info
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,1.0,0.0,0.0,0.666667,0.588235,0.500000,0.584746,Kolkata,Banglore,09:35:00,23:35:00,840,1.0,No Info
6691,0.0,0.0,1.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,Kolkata,Banglore,09:35:00,19:10:00,575,1.0,No Info
6692,0.0,0.0,1.0,0.0,0.0,0.666667,0.705882,0.666667,0.711864,Kolkata,Banglore,20:00:00,23:35:00,1655,1.0,In-flight meal not included
6693,0.0,0.0,0.0,1.0,0.0,1.000000,0.823529,0.000000,0.796610,Delhi,Cochin,10:35:00,19:00:00,505,1.0,No Info
