# What is Feature Engineering?

In [2]:
# Feature : It is an attribute / property shared by all the independent units on which analysis or prediction is to be done

In [3]:
# Feature Engineering : 
#     It is the process of creating new features from existing features 
#     by domain knowledge to increase the performance of te ML model.

In [4]:
# - ML model sometimes cannot decypher important informations from existing features because
# they treat them as numerical values instead of applying any logic to them.

# - In this case the Feature Engineers apply their logic and extract valuable and computable features for the ML model
# and that data is used instead to increase the accuracy of the ML model.

# Major processes of Feature Engineering

In [5]:
# - Brainstorming or testing
# - Deciding what features to create
# - Creating features
# - Checking how the features work with the model
# - Improving the features if needed
# - Repeat Brainstorming / creating until the accuracy is desired

# Examples

In [8]:
import pandas as pd
import numpy as np

### Example 1

In [12]:
# original dataset : 

pd.DataFrame({
    'Train_Scheduled_Time' : ['10.00 AM' , '05.00 PM'],
    'Train_Reach_Time' : ['10.15 AM' , '05.00 PM']
})

Unnamed: 0,Train_Scheduled_Time,Train_Reach_Time
0,10.00 AM,10.15 AM
1,05.00 PM,05.00 PM


In [15]:
# Dataset After Feature Engineering :
# We extracted a new feature - "Delay Time"

pd.DataFrame({
    'Train_Scheduled_Time' : ['10.00 AM' , '05.00 PM'],
    'Train_Reach_Time' : ['10.15 AM' , '05.00 PM'],
    'Dealy_time_in_minute' : [15,0]
})

Unnamed: 0,Train_Scheduled_Time,Train_Reach_Time,Dealy_time_in_minute
0,10.00 AM,10.15 AM,15
1,05.00 PM,05.00 PM,0


### Example 2

In [16]:
# original dataset : 

pd.DataFrame({
    'Date' : ['07-Jan-2022' , '20-Fab-2022'],
    'Time' : ['12.00.00' , '23.04.00']
})

Unnamed: 0,Date,Time
0,07-Jan-2022,12.00.00
1,20-Fab-2022,23.04.00


In [18]:
# Dataset After Feature Engineering :
# We extracted 5 new features

pd.DataFrame({
    'Date' : ['07-Jan-2022' , '20-Fab-2022'],
    'Time' : ['12.00.00' , '23.04.00'],
    'hour' : [12,23],
    'day' : [7,20],
    'month' : [1,2],
    'year' : [2022,2022],
    'day_of_the_week' : [6,1]
})

Unnamed: 0,Date,Time,hour,day,month,year,day_of_the_week
0,07-Jan-2022,12.00.00,12,7,1,2022,6
1,20-Fab-2022,23.04.00,23,20,2,2022,1


### Example 3

In [19]:
# original dataset : 

pd.DataFrame({
    'Age' : [23,12,34,26,15,38],
    'Sex' : ['Male','Female','Male','Female','Male','Male']
})

Unnamed: 0,Age,Sex
0,23,Male
1,12,Female
2,34,Male
3,26,Female
4,15,Male
5,38,Male


In [21]:
# Dataset After Feature Engineering :
# We extracted 3 new features
# We also implemented one hot encoding for better results.

pd.DataFrame({
    'Age' : [23,12,34,26,15,38],
    'Sex' : ['Male','Female','Male','Female','Male','Male'],
    'male' : [1,0,1,0,1,1],
    'female' : [0,1,0,1,0,0],
    'age range' : ['21-30','11-20','31-40','21-30','11-20','31-40']
})

Unnamed: 0,Age,Sex,male,female,age range
0,23,Male,1,0,21-30
1,12,Female,0,1,11-20
2,34,Male,1,0,31-40
3,26,Female,0,1,21-30
4,15,Male,1,0,11-20
5,38,Male,1,0,31-40
