# Feature Transformation in Machine Learning

Feature transformation is the process of modifying your data but keeping the information. These modifications will make Machine Learning algorithms understanding easier, which will deliver better results. we want to reduce the number of features to plot and visualize data, speed up training or improve the accuracy of a specific model.

In [None]:
import pandas as pd
import seaborn as seaborn
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('agora.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [3]:
x = df.drop('Profit',axis=1) # asix=1 means col, axis = 0 means rows

In [4]:
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,Dhaka
1,162597.7,151377.59,443898.53,Ctg
2,153441.51,101145.55,407934.54,Rangpur
3,144372.41,118671.85,383199.62,Dhaka
4,142107.34,91391.77,366168.42,Rangpur


In [5]:
y = df['Profit']

In [6]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

# Manual

In [6]:
x['Marketing Spend'] = x['Marketing Spend'] / x['Marketing Spend'].max()

In [7]:
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,0.692617,136897.8,471784.1,Dhaka
1,0.983359,151377.59,443898.53,Ctg
2,0.927985,101145.55,407934.54,Rangpur
3,0.873136,118671.85,383199.62,Dhaka
4,0.859438,91391.77,366168.42,Rangpur


# Min Max Scaler

The MinMax scaler is scales all the data between 0 and 1. Though 0 and 1 are the default range, we can define our range of max and min values as well.
Calculating formula: x_scaled = (x – x_min)/(x_max – x_min)

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [9]:
minmax = MinMaxScaler(feature_range=(0, 1)) #default range 0,1
minmax

MinMaxScaler()

In [10]:
df['Marketing Spend'] = minmax.fit_transform(x[['Marketing Spend']])
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,136897.8,471784.1,Dhaka,192261.83
1,0.983359,151377.59,443898.53,Ctg,191792.06
2,0.927985,101145.55,407934.54,Rangpur,191050.39
3,0.873136,118671.85,383199.62,Dhaka,182901.99
4,0.859438,91391.77,366168.42,Rangpur,166187.94


In [11]:
df['Administration'] = minmax.fit_transform(x[['Administration']])
df['Transport'] = minmax.fit_transform(x[['Transport']])
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.651744,1.0,Dhaka,192261.83
1,0.983359,0.761972,0.940893,Ctg,191792.06
2,0.927985,0.379579,0.864664,Rangpur,191050.39
3,0.873136,0.512998,0.812235,Dhaka,182901.99
4,0.859438,0.305328,0.776136,Rangpur,166187.94


# Standard Scaler

Calculating formula: x_scaled = x – mean/std_dev

In [12]:
df2 = pd.read_csv('agora.csv')
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [14]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

In [15]:
df2['Marketing Spend'] = std.fit_transform(x[['Marketing Spend']])
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.897913,0.651744,1.0,Dhaka,192261.83
1,1.95586,0.761972,0.940893,Ctg,191792.06
2,1.754364,0.379579,0.864664,Rangpur,191050.39
3,1.554784,0.512998,0.812235,Dhaka,182901.99
4,1.504937,0.305328,0.776136,Rangpur,166187.94


In [22]:
df2['Administration'] = std.fit_transform(x[['Administration']])
df2['Transport'] = std.fit_transform(x[['Transport']])
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.897913,0.560753,2.165287,Dhaka,192261.83
1,1.95586,1.082807,1.929843,Ctg,191792.06
2,1.754364,-0.728257,1.626191,Rangpur,191050.39
3,1.554784,-0.096365,1.417348,Dhaka,182901.99
4,1.504937,-1.079919,1.27355,Rangpur,166187.94


# Robust Scaler

In [23]:
df3 = pd.read_csv('agora.csv')
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [24]:
from sklearn.preprocessing import RobustScaler
rob = RobustScaler() 

In [25]:
df3['Marketing Spend'] = rob.fit_transform(x[['Marketing Spend']])
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.67253,136897.8,471784.1,Dhaka,192261.83
1,1.452113,151377.59,443898.53,Ctg,191792.06
2,1.303634,101145.55,407934.54,Rangpur,191050.39
3,1.156567,118671.85,383199.62,Dhaka,182901.99
4,1.119836,91391.77,366168.42,Rangpur,166187.94


In [26]:
df3['Administration'] = rob.fit_transform(x[['Administration']])
df3['Transport'] = rob.fit_transform(x[['Transport']])
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.67253,0.345355,1.552016,Dhaka,192261.83
1,1.452113,0.697565,1.383714,Ctg,191792.06
2,1.303634,-0.52429,1.166654,Rangpur,191050.39
3,1.156567,-0.097977,1.017368,Dhaka,182901.99
4,1.119836,-0.761543,0.914576,Rangpur,166187.94


# Max Absolute Scaler

In [27]:
df4 = pd.read_csv('agora.csv')
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [28]:
from sklearn.preprocessing import MaxAbsScaler
mas = MaxAbsScaler() 

In [30]:
df4['Marketing Spend'] = mas.fit_transform(x[['Marketing Spend']])
df4.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,136897.8,471784.1,Dhaka,192261.83
1,0.983359,151377.59,443898.53,Ctg,191792.06
2,0.927985,101145.55,407934.54,Rangpur,191050.39
3,0.873136,118671.85,383199.62,Dhaka,182901.99
4,0.859438,91391.77,366168.42,Rangpur,166187.94


In [31]:
df4['Administration'] = mas.fit_transform(x[['Administration']])
df4['Transport'] = mas.fit_transform(x[['Transport']])
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.897913,0.560753,2.165287,Dhaka,192261.83
1,1.95586,1.082807,1.929843,Ctg,191792.06
2,1.754364,-0.728257,1.626191,Rangpur,191050.39
3,1.554784,-0.096365,1.417348,Dhaka,182901.99
4,1.504937,-1.079919,1.27355,Rangpur,166187.94
