# Feature Scaling

# Preparing a Dataset - Feature Scaling

Dataset -- Clean -- Categorical Value Conversion -- Feature Scaling

Scale values to be within the same proportional range

A column with a much larger range will dominate learning
over another column with a smaller range.

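Typically, either rescale each feature to the range 0 to 1 (normalization) or rescale it to zero mean and unit variance (standardization). Standardized values are not confined to a fixed interval; most fall within a few units of 0.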

# --Normalization

Rescaling each feature's values from their original range into the range 0 to 1: $x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$

# --Standardization
Rescaling data so that each feature has zero mean and unit variance: $z = \frac{x - \mu}{\sigma}$

# Standard(Z)Scaling
After standardization, a feature has a mean of 0 and a variance of 1 (an assumption
of many learning algorithms).

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the raw dataset. A forward-slash relative path is portable: it resolves on
# Windows as well as Linux/macOS, whereas the escaped backslash only worked on Windows.
df = pd.read_csv("dataset/Data.csv")

In [4]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
from sklearn.impute import SimpleImputer

# Mean imputer: NaN entries are replaced with the mean of their column.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)

In [6]:
# Feature matrix: every column except the last one (Country, Age, Salary).
X = df.iloc[:, :-1].values
# Target vector: the last column (Purchased). Indexing with -1 instead of a
# hard-coded 3 keeps this in step with the X slice above if columns change.
y = df.iloc[:, -1].values


In [7]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [8]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [10]:
# Count the missing values in each column.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [11]:
# Fit the imputer on the Age and Salary columns (1 and 2) and write the
# filled-in values back into X.
age_salary = slice(1, 3)
X[:, age_salary] = imp.fit_transform(X[:, age_salary])


In [12]:
# Cast the Age and Salary columns to integers and store them back into X.
# astype(int) truncates, so the imputed means lose their fractional part
# (the filled-in salary shows as 63777 in the output below).
X[:,1:3]=X[:,1:3].astype(int)
X

array([['France', 44, 72000],
       ['Spain', 27, 48000],
       ['Germany', 30, 54000],
       ['Spain', 38, 61000],
       ['Germany', 40, 63777],
       ['France', 35, 58000],
       ['Spain', 38, 52000],
       ['France', 48, 79000],
       ['Germany', 50, 83000],
       ['France', 37, 67000]], dtype=object)

Notice that Age ranges from 27 to 50,

while Salary ranges from 48000 to 83000.

The two features are not on the same scale, which can cause problems for a
machine learning model.

In [15]:
# Encode the categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One-hot encode column 0 (Country). remainder='passthrough' keeps the other
# columns (Age, Salary) unchanged and places them after the encoded columns.
ct = ColumnTransformer(
    transformers=[("Country", OneHotEncoder(), [0])],
    remainder='passthrough',
)

X = ct.fit_transform(X)
X

array([[1.0, 0.0, 0.0, 44, 72000],
       [0.0, 0.0, 1.0, 27, 48000],
       [0.0, 1.0, 0.0, 30, 54000],
       [0.0, 0.0, 1.0, 38, 61000],
       [0.0, 1.0, 0.0, 40, 63777],
       [1.0, 0.0, 0.0, 35, 58000],
       [0.0, 0.0, 1.0, 38, 52000],
       [1.0, 0.0, 0.0, 48, 79000],
       [0.0, 1.0, 0.0, 50, 83000],
       [1.0, 0.0, 0.0, 37, 67000]], dtype=object)

In [16]:
# Label the encoded matrix. The one-hot column names are read from the fitted
# encoder instead of being hard-coded, so they always match the column order
# produced by ct.fit_transform; Age and Salary follow as passthrough columns.
# NOTE: this rebinds `df`, so the raw frame loaded from the CSV is no longer available.
country_labels = list(ct.named_transformers_["Country"].categories_[0])
df = pd.DataFrame(X, columns=country_labels + ['Age', 'Salary'])
df

Unnamed: 0,France,Germany,Spain,Age,Salary
0,1.0,0.0,0.0,44,72000
1,0.0,0.0,1.0,27,48000
2,0.0,1.0,0.0,30,54000
3,0.0,0.0,1.0,38,61000
4,0.0,1.0,0.0,40,63777
5,1.0,0.0,0.0,35,58000
6,0.0,0.0,1.0,38,52000
7,1.0,0.0,0.0,48,79000
8,0.0,1.0,0.0,50,83000
9,1.0,0.0,0.0,37,67000


In [17]:
# Encode the target labels as integers: learn the classes first, then map y.
label_y = LabelEncoder().fit(y)
y = label_y.transform(y)

In [18]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [19]:
# Feature scaling using standardization (zero mean, unit variance per column)
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

In [20]:
# Select columns 3 and 4 of the encoded matrix (Age and Salary) for scaling;
# the one-hot country columns 0-2 are left out.
X1=X[:,3:5]
X1

array([[44, 72000],
       [27, 48000],
       [30, 54000],
       [38, 61000],
       [40, 63777],
       [35, 58000],
       [38, 52000],
       [48, 79000],
       [50, 83000],
       [37, 67000]], dtype=object)

In [21]:
# Learn each column's mean and standard deviation, then standardize the values.
X_new = sc.fit(X1).transform(X1)

In [23]:
X_new

array([[ 7.69734393e-01,  7.49480344e-01],
       [-1.69922498e+00, -1.43817132e+00],
       [-1.26352627e+00, -8.91258402e-01],
       [-1.01663033e-01, -2.53193334e-01],
       [ 1.88802776e-01, -6.38065068e-05],
       [-5.37361746e-01, -5.26649792e-01],
       [-1.01663033e-01, -1.07356271e+00],
       [ 1.35066601e+00,  1.38754541e+00],
       [ 1.64113182e+00,  1.75215402e+00],
       [-2.46895937e-01,  2.93719581e-01]])

In [24]:
# Wrap the standardized array in a labelled DataFrame (rebinds X_new).
X_new = pd.DataFrame(data=X_new, columns=['Age', 'Salary'])
X_new

Unnamed: 0,Age,Salary
0,0.769734,0.74948
1,-1.699225,-1.438171
2,-1.263526,-0.891258
3,-0.101663,-0.253193
4,0.188803,-6.4e-05
5,-0.537362,-0.52665
6,-0.101663,-1.073563
7,1.350666,1.387545
8,1.641132,1.752154
9,-0.246896,0.29372


In [25]:
# Put the three one-hot country columns next to the standardized Age and Salary.
country_columns = df.iloc[:, :3]
merged = pd.concat([country_columns, X_new], axis=1)
merged

Unnamed: 0,France,Germany,Spain,Age,Salary
0,1.0,0.0,0.0,0.769734,0.74948
1,0.0,0.0,1.0,-1.699225,-1.438171
2,0.0,1.0,0.0,-1.263526,-0.891258
3,0.0,0.0,1.0,-0.101663,-0.253193
4,0.0,1.0,0.0,0.188803,-6.4e-05
5,1.0,0.0,0.0,-0.537362,-0.52665
6,0.0,0.0,1.0,-0.101663,-1.073563
7,1.0,0.0,0.0,1.350666,1.387545
8,0.0,1.0,0.0,1.641132,1.752154
9,1.0,0.0,0.0,-0.246896,0.29372


In [26]:
# Feature scaling using normalization: min-max scaling into the range 0 to 1.
# NOTE: this rebinds `sc`, replacing the StandardScaler created earlier.
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler(feature_range=(0, 1))

In [27]:
# Re-select the unscaled Age and Salary columns (3 and 4) of X as the input
# for the min-max scaler.
X1=X[:,3:5]
X1

array([[44, 72000],
       [27, 48000],
       [30, 54000],
       [38, 61000],
       [40, 63777],
       [35, 58000],
       [38, 52000],
       [48, 79000],
       [50, 83000],
       [37, 67000]], dtype=object)

In [28]:
# Learn each column's minimum and maximum, then rescale the values into [0, 1].
X_new = sc.fit(X1).transform(X1)

In [29]:
X_new

array([[0.73913043, 0.68571429],
       [0.        , 0.        ],
       [0.13043478, 0.17142857],
       [0.47826087, 0.37142857],
       [0.56521739, 0.45077143],
       [0.34782609, 0.28571429],
       [0.47826087, 0.11428571],
       [0.91304348, 0.88571429],
       [1.        , 1.        ],
       [0.43478261, 0.54285714]])

In [30]:
X

array([[1.0, 0.0, 0.0, 44, 72000],
       [0.0, 0.0, 1.0, 27, 48000],
       [0.0, 1.0, 0.0, 30, 54000],
       [0.0, 0.0, 1.0, 38, 61000],
       [0.0, 1.0, 0.0, 40, 63777],
       [1.0, 0.0, 0.0, 35, 58000],
       [0.0, 0.0, 1.0, 38, 52000],
       [1.0, 0.0, 0.0, 48, 79000],
       [0.0, 1.0, 0.0, 50, 83000],
       [1.0, 0.0, 0.0, 37, 67000]], dtype=object)

In [31]:
# Wrap the normalized array in a labelled DataFrame (rebinds X_new).
X_new = pd.DataFrame(data=X_new, columns=['Age', 'Salary'])
X_new

Unnamed: 0,Age,Salary
0,0.73913,0.685714
1,0.0,0.0
2,0.130435,0.171429
3,0.478261,0.371429
4,0.565217,0.450771
5,0.347826,0.285714
6,0.478261,0.114286
7,0.913043,0.885714
8,1.0,1.0
9,0.434783,0.542857


In [33]:
# Put the three one-hot country columns next to the normalized Age and Salary.
country_columns = df.iloc[:, :3]
merged = pd.concat([country_columns, X_new], axis=1)

In [34]:
merged

Unnamed: 0,France,Germany,Spain,Age,Salary
0,1.0,0.0,0.0,0.73913,0.685714
1,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.130435,0.171429
3,0.0,0.0,1.0,0.478261,0.371429
4,0.0,1.0,0.0,0.565217,0.450771
5,1.0,0.0,0.0,0.347826,0.285714
6,0.0,0.0,1.0,0.478261,0.114286
7,1.0,0.0,0.0,0.913043,0.885714
8,0.0,1.0,0.0,1.0,1.0
9,1.0,0.0,0.0,0.434783,0.542857
