### Standardisasi / Feature Scaling
##### Membuat standar supaya antar fitur dapat dibandingkan
##### scaling, membandingkan feature apple to apple

$\displaystyle z = \frac {x - \bar{x}}{s}$

$\displaystyle s = \sqrt {\frac {\sum (x - \bar{x}) ^ 2}{n - 1}}$

- High outlier jika $z_{score} > 2.5$
- Low outlier jika $z_{score} < -2.5$
- Data terstandardisasi memiliki mean ≈ 0 dan std ≈ 1

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Raw sample: two numeric columns, 'TB' and 'BB', with ten observations each.
data = dict(
    TB=[0.8, 0.85, 0.92, 0.88, 0.82, 0.86, 0.87, 0.93, 0.81, 1],
    BB=[20, 25, 21, 29, 30, 21, 28, 27, 29, 30],
)

# Column-oriented construction: every dict key becomes one DataFrame column.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,TB,BB
0,0.8,20
1,0.85,25
2,0.92,21
3,0.88,29
4,0.82,30
5,0.86,21
6,0.87,28
7,0.93,27
8,0.81,29
9,1.0,30


In [3]:
# Column means; later cells reuse them under these exact names.
averageTB, averageBB = (np.average(df[col]) for col in ('TB', 'BB'))

# Squared deviation of each observation from its column mean
# (the numerator terms of the variance formula).
df['(TB-avgTB)^2'] = df['TB'].sub(averageTB).pow(2)
df['(BB-avgBB)^2'] = df['BB'].sub(averageBB).pow(2)
df

Unnamed: 0,TB,BB,(TB-avgTB)^2,(BB-avgBB)^2
0,0.8,20,0.005476,36.0
1,0.85,25,0.000576,1.0
2,0.92,21,0.002116,25.0
3,0.88,29,3.6e-05,9.0
4,0.82,30,0.002916,16.0
5,0.86,21,0.000196,25.0
6,0.87,28,1.6e-05,4.0
7,0.93,27,0.003136,1.0
8,0.81,29,0.004096,9.0
9,1.0,30,0.015876,16.0


In [4]:
# Sample standard deviation computed by hand:
# sqrt(sum of squared deviations / (number of non-null observations - 1)).
devStdTB = np.sqrt(df['(TB-avgTB)^2'].sum() / (df['TB'].count() - 1))
devStdBB = np.sqrt(df['(BB-avgBB)^2'].sum() / (df['BB'].count() - 1))

In [5]:
# z-score = (value - column mean) / column standard deviation,
# using the hand-computed statistics from the previous cells.
df['zTB'] = df['TB'].sub(averageTB).div(devStdTB)
df['zBB'] = df['BB'].sub(averageBB).div(devStdBB)
df

Unnamed: 0,TB,BB,(TB-avgTB)^2,(BB-avgBB)^2,zTB,zBB
0,0.8,20,0.005476,36.0,-1.196248,-1.510526
1,0.85,25,0.000576,1.0,-0.387972,-0.251754
2,0.92,21,0.002116,25.0,0.743614,-1.258772
3,0.88,29,3.6e-05,9.0,0.096993,0.755263
4,0.82,30,0.002916,16.0,-0.872938,1.007018
5,0.86,21,0.000196,25.0,-0.226317,-1.258772
6,0.87,28,1.6e-05,4.0,-0.064662,0.503509
7,0.93,27,0.003136,1.0,0.905269,0.251754
8,0.81,29,0.004096,9.0,-1.034593,0.755263
9,1.0,30,0.015876,16.0,2.036856,1.007018


### Difference between std() function from Numpy and Pandas
##### NumPy : `np.std` memakai ddof (delta degrees of freedom) = 0, yaitu pembagi n
##### Pandas: `Series.std` memakai ddof = 1, yaitu pembagi n - 1, seperti Excel

In [6]:
# Same columns, two libraries: the pandas results come from Series.std(),
# the NumPy results from np.std() called on the same Series.
devStdTBPd, devStdBBPd = df['TB'].std(), df['BB'].std()
devStdTBNp, devStdBBNp = np.std(df['TB']), np.std(df['BB'])

# One print call; sep='\n' keeps each result on its own line.
print(
    f'Standard Deviation of TB using Pandas : {devStdTBPd}',
    f'Standard Deviation of BB using Pandas : {devStdBBPd}',
    f'Standard Deviation of TB using NumPy : {devStdTBNp}',
    f'Standard Deviation of BB using NumPy : {devStdBBNp}',
    sep='\n',
)

Standard Deviation of TB using Pandas : 0.06186005711819757
Standard Deviation of BB using Pandas : 3.972125095937662
Standard Deviation of TB using NumPy : 0.05868560300448484
Standard Deviation of BB using NumPy : 3.7682887362833544


In [7]:
# z-scores built from the pandas standard deviations ...
df['zTB PD'] = df['TB'].sub(averageTB).div(devStdTBPd)
df['zBB PD'] = df['BB'].sub(averageBB).div(devStdBBPd)
# ... and from the NumPy standard deviations, for side-by-side comparison.
df['zTB NP'] = df['TB'].sub(averageTB).div(devStdTBNp)
df['zBB NP'] = df['BB'].sub(averageBB).div(devStdBBNp)
df

Unnamed: 0,TB,BB,(TB-avgTB)^2,(BB-avgBB)^2,zTB,zBB,zTB PD,zBB PD,zTB NP,zBB NP
0,0.8,20,0.005476,36.0,-1.196248,-1.510526,-1.196248,-1.510526,-1.260957,-1.592235
1,0.85,25,0.000576,1.0,-0.387972,-0.251754,-0.387972,-0.251754,-0.408959,-0.265372
2,0.92,21,0.002116,25.0,0.743614,-1.258772,0.743614,-1.258772,0.783838,-1.326862
3,0.88,29,3.6e-05,9.0,0.096993,0.755263,0.096993,0.755263,0.10224,0.796117
4,0.82,30,0.002916,16.0,-0.872938,1.007018,-0.872938,1.007018,-0.920158,1.06149
5,0.86,21,0.000196,25.0,-0.226317,-1.258772,-0.226317,-1.258772,-0.238559,-1.326862
6,0.87,28,1.6e-05,4.0,-0.064662,0.503509,-0.064662,0.503509,-0.06816,0.530745
7,0.93,27,0.003136,1.0,0.905269,0.251754,0.905269,0.251754,0.954237,0.265372
8,0.81,29,0.004096,9.0,-1.034593,0.755263,-1.034593,0.755263,-1.090557,0.796117
9,1.0,30,0.015876,16.0,2.036856,1.007018,2.036856,1.007018,2.147034,1.06149


#### Mengecek apakah data sudah standar
Data Terstandardisasi mean ~ 0 dan std ~ 1

In [8]:
def checkStandard(param):
    """Print a one-line verdict for a summary value of a z-score column.

    Parameters
    ----------
    param : float
        Value to classify. The notebook passes the mean of a z-score column.

    Output
    ------
    Prints exactly one of:
      * 'High Outlier!'              when param >  2.5
      * 'Low Outlier!'               when param < -2.5
      * 'Data is Standardized!'      when -1 < param < 1
      * 'Data is not Standardized!'  otherwise (including NaN)

    The function returns None.
    """
    if param > 2.5:
        print('High Outlier!')
    elif param < -2.5:
        print('Low Outlier!')
    elif abs(param) < 1:
        # Symmetric band around 0: a strongly negative value must not be
        # reported as standardized just because it is below +1.
        print('Data is Standardized!')
    else:
        # Previously values in [1, 2.5] fell through without any message.
        print('Data is not Standardized!')

# Report the mean of each z-score column and classify it with checkStandard.
for zLabel, zColumn in (
    ('zTB Pandas', 'zTB PD'),
    ('zBB Pandas', 'zBB PD'),
    ('zTB NumPy', 'zTB NP'),
    ('zBB NumPy', 'zBB NP'),
):
    zMean = df[zColumn].mean()
    print(f'{zLabel} : {zMean}')
    checkStandard(zMean)

zTB Pandas : 1.7763568394002506e-16
Data is Standardized!
zBB Pandas : -8.881784197001253e-17
Data is Standardized!
zTB NumPy : 1.3322676295501878e-16
Data is Standardized!
zBB NumPy : 0.0
Data is Standardized!


In [9]:
from sklearn.preprocessing import StandardScaler
# Single scaler instance reused by the cells below; every fit() call on it
# replaces the statistics learned by the previous fit().
scaler = StandardScaler()

def scalerTB():
    """Fit the shared ``scaler`` on column ``TB`` and return the scaled values.

    Side effect
    -----------
    The module-level ``scaler`` is refitted on ``df[['TB']]``, so later
    ``scaler.transform`` / ``scaler.inverse_transform`` calls use the TB
    statistics until the scaler is fitted again.

    Returns
    -------
    The result of ``scaler.transform(df[['TB']])``. Callers that only need
    the fitting side effect can ignore the return value.
    """
    scaler.fit(df[['TB']])
    # Return the transformed values instead of computing and discarding them.
    return scaler.transform(df[['TB']])

# Fit the shared scaler on TB first; the transform below relies on that state.
scalerTB()
# Store the scikit-learn standardized TB values next to the manual z-scores.
df['zTB SKL'] = scaler.transform(df[['TB']])
df

Unnamed: 0,TB,BB,(TB-avgTB)^2,(BB-avgBB)^2,zTB,zBB,zTB PD,zBB PD,zTB NP,zBB NP,zTB SKL
0,0.8,20,0.005476,36.0,-1.196248,-1.510526,-1.196248,-1.510526,-1.260957,-1.592235,-1.260957
1,0.85,25,0.000576,1.0,-0.387972,-0.251754,-0.387972,-0.251754,-0.408959,-0.265372,-0.408959
2,0.92,21,0.002116,25.0,0.743614,-1.258772,0.743614,-1.258772,0.783838,-1.326862,0.783838
3,0.88,29,3.6e-05,9.0,0.096993,0.755263,0.096993,0.755263,0.10224,0.796117,0.10224
4,0.82,30,0.002916,16.0,-0.872938,1.007018,-0.872938,1.007018,-0.920158,1.06149,-0.920158
5,0.86,21,0.000196,25.0,-0.226317,-1.258772,-0.226317,-1.258772,-0.238559,-1.326862,-0.238559
6,0.87,28,1.6e-05,4.0,-0.064662,0.503509,-0.064662,0.503509,-0.06816,0.530745,-0.06816
7,0.93,27,0.003136,1.0,0.905269,0.251754,0.905269,0.251754,0.954237,0.265372,0.954237
8,0.81,29,0.004096,9.0,-1.034593,0.755263,-1.034593,0.755263,-1.090557,0.796117,-1.090557
9,1.0,30,0.015876,16.0,2.036856,1.007018,2.036856,1.007018,2.147034,1.06149,2.147034


In [10]:
def scalerBB():
    """Fit the shared ``scaler`` on column ``BB`` and return the scaled values.

    Side effect
    -----------
    The module-level ``scaler`` is refitted on ``df[['BB']]``, so later
    ``scaler.transform`` / ``scaler.inverse_transform`` calls use the BB
    statistics until the scaler is fitted again.

    Returns
    -------
    The result of ``scaler.transform(df[['BB']])``. Callers that only need
    the fitting side effect can ignore the return value.
    """
    scaler.fit(df[['BB']])
    # Return the transformed values instead of computing and discarding them.
    return scaler.transform(df[['BB']])
# Refit the shared scaler on BB first; the transform below relies on that state.
scalerBB()
# Store the scikit-learn standardized BB values next to the manual z-scores.
df['zBB SKL'] = scaler.transform(df[['BB']])
df

Unnamed: 0,TB,BB,(TB-avgTB)^2,(BB-avgBB)^2,zTB,zBB,zTB PD,zBB PD,zTB NP,zBB NP,zTB SKL,zBB SKL
0,0.8,20,0.005476,36.0,-1.196248,-1.510526,-1.196248,-1.510526,-1.260957,-1.592235,-1.260957,-1.592235
1,0.85,25,0.000576,1.0,-0.387972,-0.251754,-0.387972,-0.251754,-0.408959,-0.265372,-0.408959,-0.265372
2,0.92,21,0.002116,25.0,0.743614,-1.258772,0.743614,-1.258772,0.783838,-1.326862,0.783838,-1.326862
3,0.88,29,3.6e-05,9.0,0.096993,0.755263,0.096993,0.755263,0.10224,0.796117,0.10224,0.796117
4,0.82,30,0.002916,16.0,-0.872938,1.007018,-0.872938,1.007018,-0.920158,1.06149,-0.920158,1.06149
5,0.86,21,0.000196,25.0,-0.226317,-1.258772,-0.226317,-1.258772,-0.238559,-1.326862,-0.238559,-1.326862
6,0.87,28,1.6e-05,4.0,-0.064662,0.503509,-0.064662,0.503509,-0.06816,0.530745,-0.06816,0.530745
7,0.93,27,0.003136,1.0,0.905269,0.251754,0.905269,0.251754,0.954237,0.265372,0.954237,0.265372
8,0.81,29,0.004096,9.0,-1.034593,0.755263,-1.034593,0.755263,-1.090557,0.796117,-1.090557,0.796117
9,1.0,30,0.015876,16.0,2.036856,1.007018,2.036856,1.007018,2.147034,1.06149,2.147034,1.06149


In [11]:
# The shared scaler holds the statistics of one fit at a time, so it is
# refitted on the matching column before each inverse transform.
# inverse_transform is documented for 2-D input (n_samples, n_features);
# a 1-D Series is outside that contract and may be rejected by input
# validation. Reshape to a single-feature column and flatten the result so
# invTB / invBB remain 1-D arrays.
scalerTB()
invTB = scaler.inverse_transform(df['zTB SKL'].to_numpy().reshape(-1, 1)).ravel()
scalerBB()
invBB = scaler.inverse_transform(df['zBB SKL'].to_numpy().reshape(-1, 1)).ravel()
print(invTB)
print(invBB)

[0.8  0.85 0.92 0.88 0.82 0.86 0.87 0.93 0.81 1.  ]
[20. 25. 21. 29. 30. 21. 28. 27. 29. 30.]


In [12]:
# Fit once on both columns, then scale them together; fit() returns the
# scaler itself, so the transform can be chained onto it.
z = scaler.fit(df[['TB', 'BB']]).transform(df[['TB', 'BB']])

# Column 0 of the result corresponds to TB, column 1 to BB (input order).
sklTB, sklBB = z[:, 0], z[:, 1]
df['zTB SKL Combine'] = sklTB
df['zBB SKL Combine'] = sklBB
df

Unnamed: 0,TB,BB,(TB-avgTB)^2,(BB-avgBB)^2,zTB,zBB,zTB PD,zBB PD,zTB NP,zBB NP,zTB SKL,zBB SKL,zTB SKL Combine,zBB SKL Combine
0,0.8,20,0.005476,36.0,-1.196248,-1.510526,-1.196248,-1.510526,-1.260957,-1.592235,-1.260957,-1.592235,-1.260957,-1.592235
1,0.85,25,0.000576,1.0,-0.387972,-0.251754,-0.387972,-0.251754,-0.408959,-0.265372,-0.408959,-0.265372,-0.408959,-0.265372
2,0.92,21,0.002116,25.0,0.743614,-1.258772,0.743614,-1.258772,0.783838,-1.326862,0.783838,-1.326862,0.783838,-1.326862
3,0.88,29,3.6e-05,9.0,0.096993,0.755263,0.096993,0.755263,0.10224,0.796117,0.10224,0.796117,0.10224,0.796117
4,0.82,30,0.002916,16.0,-0.872938,1.007018,-0.872938,1.007018,-0.920158,1.06149,-0.920158,1.06149,-0.920158,1.06149
5,0.86,21,0.000196,25.0,-0.226317,-1.258772,-0.226317,-1.258772,-0.238559,-1.326862,-0.238559,-1.326862,-0.238559,-1.326862
6,0.87,28,1.6e-05,4.0,-0.064662,0.503509,-0.064662,0.503509,-0.06816,0.530745,-0.06816,0.530745,-0.06816,0.530745
7,0.93,27,0.003136,1.0,0.905269,0.251754,0.905269,0.251754,0.954237,0.265372,0.954237,0.265372,0.954237,0.265372
8,0.81,29,0.004096,9.0,-1.034593,0.755263,-1.034593,0.755263,-1.090557,0.796117,-1.090557,0.796117,-1.090557,0.796117
9,1.0,30,0.015876,16.0,2.036856,1.007018,2.036856,1.007018,2.147034,1.06149,2.147034,1.06149,2.147034,1.06149


In [18]:
# Undo the two-column scaling; this relies on the scaler still being fitted
# on ['TB', 'BB'] from the combined fit cell.
invValue = scaler.inverse_transform(df[['zTB SKL Combine', 'zBB SKL Combine']])
print(
    f'Inverted TB : {invValue[:, 0]}',
    f'Inverted BB : {invValue[:, 1]}',
    sep='\n',
)

Inverted TB : [0.8  0.85 0.92 0.88 0.82 0.86 0.87 0.93 0.81 1.  ]
Inverted BB : [20. 25. 21. 29. 30. 21. 28. 27. 29. 30.]
