# Transformer

### Normalize

- Normalization: Scale input vectors individually to unit norm. 
- Scaling inputs to unit norms is a common operation for text classification or clustering.
- Default on sklearn: __works on rows__

In [1]:
import numpy as np
import pandas as pd

In [60]:
df = pd.DataFrame({
    'TB': [1.72, 1.81, 1.93, 1.67, 1.85, 1.66, 1.59, 1.76, 1.88, 1.78],
    'BB': [76, 65, 87, 55, 60, 78, 98, 77, 74, 64]
})
df.head()

Unnamed: 0,TB,BB
0,1.72,76
1,1.81,65
2,1.93,87
3,1.67,55
4,1.85,60


In [61]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
ss = scaler.fit_transform(df[['TB', 'BB']])
df['ss TB'] = ss[:, 0]
df['ss BB'] = ss[:, 1]
df

Unnamed: 0,TB,BB,ss TB,ss BB
0,1.72,76,-0.444153,0.211697
1,1.81,65,0.444153,-0.683945
2,1.93,87,1.628563,1.107339
3,1.67,55,-0.937657,-1.498165
4,1.85,60,0.838957,-1.091055
5,1.66,78,-1.036358,0.374541
6,1.59,98,-1.727263,2.002981
7,1.76,77,-0.04935,0.293119
8,1.88,74,1.135059,0.048853
9,1.78,64,0.148051,-0.765367


<hr>

### C1. Manual Calculation

- Ada 3 rule normalize:
    
    - Max Normalize $\displaystyle x' = \frac {x} {x_\textrm{max}} $
    
    - L1 Normalize $\displaystyle x' = \frac {x} {\sum x}$
    
    - L2 Normalize $\displaystyle x' = \frac {x} {\sqrt{\sum x^2}} $

In [62]:
# 1. Max Normalize

x = []
for i in range(len(df)):
    data = df.iloc[i][['ss TB', 'ss BB']] / df.iloc[i][['ss TB', 'ss BB']].max()
    x.append(data.tolist())
x = np.array(x)

df['ssTB nmax'] = x[:, 0]
df['ssBB nmax'] = x[:, 1]
df

Unnamed: 0,TB,BB,ss TB,ss BB,ssTB nmax,ssBB nmax
0,1.72,76,-0.444153,0.211697,-2.09806,1.0
1,1.81,65,0.444153,-0.683945,1.0,-1.539884
2,1.93,87,1.628563,1.107339,1.0,0.679949
3,1.67,55,-0.937657,-1.498165,1.0,1.597774
4,1.85,60,0.838957,-1.091055,1.0,-1.30049
5,1.66,78,-1.036358,0.374541,-2.767007,1.0
6,1.59,98,-1.727263,2.002981,-0.862346,1.0
7,1.76,77,-0.04935,0.293119,-0.168363,1.0
8,1.88,74,1.135059,0.048853,1.0,0.04304
9,1.78,64,0.148051,-0.765367,1.0,-5.16961


In [66]:
# 2. L1 Normalize

x = []
for i in range(len(df)):
    data = df.iloc[i][['ss TB', 'ss BB']] / df.iloc[i][['ss TB', 'ss BB']].sum()
    x.append(data.tolist())
x = np.array(x)

df['ssTB nL1'] = x[:, 0]
df['ssBB nL1'] = x[:, 1]
df

Unnamed: 0,TB,BB,ss TB,ss BB,ssTB nmax,ssBB nmax,ssTB nL1,ssBB nL1
0,1.72,76,-0.444153,0.211697,-2.09806,1.0,1.910697,-0.910697
1,1.81,65,0.444153,-0.683945,1.0,-1.539884,-1.85225,2.85225
2,1.93,87,1.628563,1.107339,1.0,0.679949,0.595256,0.404744
3,1.67,55,-0.937657,-1.498165,1.0,1.597774,0.384945,0.615055
4,1.85,60,0.838957,-1.091055,1.0,-1.30049,-3.327895,4.327895
5,1.66,78,-1.036358,0.374541,-2.767007,1.0,1.565929,-0.565929
6,1.59,98,-1.727263,2.002981,-0.862346,1.0,-6.264609,7.264609
7,1.76,77,-0.04935,0.293119,-0.168363,1.0,-0.202447,1.202447
8,1.88,74,1.135059,0.048853,1.0,0.04304,0.958736,0.041264
9,1.78,64,0.148051,-0.765367,1.0,-5.16961,-0.239831,1.239831


In [67]:
# 3. L2 Normalize

x = []
for i in range(len(df)):
    data = df.iloc[i][['ss TB', 'ss BB']] / np.sqrt(np.sum(df.iloc[i][['ss TB', 'ss BB']] ** 2))
    x.append(data.tolist())
x = np.array(x)

df['ssTB nL2'] = x[:, 0]
df['ssBB nL2'] = x[:, 1]
df

Unnamed: 0,TB,BB,ss TB,ss BB,ssTB nmax,ssBB nmax,ssTB nL1,ssBB nL1,ssTB nL2,ssBB nL2
0,1.72,76,-0.444153,0.211697,-2.09806,1.0,1.910697,-0.910697,-0.902706,0.430258
1,1.81,65,0.444153,-0.683945,1.0,-1.539884,-1.85225,2.85225,0.544634,-0.838674
2,1.93,87,1.628563,1.107339,1.0,0.679949,0.595256,0.404744,0.826946,0.562281
3,1.67,55,-0.937657,-1.498165,1.0,1.597774,0.384945,0.615055,-0.53053,-0.847666
4,1.85,60,0.838957,-1.091055,1.0,-1.30049,-3.327895,4.327895,0.609566,-0.792735
5,1.66,78,-1.036358,0.374541,-2.767007,1.0,1.565929,-0.565929,-0.940467,0.339886
6,1.59,98,-1.727263,2.002981,-0.862346,1.0,-6.264609,7.264609,-0.65306,0.757306
7,1.76,77,-0.04935,0.293119,-0.168363,1.0,-0.202447,1.202447,-0.166026,0.986121
8,1.88,74,1.135059,0.048853,1.0,0.04304,0.958736,0.041264,0.999075,0.043
9,1.78,64,0.148051,-0.765367,1.0,-5.16961,-0.239831,1.239831,0.189918,-0.9818


<hr>

### C2. Sklearn ```normalize()```

In [68]:
df = pd.DataFrame({
    'TB': [1.72, 1.81, 1.93, 1.67, 1.85, 1.66, 1.59, 1.76, 1.88, 1.78],
    'BB': [76, 65, 87, 55, 60, 78, 98, 77, 74, 64]
})

scaler = StandardScaler()
ss = scaler.fit_transform(df[['TB', 'BB']])
df['ss TB'] = ss[:, 0]
df['ss BB'] = ss[:, 1]

df

Unnamed: 0,TB,BB,ss TB,ss BB
0,1.72,76,-0.444153,0.211697
1,1.81,65,0.444153,-0.683945
2,1.93,87,1.628563,1.107339
3,1.67,55,-0.937657,-1.498165
4,1.85,60,0.838957,-1.091055
5,1.66,78,-1.036358,0.374541
6,1.59,98,-1.727263,2.002981
7,1.76,77,-0.04935,0.293119
8,1.88,74,1.135059,0.048853
9,1.78,64,0.148051,-0.765367


In [69]:
from sklearn.preprocessing import normalize

In [70]:
nmax = normalize(df[['ss TB', 'ss BB']], norm='max')
nL1 = normalize(df[['ss TB', 'ss BB']], norm='l1')
nL2 = normalize(df[['ss TB', 'ss BB']], norm='l2')

df['ssTB nmax'] = nmax[:, 0]
df['ssBB nmax'] = nmax[:, 1]
df['ssTB nL1'] = nL1[:, 0]
df['ssBB nL1'] = nL1[:, 1]
df['ssTB nL2'] = nL2[:, 0]
df['ssBB nL2'] = nL2[:, 1]

df

Unnamed: 0,TB,BB,ss TB,ss BB,ssTB nmax,ssBB nmax,ssTB nL1,ssBB nL1,ssTB nL2,ssBB nL2
0,1.72,76,-0.444153,0.211697,-2.09806,1.0,-0.677217,0.322783,-0.902706,0.430258
1,1.81,65,0.444153,-0.683945,1.0,-1.539884,0.393719,-0.606281,0.544634,-0.838674
2,1.93,87,1.628563,1.107339,1.0,0.679949,0.595256,0.404744,0.826946,0.562281
3,1.67,55,-0.937657,-1.498165,1.0,1.597774,-0.384945,-0.615055,-0.53053,-0.847666
4,1.85,60,0.838957,-1.091055,1.0,-1.30049,0.43469,-0.56531,0.609566,-0.792735
5,1.66,78,-1.036358,0.374541,-2.767007,1.0,-0.734537,0.265463,-0.940467,0.339886
6,1.59,98,-1.727263,2.002981,-0.862346,1.0,-0.463043,0.536957,-0.65306,0.757306
7,1.76,77,-0.04935,0.293119,-0.168363,1.0,-0.144102,0.855898,-0.166026,0.986121
8,1.88,74,1.135059,0.048853,1.0,0.04304,0.958736,0.041264,0.999075,0.043
9,1.78,64,0.148051,-0.765367,1.0,-5.16961,0.162085,-0.837915,0.189918,-0.9818


<hr>

### C3. Sklearn ```Normalizer()```

In [71]:
df = pd.DataFrame({
    'TB': [1.72, 1.81, 1.93, 1.67, 1.85, 1.66, 1.59, 1.76, 1.88, 1.78],
    'BB': [76, 65, 87, 55, 60, 78, 98, 77, 74, 64]
})

scaler = StandardScaler()
ss = scaler.fit_transform(df[['TB', 'BB']])
df['ss TB'] = ss[:, 0]
df['ss BB'] = ss[:, 1]

df

Unnamed: 0,TB,BB,ss TB,ss BB
0,1.72,76,-0.444153,0.211697
1,1.81,65,0.444153,-0.683945
2,1.93,87,1.628563,1.107339
3,1.67,55,-0.937657,-1.498165
4,1.85,60,0.838957,-1.091055
5,1.66,78,-1.036358,0.374541
6,1.59,98,-1.727263,2.002981
7,1.76,77,-0.04935,0.293119
8,1.88,74,1.135059,0.048853
9,1.78,64,0.148051,-0.765367


In [72]:
from sklearn.preprocessing import Normalizer

In [73]:
normMax = Normalizer(norm='max')
normL1 = Normalizer(norm='l1')
normL2 = Normalizer(norm='l2')

In [76]:
nmax = normMax.fit_transform(df[['ss TB', 'ss BB']])
nL1 = normL1.fit_transform(df[['ss TB', 'ss BB']])
nL2 = normL2.fit_transform(df[['ss TB', 'ss BB']])

df['ssTB nmax'] = nmax[:, 0]
df['ssBB nmax'] = nmax[:, 1]
df['ssTB nL1'] = nL1[:, 0]
df['ssBB nL1'] = nL1[:, 1]
df['ssTB nL2'] = nL2[:, 0]
df['ssBB nL2'] = nL2[:, 1]

df

Unnamed: 0,TB,BB,ss TB,ss BB,ssTB nmax,ssBB nmax,ssTB nL1,ssBB nL1,ssTB nL2,ssBB nL2
0,1.72,76,-0.444153,0.211697,-2.09806,1.0,-0.677217,0.322783,-0.902706,0.430258
1,1.81,65,0.444153,-0.683945,1.0,-1.539884,0.393719,-0.606281,0.544634,-0.838674
2,1.93,87,1.628563,1.107339,1.0,0.679949,0.595256,0.404744,0.826946,0.562281
3,1.67,55,-0.937657,-1.498165,1.0,1.597774,-0.384945,-0.615055,-0.53053,-0.847666
4,1.85,60,0.838957,-1.091055,1.0,-1.30049,0.43469,-0.56531,0.609566,-0.792735
5,1.66,78,-1.036358,0.374541,-2.767007,1.0,-0.734537,0.265463,-0.940467,0.339886
6,1.59,98,-1.727263,2.002981,-0.862346,1.0,-0.463043,0.536957,-0.65306,0.757306
7,1.76,77,-0.04935,0.293119,-0.168363,1.0,-0.144102,0.855898,-0.166026,0.986121
8,1.88,74,1.135059,0.048853,1.0,0.04304,0.958736,0.041264,0.999075,0.043
9,1.78,64,0.148051,-0.765367,1.0,-5.16961,0.162085,-0.837915,0.189918,-0.9818
