In [3]:
import pandas as pd
import tensorflow as tf 
import numpy as np 
import matplotlib.pyplot as plt

In [16]:
from sklearn.preprocessing import normalize, minmax_scale

In [6]:
# Load the dataset; the path is relative, so it is resolved against the
# notebook's current working directory.
df = pd.read_csv('datasets/dataset2.csv')

In [14]:
# Preview the first ten raw values of the column that is normalized below.
df['average_montly_hours'].head(10)

0    157
1    262
2    272
3    223
4    159
5    153
6    247
7    259
8    224
9    142
Name: average_montly_hours, dtype: int64

In [7]:
# Extract the column as a plain NumPy array (the pandas index is dropped).
hours = df['average_montly_hours'].values

## Normalization using scikit
### L1-norm:
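The L1 norm of a vector is the sum of the absolute values of its entries: $\lVert x \rVert_1 = \sum_i |x_i|$. The same norm underlies the loss function known as least absolute deviations (LAD) or least absolute errors (LAE), which minimizes the sum $S$ of the absolute differences between the target values $y_i$ and the estimated values $f(x_i)$: $S = \sum_i |y_i - f(x_i)|$.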
<img src="images/l1-norm-formula.png">

Put simply, L1 normalization adds up the absolute values of all entries in the array and divides each entry by that sum, so the absolute values of the result sum to 1.

In [10]:
# L1 normalization of the column: each entry is divided by the column's L1 norm.
# Passing an (n_samples, 1) column with axis=0 normalizes that single feature
# directly, so no row-vector round trip is needed.
normalize(
    df['average_montly_hours'].astype(float).values.reshape(-1, 1),
    norm='l1',
    axis=0,
)[:10]

array([[  5.20634019e-05],
       [  8.68828746e-05],
       [  9.01990148e-05],
       [  7.39499276e-05],
       [  5.27266300e-05],
       [  5.07369458e-05],
       [  8.19086642e-05],
       [  8.58880325e-05],
       [  7.42815416e-05],
       [  4.70891916e-05]])

### L2-norm
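The L2 (Euclidean) norm of a vector is the square root of the sum of its squared entries: $\lVert x \rVert_2 = \sqrt{\sum_i x_i^2}$. The same norm underlies the least squares loss, which minimizes the sum $S$ of the squared differences between the target values $y_i$ and the estimated values $f(x_i)$: $S = \sum_i (y_i - f(x_i))^2$.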
<img src="images/l2-norm-formula.png">

Put simply, L2 normalization squares every value in the array, sums the squares, takes the square root of that sum, and divides every value by the result, so the normalized vector has unit Euclidean length.

In [11]:
# L2 normalization of the column: each entry is divided by the column's
# Euclidean norm. An (n_samples, 1) column with axis=0 normalizes the single
# feature directly, so no row-vector round trip is needed.
normalize(
    df['average_montly_hours'].astype(float).values.reshape(-1, 1),
    norm='l2',
    axis=0,
)[:10]

array([[ 0.00618817],
       [ 0.01032675],
       [ 0.0107209 ],
       [ 0.00878956],
       [ 0.006267  ],
       [ 0.00603051],
       [ 0.00973552],
       [ 0.0102085 ],
       [ 0.00882898],
       [ 0.00559694]])

### Max-norm
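It is also known as max normalization. In this approach, every value is divided by the largest absolute value in the data, so the result lies in a fixed range: $[-1, 1]$ in general, and $[0, 1]$ when all values are non-negative (as with working hours here).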

Put simply, it divides every value by the maximum value in the array, so the largest entry becomes 1.

In [12]:
# Max normalization of the column: each entry is divided by the column maximum.
# An (n_samples, 1) column with axis=0 normalizes the single feature directly,
# so no row-vector round trip is needed.
normalize(
    df['average_montly_hours'].astype(float).values.reshape(-1, 1),
    norm='max',
    axis=0,
)[:10]

array([[ 0.50645161],
       [ 0.84516129],
       [ 0.87741935],
       [ 0.71935484],
       [ 0.51290323],
       [ 0.49354839],
       [ 0.79677419],
       [ 0.83548387],
       [ 0.72258065],
       [ 0.45806452]])

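### Min-Max-norm
Min-max scaling shifts and rescales the data so that the smallest value maps to 0 and the largest to 1: $x' = \dfrac{x - x_{\min}}{x_{\max} - x_{\min}}$.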

In [18]:
# Min-max scaling of the column onto [0, 1] (the default feature_range, spelled
# out here). Selecting with double brackets yields an (n_samples, 1) array
# directly, so no manual reshape is needed.
minmax_scale(
    df[['average_montly_hours']].astype(float).values,
    feature_range=(0, 1),
)

array([[ 0.28504673],
       [ 0.77570093],
       [ 0.82242991],
       ..., 
       [ 0.21962617],
       [ 0.85981308],
       [ 0.28971963]])