<a href="https://colab.research.google.com/github/sidjohal/ML_implemented/blob/main/2.%20data%20preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
    import numpy as np, pandas as pd, matplotlib.pyplot as plt

# Data preprocessing

## 1. Feature extraction

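    DictVectorizer - converts a list of feature dicts into a numeric matrix (one column per feature name)
    FeatureHasher - hashes feature names into a fixed number of columns; memory-efficient for large datasets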

In [None]:
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

In [None]:
# Toy records: one dict per sample, keyed by feature name.
data = [
    {'age': 4, 'height': 96.0},
    {'age': 1, 'height': 73.9},
    {'age': 3, 'height': 88.9},
    {'age': 2, 'height': 81.6},
]

In [None]:
# sparse=False asks for a dense NumPy array rather than a sparse matrix.
dv = DictVectorizer(sparse=False)
# One row per dict; in the recorded output the columns are age, then height.
dv.fit_transform(data)

array([[ 4. , 96. ],
       [ 1. , 73.9],
       [ 3. , 88.9],
       [ 2. , 81.6]])

In [None]:
# Hash feature names into a fixed-width matrix with n_features columns.
# With only 2 columns both keys collide here: the recorded output shows
# column 0 empty and column 1 holding age + height for every row.
fh = FeatureHasher(n_features=2)
# FeatureHasher is stateless, so transform() is called without a prior fit().
fh.transform(data).toarray()

array([[  0. , 100. ],
       [  0. ,  74.9],
       [  0. ,  91.9],
       [  0. ,  83.6]])

## 2. Data cleaning

### Feature imputation

    SimpleImputer
    KNNImputer

In [None]:
from sklearn.impute import SimpleImputer, KNNImputer

In [None]:
# Toy 4x2 matrix with one missing value (NaN) in each column.
x = np.array([
    [7, 1],
    [np.nan, 8],
    [2, np.nan],
    [9, 6],
])
x

array([[ 7.,  1.],
       [nan,  8.],
       [ 2., nan],
       [ 9.,  6.]])

In [None]:
# Mean imputation: each NaN is replaced by the mean of its column.
si = SimpleImputer(strategy='mean')

In [None]:
# Learn the column means from x and fill its NaNs in one step.
si.fit_transform(x)

array([[7., 1.],
       [6., 8.],
       [2., 5.],
       [9., 6.]])

In [None]:
# Run every SimpleImputer strategy on the same array for a side-by-side view.
# 'constant' is used without fill_value; the recorded output shows the gaps
# filled with 0.
ops = ['mean', 'median', 'most_frequent', 'constant']
for strategy in ops:
    imputed = SimpleImputer(strategy=strategy).fit_transform(x)
    print(imputed, strategy, "\n")

[[7. 1.]
 [6. 8.]
 [2. 5.]
 [9. 6.]] mean 

[[7. 1.]
 [7. 8.]
 [2. 6.]
 [9. 6.]] median 

[[7. 1.]
 [2. 8.]
 [2. 1.]
 [9. 6.]] most_frequent 

[[7. 1.]
 [0. 8.]
 [2. 0.]
 [9. 6.]] constant 



In [None]:
# KNN imputation: fill each gap using the 2 most similar rows.
kim = KNNImputer(n_neighbors=2)

In [None]:
# Each NaN becomes the average of that column over its 2 nearest rows
# (e.g. (7 + 9) / 2 = 8 and (1 + 6) / 2 = 3.5 in the recorded output).
kim.fit_transform(x)

array([[7. , 1. ],
       [8. , 8. ],
       [2. , 3.5],
       [9. , 6. ]])

In [None]:
# Keep a NaN-free copy of x for the transformers that follow.
# SimpleImputer() with no arguments uses its default strategy; the recorded
# output matches the strategy='mean' result shown earlier.
xt = SimpleImputer().fit_transform(x)
xt

array([[7., 1.],
       [6., 8.],
       [2., 5.],
       [9., 6.]])

### Categorical transformers

    OneHotEncoder; OrdinalEncoder;  LabelEncoder
    LabelBinarizer; MultiLabelBinarizer

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

In [None]:
# 1-D sample of category codes.
arr = np.array([1, 2, 1, 4, 2, 5])
# Show the column-vector view next to the flat array; the -1 lets NumPy infer
# the row count, so it is equivalent to passing len(arr).
arr.reshape(-1, 1), arr

(array([[1],
        [2],
        [1],
        [4],
        [2],
        [5]]), array([1, 2, 1, 4, 2, 5]))

In [None]:
# Dense output keeps the one-hot matrix readable when it is displayed.
# `sparse_output` replaces the old `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (so this cell needs scikit-learn >= 1.2).
ohe = OneHotEncoder(sparse_output=False)
# The encoder expects 2-D input, hence the reshape into a single column.
ohe.fit_transform(arr.reshape(-1,1))

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]])

In [None]:
# Same column vector as arr.reshape(-1, 1), with the row count spelled out
# explicitly via len(arr).
np.array([1,2,1,4,2,5]).reshape(len(arr),1)

array([[1],
       [2],
       [1],
       [4],
       [2],
       [5]])

In [None]:
# Flatten x into one column so all eight values are ranked together;
# the recorded output leaves the NaN entries as NaN.
OrdinalEncoder().fit_transform(x.reshape(-1,1))

array([[ 3.],
       [ 0.],
       [nan],
       [ 4.],
       [ 1.],
       [nan],
       [ 5.],
       [ 2.]])

In [None]:
# Refit the existing one-hot encoder on the 2-column x. Each column is encoded
# separately; in the recorded output NaN gets its own indicator, giving
# 4 + 4 = 8 output columns.
ohe.fit_transform(x)

array([[0., 1., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0., 0.]])

In [None]:
# Ordinal-encode x column by column: each column gets its own 0-based ranking
# and, in the recorded output, NaNs are passed through unchanged.
oe = OrdinalEncoder()
xoe = oe.fit_transform(x)
# Print the input and the encoded result together for comparison.
print(x, "\n\n", xoe)

[[ 7.  1.]
 [nan  8.]
 [ 2. nan]
 [ 9.  6.]] 

 [[ 1.  0.]
 [nan  2.]
 [ 0. nan]
 [ 2.  1.]]


In [None]:
# Target labels as a plain 1-D list.
y = [1,3,2,5,2,2]
le = LabelEncoder()
le.fit_transform(y)  # works directly on the 1-D list, no reshape to a column

array([0, 2, 1, 3, 1, 1])

### Numerical transformers

    StandardScaler; MinMaxScaler; MaxAbsScaler
    FunctionTransformer
    PolynomialFeatures (polynomial transformation)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

In [None]:
# Reminder: x still contains its NaNs; the scalers below run on the imputed
# copy xt instead.
x

array([[ 7.,  1.],
       [nan,  8.],
       [ 2., nan],
       [ 9.,  6.]])

In [None]:
# Standardize each column of the imputed array to zero mean and unit variance.
StandardScaler().fit_transform(xt)

array([[ 0.39223227, -1.56892908],
       [ 0.        ,  1.17669681],
       [-1.56892908,  0.        ],
       [ 1.17669681,  0.39223227]])

In [None]:
# Rescale each column linearly onto [0, 1] (column min -> 0, column max -> 1).
MinMaxScaler().fit_transform(xt)

array([[0.71428571, 0.        ],
       [0.57142857, 1.        ],
       [0.        , 0.57142857],
       [1.        , 0.71428571]])

#### Function Transformer
    user-defined function transformation

In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Wrap np.log2 so it can be used through the transformer interface.
ft = FunctionTransformer(np.log2)

In [None]:
# Applies log2 element-wise to the imputed (NaN-free) array xt.
ft.fit_transform(xt)

array([[2.80735492, 0.        ],
       [2.5849625 , 3.        ],
       [1.        , 2.32192809],
       [3.169925  , 2.5849625 ]])

#### Polynomial transformation

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# NOTE: this rebinds arr; the 1-D array from the categorical section is
# replaced by a single sample with two features.
arr = np.array([[3,4]])
# Degree-2 polynomial expansion of the input features.
pf = PolynomialFeatures(degree=2)

In [None]:
# For [a, b] = [3, 4] the recorded output is [1, a, b, a^2, a*b, b^2].
pf.fit_transform(arr)

array([[ 1.,  3.,  4.,  9., 12., 16.]])

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# NOTE(review): ser has shape (1, 8) -- a single row with eight columns. If
# the intent is to bin one feature with eight observations, it presumably
# needs to be a column (e.g. reshaped to (-1, 1)) -- confirm against the cell
# that uses it.
ser = np.array([[0.1,0.3,0.2, 0.5, 6, 0.7, 0.9, 1.0]])
# Equal-width bins ('uniform'), returned as integer bin indices ('ordinal').
kbd = KBinsDiscretizer(strategy='uniform', encode='ordinal')