## Transformers

### Function Transformers

We can use a `FunctionTransformer` when the transformation does not require any training. It simply takes the data, performs some calculation on it, and returns the result.

In [17]:
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import FunctionTransformer

#### log transformer

In [19]:
log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)

In [20]:
# Sample 20 integers from NumPy's global random generator.
# NOTE(review): no seed is set in the visible cells, so `arr` (and every output
# derived from it below) presumably changes on a fresh run -- confirm whether an
# earlier cell seeds NumPy.
arr = np.random.randint(1,100, size=(20,))

In [21]:
arr

array([76, 27, 53, 51, 63, 62, 61,  3, 72, 58,  6,  4, 43, 84, 71, 13, 14,
       73, 26, 77])

In [22]:
log_transformer.transform(arr)

array([4.33073334, 3.29583687, 3.97029191, 3.93182563, 4.14313473,
       4.12713439, 4.11087386, 1.09861229, 4.27666612, 4.06044301,
       1.79175947, 1.38629436, 3.76120012, 4.4308168 , 4.26267988,
       2.56494936, 2.63905733, 4.29045944, 3.25809654, 4.34380542])

In [23]:
# Round-trip check: go through the transformer's public inverse_transform()
# rather than reaching into the raw `inverse_func` attribute.
log_transformer.inverse_transform(log_transformer.transform(arr))

array([76., 27., 53., 51., 63., 62., 61.,  3., 72., 58.,  6.,  4., 43.,
       84., 71., 13., 14., 73., 26., 77.])

#### rbf transformer

In [25]:
# RBF similarity of every sample to the single reference point 35.
# With Y=[[35]] and gamma=0.1 the outputs shown below match exp(-0.1 * (x - 35)**2),
# e.g. x=27 -> exp(-6.4) ~= 1.66e-3.
rbf_transformer = FunctionTransformer(rbf_kernel, kw_args=dict(Y=[[35]], gamma=0.1))

In [27]:
rbf_transformer.transform(arr.reshape(20,1))

array([[9.88775261e-074],
       [1.66155727e-003],
       [8.48904403e-015],
       [7.62186519e-012],
       [8.93948745e-035],
       [2.18742046e-032],
       [4.38220836e-030],
       [3.37477679e-045],
       [3.50820875e-060],
       [1.06126027e-023],
       [2.99112162e-037],
       [1.83780864e-042],
       [1.66155727e-003],
       [5.31979498e-105],
       [5.19320116e-057],
       [9.55316054e-022],
       [7.04065961e-020],
       [1.94033543e-063],
       [3.03539138e-004],
       [2.45727291e-077]])

### Custom Transformer

In [38]:
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin

In [40]:
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Map samples to their RBF similarity with k-means cluster centres.

    ``fit`` runs ``KMeans`` on the training data; ``transform`` returns
    ``rbf_kernel(X, cluster_centers_, gamma)``, i.e. one output column per
    fitted cluster centre.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of clusters passed to ``KMeans``.
    gamma : float, default=10
        ``gamma`` passed to ``rbf_kernel`` in ``transform``.
    random_state : default=None
        Passed unchanged to ``KMeans``.

    Attributes
    ----------
    kmeans_ : KMeans
        The fitted clusterer; set by ``fit``.
    """

    def __init__(self, n_clusters=10, gamma=10, random_state=None):
        # Only store the hyper-parameters, unmodified; no work happens here.
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on ``X`` and return ``self``.

        ``y`` is accepted but ignored so the transformer can be called as
        ``fit(X, y)``. ``sample_weight`` is forwarded to ``KMeans.fit``.
        """
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters, random_state=self.random_state
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self so calls can be chained

    def transform(self, X):
        """Return the RBF similarity of each row of ``X`` to every cluster centre."""
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, input_features=None):
        """Return one output name per fitted cluster centre.

        ``input_features`` is accepted for API compatibility and ignored: the
        output columns correspond to cluster centres, not to input columns.
        """
        # Use the fitted centres so the names always match transform()'s columns.
        n_centres = self.kmeans_.cluster_centers_.shape[0]
        return np.asarray(
            [f"Cluster {i} similarity" for i in range(n_centres)], dtype=object
        )
        

## Transformer Pipeline

We can use a pipeline to chain several transformers, so that the output of each step becomes the input of the next.

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [44]:
# Numeric preprocessing: median imputation followed by standardisation.
num_transformer = Pipeline(
    steps=[
        ("Imputer", SimpleImputer(strategy="median")),
        ("SScaler", StandardScaler()),
    ]
)

In [45]:
num_transformer

In [47]:
num_transformer.fit_transform(arr.reshape(20,1))

array([[ 1.08618477],
       [-0.73964898],
       [ 0.22916077],
       [ 0.15463694],
       [ 0.6017799 ],
       [ 0.56451799],
       [ 0.52725607],
       [-1.63393489],
       [ 0.93713712],
       [ 0.41547033],
       [-1.52214916],
       [-1.59667298],
       [-0.14345837],
       [ 1.38428008],
       [ 0.8998752 ],
       [-1.26131576],
       [-1.22405385],
       [ 0.97439903],
       [-0.77691089],
       [ 1.12344668]])

## Use `make_pipeline` if you don't want to name each estimator yourself

In [48]:
from sklearn.pipeline import make_pipeline

In [49]:
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [50]:
num_pipe.fit_transform(arr.reshape(20,1))

array([[ 1.08618477],
       [-0.73964898],
       [ 0.22916077],
       [ 0.15463694],
       [ 0.6017799 ],
       [ 0.56451799],
       [ 0.52725607],
       [-1.63393489],
       [ 0.93713712],
       [ 0.41547033],
       [-1.52214916],
       [-1.59667298],
       [-0.14345837],
       [ 1.38428008],
       [ 0.8998752 ],
       [-1.26131576],
       [-1.22405385],
       [ 0.97439903],
       [-0.77691089],
       [ 1.12344668]])

In [51]:
num_pipe.get_feature_names_out()

array(['x0'], dtype=object)

## `ColumnTransformer()` can be used to apply different pipelines to different columns

In [69]:
names = np.array(['tinks', 'pinks', 'sinks', 'dinks', 'minks', np.nan, 'dinks'], dtype="object").reshape(7,1)

In [70]:
from sklearn.preprocessing import OneHotEncoder
cate_pipeline = Pipeline([
    ("Cate Imputer", SimpleImputer(strategy="most_frequent")),
    ("Cate Encoder", OneHotEncoder()),
])

In [71]:
transformed = cate_pipeline.fit_transform(names)

In [73]:
transformed.toarray()

array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

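Combining pipelines with `ColumnTransformer`: each entry is a `(name, transformer, columns)` tuple, where `columns` selects columns of the input (by index or name) rather than holding the data itself.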

In [74]:
from sklearn.compose import ColumnTransformer

In [75]:
preprocessing = ColumnTransformer([
    ("cate", cate_pipeline, names),
    ("nume", num_transformer, arr.reshape(20,1))
])

In [78]:
preprocessing.fit_transform(arr)

IndexError: tuple index out of range