In [1]:
import sklearn
import pandas as pd
import numpy as np
import scipy

In [2]:
sklearn.__version__, pd.__version__, np.__version__, scipy.__version__

('0.22.2.post1', '1.0.1', '1.18.1', '1.4.1')

## sparse to dense matrix

In [3]:
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin,BaseEstimator

from sklearn.pipeline import Pipeline

In [4]:
data = scipy.sparse.csr_matrix([
    [1.,0.,0.,0.,0.,0.],
    [0.,1.,0.,0.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [0.,0.,0.,0.,1.,0.],
    [0.,0.,0.,1.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
])

target = np.array([1,1,1,0,0,1,1,1])

In [27]:
class ToDenseTransformer():
    
    # here you define the operation it should perform
    def transform(self, X, y=None, **fit_params):
        return X.todense()

#     # just return self
#     def fit(self, X, y=None, **fit_params):
#         return self

# need to make matrices dense because PCA does not work with sparse vectors.
pipeline = Pipeline([
    ('to_dense',ToDenseTransformer()),
    ('pca',PCA()),
    ('clf',DecisionTreeClassifier())
])

pipeline.fit(data,target)
pipeline.predict(data)

TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.

## missing imputation

In [6]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [7]:
df = pd.DataFrame({
    'name':['alice','bob','charlie','david','edward'],
    'age':[24,32,np.nan,38,20]
})

df.head()

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,
3,david,38.0
4,edward,20.0


In [8]:
transformer_step = ColumnTransformer([
        ('impute_mean', SimpleImputer(strategy='mean'), ['age'])
    ], remainder='passthrough')

In [9]:
pipe = Pipeline([
    ('transformer', transformer_step)
])

In [10]:
pipe.fit(df)

pd.DataFrame(
    data=pipe.transform(df),
    columns=['age', 'name']
)[["name","age"]]

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,28.5
3,david,38.0
4,edward,20.0


## Select columns

In [13]:
import pandas as pd

from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline

In [18]:
class SelectColumnsTransfomer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns


    def transform(self, X, **transform_params):
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        return self

In [19]:
df = pd.DataFrame({
    'name':['alice','bob','charlie','david','edward'],
    'age':[24,32,np.nan,38,20]
})

df.head()

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,
3,david,38.0
4,edward,20.0


In [20]:
pipe = Pipeline([
    ('selector', SelectColumnsTransfomer(["name"]))
])

In [21]:
pipe.fit_transform(df)

Unnamed: 0,name
0,alice
1,bob
2,charlie
3,david
4,edward
