In [6]:
import numpy as np
import pandas as pd
import pickle5 as pickle

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Display the installed scikit-learn version so the results can be tied to it.
sklearn.__version__

'0.22.1'

In [731]:
# Load the pickled training frame. The context manager closes the file handle
# even when unpickling raises.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
with open("../data/train_pickle", "rb") as file_train:
    df_train = pickle.load(file_train)

# HighValue is the label. SalePrice is removed from the features as well
# (presumably because HighValue is derived from it - confirm).
X = df_train.drop(['HighValue','SalePrice'], axis = 1)
y = df_train.HighValue

In [721]:
# Sanity check: (rows, columns) of the feature frame.
X.shape

(1460, 79)

In [722]:
class EstimatorCustom(BaseEstimator, TransformerMixin):
  """Fill missing values of one column with the median of its group.

  Parameters
  ----------
  feature_to_estimate : str
      Name of the column whose missing values are imputed.
  feature_estimator : label or list of labels
      Column(s) defining the groups whose medians are used.

  Attributes (set by ``fit``)
  ---------------------------
  group_medians_ : pandas.Series
      Median of ``feature_to_estimate`` for every group seen during ``fit``.
  overall_median_ : scalar
      Median of ``feature_to_estimate`` over all rows seen during ``fit``;
      used when a row's group has no usable median (group unseen during
      ``fit``, group with only missing values, or missing group key).
  """

  def __init__(self, feature_to_estimate,feature_estimator):
    self.feature_to_estimate = feature_to_estimate
    self.feature_estimator = feature_estimator
    print('\nCustom Estimator for '+ self.feature_to_estimate + ' init() called.\n')

  def _group_keys(self):
    """Return the grouping column label(s) as a list."""
    keys = self.feature_estimator
    return list(keys) if isinstance(keys, list) else [keys]

  def _learn_medians(self, X):
    """Return (per-group medians, overall median) of the target column in X."""
    target = self.feature_to_estimate
    per_group = X.groupby(self._group_keys())[target].median()
    return per_group, X[target].median()

  def fit(self, X, y = None):
    """Learn the per-group and overall medians from X; returns self."""
    print('\nCustom Estimator for ' + self.feature_to_estimate + ' fit() called.\n')
    # Statistics are learned here so that transform() applies the *training*
    # medians to any later data instead of recomputing them from that data.
    self.group_medians_, self.overall_median_ = self._learn_medians(X)
    return self

  def transform(self, X, y = None):
    """Return a copy of X with the target column's missing values imputed."""
    print('\nCustom Estimator for ' + self.feature_to_estimate + ' transform() called.\n')
    if hasattr(self, 'group_medians_'):
      per_group, overall = self.group_medians_, self.overall_median_
    else:
      # Backward compatibility: transform() used to work without fit(), taking
      # the medians from X itself. Keep that behaviour for unfitted instances.
      per_group, overall = self._learn_medians(X)

    X_ = X.copy()
    keys = self._group_keys()
    target = self.feature_to_estimate

    # Look up each row's group median; rows without a usable group median fall
    # back to the overall median.
    looked_up = X_[keys].join(per_group.rename('_group_median'), on=keys)['_group_median']
    replacement = looked_up.fillna(overall).to_numpy()

    # Positional replacement (numpy array) avoids index alignment, so frames
    # with a duplicated index are handled too. Existing values are never touched.
    X_[target] = X_[target].where(X_[target].notna(), replacement)

    return X_

In [732]:
# Impute LotFrontage from the median of the rows sharing the same Neighborhood.
pipe_custom = Pipeline([
    ('Custom Estimator', EstimatorCustom('LotFrontage', 'Neighborhood')),
])
X1 = pipe_custom.fit_transform(X, y)

# Number of LotFrontage values still missing after imputation.
X1.LotFrontage.isnull().sum()


Custom Estimator for LotFrontage init() called.


Custom Estimator for LotFrontage fit() called.


Custom Estimator for LotFrontage transform() called.



0

In [733]:
# Columns that get a constant fill value (presumably because a missing entry
# means the house lacks that feature - confirm).
# NOTE(review): only the KEYS of this dict are used below. The per-column fill
# values are ignored; SimpleImputer fills every one of these columns with 'None'.
dict_fla = {
    'Alley': 'Feature_Level_Absent',
    'BsmtQual': 'Feature_Level_Absent',
    'BsmtCond': 'Feature_Level_Absent',
    'BsmtExposure': 'Feature_Level_Absent',
    'BsmtFinType1': 'Feature_Level_Absent',
    'BsmtFinType2': 'Feature_Level_Absent',
    'FireplaceQu': 'Feature_Level_Absent',
    'GarageType': 'Feature_Level_Absent',
    'GarageFinish': 'Feature_Level_Absent',
    'GarageQual': 'Feature_Level_Absent',
    'GarageCond': 'Feature_Level_Absent',
    'PoolQC': 'Feature_Level_Absent',
    'Fence': 'Feature_Level_Absent',
    'MiscFeature': 'None',
}
custom_features = list(dict_fla.keys())
pipe_constant = Pipeline(steps=[
    ('ImputerConstant', SimpleImputer(strategy='constant', fill_value='None')),
    ('OrdinalEncoder', OrdinalEncoder())])

# Numeric columns (targets excluded): median imputation, then standardisation.
numeric_features = df_train.select_dtypes(include=['int64', 'float64','int32','float32']).drop(['SalePrice','HighValue'], axis=1).columns
pipe_median = Pipeline(steps=[
    ('ImputerMedian', SimpleImputer(strategy='median')),
    ('Scaler', StandardScaler())])

# Remaining object/category columns: most-frequent imputation, then ordinal encoding.
# Filtering in frame order (instead of a set difference) keeps the column order
# identical across kernel restarts; set iteration order of strings is not stable.
categorical_features = [
    column
    for column in df_train.select_dtypes(include=['object','category']).columns
    if column not in dict_fla
]
pipe_mode = Pipeline(steps=[
    ('ImputerMode', SimpleImputer(strategy='most_frequent')),
    ('OrdinalEncoder', OrdinalEncoder())])

# Output column order of the preprocessing: constant-filled, numeric, categorical.
features = list(custom_features) + list(numeric_features) + list(categorical_features)
len(features)

79

In [734]:
# Route each column group through its own imputation/encoding pipeline.
# Output columns are concatenated in the order the groups are listed here.
# The step name 'Mode Iputer' (sic) is kept as-is: it is a runtime identifier
# used in parameter/transformer lookups.
preprocessor = ColumnTransformer(
    transformers=[
        ('Constant Imputer', pipe_constant, custom_features),      # 14 columns
        ('Median Imputer', pipe_median, numeric_features),        # 36 columns
        ('Mode Iputer', pipe_mode, categorical_features),        # remaining categorical columns (79 - 14 - 36 = 29)
    ])


In [735]:
# Single-step pipeline wrapping the preprocessor (no model step is attached here).
clf = Pipeline([("preprocessor", preprocessor)])

In [738]:
# Run the preprocessing; the result is an array whose columns follow the order
# of `features` (constant-filled, numeric, categorical).
X2 = clf.fit_transform(X1, y)
# Keep X1's row index so the processed rows stay aligned with `y`.
# NOTE: this rebinds `X` (previously the raw feature frame) to the processed data,
# so earlier cells that read `X` must not be re-run without reloading it.
X = pd.DataFrame(data=X2, columns=features, index=X1.index)
X.shape

(1460, 79)

In [737]:
from sklearn import set_config

# The option must be passed by keyword: the first positional parameter of
# set_config is `assume_finite`, so set_config('diagram') would switch off
# finite-value validation globally instead of changing the display mode.
try:
    set_config(display='diagram')
except TypeError:
    # scikit-learn < 0.23 has no `display` option; keep the plain-text repr.
    pass
clf

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('Constant Imputer',
                                                  Pipeline(memory=None,
                                                           steps=[('ImputerConstant',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='None',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                    