In [17]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OrdinalEncoder

import pickle5 as pickle

import sklearn
sklearn.__name__,sklearn.__version__

('sklearn', '0.23.1')

In [18]:
df_train = pd.read_csv('../data/train.csv', index_col='Id', parse_dates=True).sort_index()
df_test = pd.read_csv('../data/test.csv', index_col='Id', parse_dates=True).sort_index()

(df_train.shape, df_test.shape)

((1460, 80), (1459, 79))

In [19]:
high_value_threshold = 200000.00
y = np.where(df_train['SalePrice'] > 200000, 1, 0)

df_train['TrainSet'] = 'Y'
df_test['TrainSet'] = 'N'

#Append Train and Test
df_data = pd.concat([df_train.drop(['SalePrice'], axis=1), df_test], ignore_index=True)
X = df_data

X.shape, y.shape, X.columns

((2919, 80),
 (1460,),
 Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
        'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
        'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
        'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
        'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
        'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
        'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
        'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
        'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
        'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'G

In [20]:
class EstimatorCustom(BaseEstimator, TransformerMixin):
    
  def __init__(self, feature_to_estimate,feature_estimator):
    self.feature_to_estimate = feature_to_estimate
    self.feature_estimator = feature_estimator
    print('\nCustom Estimator for '+ self.feature_to_estimate + ' init() called.\n')

  def fit(self, X, y = None):
    print('\nCustom Estimator for ' + self.feature_to_estimate + ' fit() called.\n')
    return self
    
  def transform(self, X, y = None):
    print('\nCustom Estimator for ' + self.feature_to_estimate + ' transform() called.\n')
    X_ = X.copy() 
    X_[self.feature_to_estimate] = X_.groupby(self.feature_estimator)[self.feature_to_estimate].transform(lambda x: x.fillna(x.median()))
    
    return X_

In [21]:
pipe_custom = Pipeline(steps=[('Custom Estimator Lot Frontage',EstimatorCustom('LotFrontage','Neighborhood'))])
X_pipe_custom = pipe_custom.fit_transform(X,0)

(X_pipe_custom['LotFrontage'].isna().sum(), X_pipe_custom.shape)


Custom Estimator for LotFrontage init() called.


Custom Estimator for LotFrontage fit() called.


Custom Estimator for LotFrontage transform() called.



(0, (2919, 80))

In [22]:
dict_fla = {'Alley':'Feature_Level_Absent','BsmtQual':'Feature_Level_Absent','BsmtCond':'Feature_Level_Absent','BsmtExposure':'Feature_Level_Absent','BsmtFinType1':'Feature_Level_Absent','BsmtFinType2':'Feature_Level_Absent','FireplaceQu':'Feature_Level_Absent','GarageType':'Feature_Level_Absent','GarageFinish':'Feature_Level_Absent','GarageQual':'Feature_Level_Absent','GarageCond':'Feature_Level_Absent','PoolQC':'Feature_Level_Absent','Fence':'Feature_Level_Absent','MiscFeature':'None'}
custom_features = list(dict_fla.keys())
pipe_constant = Pipeline(steps=[
    ('ImputerConstant', SimpleImputer(strategy='constant', fill_value='None')),
    ('OrdinalEncoder', OrdinalEncoder())])

numeric_features = X.select_dtypes(include=['int64', 'float64','int32','float32']).columns
pipe_median = Pipeline(steps=[
    ('ImputerMedian', SimpleImputer(strategy='median')),
    ('Scaler', RobustScaler())])

categorical_features = X.select_dtypes(include=['object','category']).columns
categorical_features = list (set(categorical_features) - set (custom_features))
pipe_mode = Pipeline(steps=[
    ('ImputerMode', SimpleImputer(strategy='most_frequent')),
    ('OrdinalEncoder', OrdinalEncoder())])

features = list(custom_features) + list(numeric_features) + list(categorical_features)
len(features)

80

In [23]:
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('Constant Imputer', pipe_constant, custom_features),      #14 columns
        ('Median Imputer', pipe_median, numeric_features),        #36 columns
        ('Mode Iputer', pipe_mode, categorical_features),        #43 columna
    ])

In [24]:
pipe_imputations = Pipeline(steps=[('preprocessor', preprocessor)])

X_pipe_imputations = pipe_imputations.fit_transform(X_pipe_custom, 0)
X = pd.DataFrame(columns = features, data = X_pipe_imputations)
X.shape


(2919, 80)

In [25]:
X_train = X[X['TrainSet'] == 1]
X_test = X[X['TrainSet'] == 0]

X_train.drop(['TrainSet'],axis=1, inplace = True)
X_test.drop(['TrainSet'],axis=1, inplace = True)
      
(X_train.shape, X_test.shape, len(y))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


((1460, 79), (1459, 79), 1460)

In [26]:
file_mrd_train = open("../data/X_train_03_preprocess", "wb")
file_mrd_test = open("../data/X_test_03_preprocess", "wb")
file_mrd_y = open("../data/y_train_03_preprocess", "wb")

pickle.dump(X_train,file_mrd_train)
pickle.dump(X_test,file_mrd_test)
pickle.dump(y,file_mrd_y)

file_mrd_train.close()
file_mrd_test.close()
file_mrd_y.close()

In [29]:
from sklearn import set_config
set_config(display='diagram')
pipe_custom

In [28]:
pipe_imputations