In [650]:
import numpy as np
import pandas as pd

from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, PowerTransformer,OrdinalEncoder

from sklearn.metrics import mean_squared_error

import warnings
import pickle5 as pickle
warnings.filterwarnings('ignore')


#from sklearn.compose import TransformedTargetRegressor

In [664]:
file_train = open("../data/train_pickle","rb")
df_train = pickle.load(file_train)

X = df_train.drop(['HighValue','SalePrice'], axis = 1)
y = df_train.HighValue


#file_train = open("../data/train_model_ready","rb")
#df_train = pickle.load(file_train)

In [665]:
X.shape

(1460, 79)

In [652]:
class EstimatorCustom(BaseEstimator, TransformerMixin):
    
  def __init__(self, feature_to_estimate,feature_estimator):
    self.feature_to_estimate = feature_to_estimate
    self.feature_estimator = feature_estimator
    print('\nCustom Estimator for '+ self.feature_to_estimate + ' init() called.\n')

  def fit(self, X, y = None):
    print('\nCustom Estimator for ' + self.feature_to_estimate + ' fit() called.\n')
    return self
    
  def transform(self, X, y = None):
    print('\nCustom Estimator for ' + self.feature_to_estimate + ' transform() called.\n')
    X_ = X.copy() 
    X_[self.feature_to_estimate] = X_.groupby(self.feature_estimator)[self.feature_to_estimate].transform(lambda x: x.fillna(x.median()))
    
    return X_

In [666]:
pipe_custom = Pipeline(steps=[('Custom Estimator',EstimatorCustom('LotFrontage','Neighborhood'))])
X1 = pipe_custom.fit_transform(X,y)

X1['LotFrontage'].isna().sum()


Custom Estimator for LotFrontage init() called.


Custom Estimator for LotFrontage fit() called.


Custom Estimator for LotFrontage transform() called.



0

In [667]:
X1.shape

(1460, 79)

In [704]:
dict_fla = {'Alley':'Feature_Level_Absent','BsmtQual':'Feature_Level_Absent','BsmtCond':'Feature_Level_Absent','BsmtExposure':'Feature_Level_Absent','BsmtFinType1':'Feature_Level_Absent','BsmtFinType2':'Feature_Level_Absent','FireplaceQu':'Feature_Level_Absent','GarageType':'Feature_Level_Absent','GarageFinish':'Feature_Level_Absent','GarageQual':'Feature_Level_Absent','GarageCond':'Feature_Level_Absent','PoolQC':'Feature_Level_Absent','Fence':'Feature_Level_Absent','MiscFeature':'None'}
custom_features = list(dict_fla.keys())
pipe_constant = Pipeline(steps=[
    ('ImputerConstant', SimpleImputer(strategy='constant', fill_value='None')),
    ('OrdinalEncoder', OrdinalEncoder())])

numeric_features = df_train.select_dtypes(include=['int64', 'float64','int32','float32']).drop(['SalePrice','HighValue'], axis=1).columns
pipe_median = Pipeline(steps=[
    ('ImputerMedian', SimpleImputer(strategy='median')),
    ('Scaler', StandardScaler())])

categorical_features = df_train.select_dtypes(include=['object','category']).columns
categorical_features = list (set(categorical_features) - set (custom_features))
pipe_mode = Pipeline(steps=[
    ('ImputerMode', SimpleImputer(strategy='most_frequent')),
    ('OrdinalEncoder', OrdinalEncoder())])

#pipe_custom = Pipeline(steps=[('Custom Estimator',EstimatorCustom('LotFrontage','Neighborhood')),('OrdinalEncoder', OrdinalEncoder())])
features = list(custom_features) + list(numeric_features) + list(categorical_features)
len(features)

79

In [705]:
#from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('Constant Imputer', pipe_constant, custom_features),      #14 columns
        ('Median Imputer', pipe_median, numeric_features),        #36 columns
        ('Mode Iputer', pipe_mode, categorical_features),        #43 columna
    ])


In [706]:
clf = Pipeline(steps=[('preprocessor', preprocessor)])

In [707]:
X2 = clf.fit_transform(X1, y)

In [708]:
X2.shape

(1460, 79)

In [709]:
cols = custom_features.append(numeric_features) #cols.extend(categorical_features)
cols = custom_features
X3 = pd.DataFrame(columns = features,data=X2)

In [711]:
X3.columns

Index(['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
       'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'RoofStyle', 'Electrical', 'Condition1',
       'ExterCond', 'MSZoning', 'MasVnrType', 'Foundation', 'RoofMatl',
       'BldgType', 'LotShape', 'LandContour', 'KitchenQual', 'SaleCondition',
       'CentralAir', 'LandSlope', 'Exterior1

In [649]:
#preprocessor = ColumnTransformer(transformers=[
#    ('num', numeric_transformer, selector(dtype_exclude="category")),
#    ('cat', categorical_transformer, selector(dtype_include="category"))
#])

In [608]:
X2

array([[-0.23187687,  0.        ],
       [ 0.43704276,  0.        ],
       [-0.09809294,  0.        ],
       ...,
       [-0.18728222,  0.        ],
       [-0.09809294,  0.        ],
       [ 0.21406955,  0.        ]])

In [572]:
from sklearn import set_config
set_config('diagram')
clf

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('Median Imputer',
                                                  Pipeline(memory=None,
                                                           steps=[('ImputerMedian',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                            

In [712]:
clf = Pipeline(steps=[('classifier', LogisticRegression())])
clf.fit(X3, y)
print("model score: %.3f" % clf.score(X2, y))

model score: 0.945


In [537]:
from sklearn.ensemble import RandomForestClassifier

model = Pipeline(steps=[('classifier', RandomForestClassifier())])

In [508]:
param_grid = { 
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth' : [4,5,6,7,8],
    'classifier__criterion' :['gini', 'entropy']}

from sklearn.model_selection import GridSearchCV
CV = GridSearchCV(model, param_grid, n_jobs= 1)
                  
CV.fit(X2, y)  
print(CV.best_params_)    
print(CV.best_score_)


{'classifier__criterion': 'gini', 'classifier__max_depth': 8, 'classifier__max_features': 'auto', 'classifier__n_estimators': 200}
0.9335616438356164


In [538]:
model.fit(X2, y)

Pipeline(memory=None,
         steps=[('classifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [539]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]

for classifier in classifiers:
    pipe = Pipeline(steps=[('classifier', classifier)])
    pipe.fit(X2, y)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X2, y))
    classifier.get_params()

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
model score: 0.926
SVC(C=0.025, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
model score: 0.708
NuSVC(break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, nu=0.5, probability=True, random_state=None, shrinking=True,
      tol=0.001, verbose=False)
model score: 0.860
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                  

In [514]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
    ]

for classifier in classifiers:

    pipe = Pipeline(steps=[('classifier', classifier)])
    
    param_grid = {} 
    #'classifier__n_estimators': [200, 500],
    #'classifier__max_features': ['auto', 'sqrt', 'log2'],
    #'classifier__max_depth' : [4,5,6,7,8],
    #'classifier__criterion' :['gini', 'entropy']}
    
    from sklearn.model_selection import GridSearchCV
    CV = GridSearchCV(pipe, param_grid, n_jobs= 1)
                  
    CV.fit(X2, y)  
    print(CV.best_params_)    
    print(CV.best_score_)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X2, y))

{}
0.8739726027397261
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')


NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [398]:
import pandas as pd
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
import os
import time

import warnings
warnings.filterwarnings("ignore")

def MAE(y, ypred):
    
    import numpy as np
    
    return np.sum([abs(y[i]-ypred[i]) for i in range(len(y))]) / len(y)   

In [399]:
parameters = {}
parameters['imp__strategy'] = ['mean', 'median', 'most_frequent']
parameters['feat_select__k'] = [5, 10]

CV = GridSearchCV(model_workflow, parameters, scoring = 'mean_absolute_error', n_jobs= 1)
CV.fit(X_, y)  

ValueError: 'mean_absolute_error' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.