In [2]:
import logging
import random
import sys
from optparse import OptionParser
from random import sample
from time import time

import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.sparse
import xgboost as xgb
from catboost import CatBoostClassifier, Pool, cv
from mlxtend.classifier import StackingClassifier
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import (FastICA, KernelPCA, LatentDirichletAllocation,
                                   NMF, PCA, TruncatedSVD)
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier, VotingClassifier)
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.feature_selection import (SelectFromModel, SelectKBest, chi2,
                                       mutual_info_classif)
from sklearn.linear_model import (LogisticRegression, PassiveAggressiveClassifier,
                                  Perceptron, RidgeClassifier, SGDClassifier)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.extmath import density

# Display progress logs.
# basicConfig is called without a `stream`/`filename`, so the handler it installs
# writes to sys.stderr (the logging default), not stdout.
# Records are emitted at INFO level and above as "<time> <level> <message>".
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [4]:
# Global vars:
# NOTE(review): neither constant is referenced by the definitions visible in this
# part of the notebook — confirm where they are consumed.
RANDOM_STATE     = 0      # presumably the shared RNG seed — TODO confirm
KBEST_FEATURES   = 2500   # presumably a K for K-best feature selection — TODO confirm

# Custom Transformers

In [10]:
class DenseTransformer (BaseEstimator, TransformerMixin):
    """Pipeline step that turns sparse input into a dense array.

    Any SciPy sparse container is densified (not only CSR), and the result is a
    plain ``numpy.ndarray`` rather than the ``numpy.matrix`` that ``todense()``
    produces. Non-sparse input is passed through untouched.
    """

    def fit (self, X, y=None, **fit_params):
        """Nothing is learned; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform (self, X, y=None, **fit_params):
        """Return ``X`` as a dense ndarray if it is sparse, otherwise ``X`` itself."""
        if scipy.sparse.issparse (X):
            return X.toarray ()
        return X

In [12]:
class IdentityTransformer (BaseEstimator, TransformerMixin):
    """Pass-through step that prints the type and shape of the data it receives."""

    def __init__(self):
        # No hyper-parameters to store.
        return

    def _report (self, X):
        """Print the container type and shape of ``X``."""
        print ('IdentityTransformer: type(X), X.shape =', type (X), X.shape)

    def fit (self, X, y=None):
        """Report ``X`` and return ``self``; nothing is learned."""
        self._report (X)
        return self

    def transform (self, X, y=None):
        """Report ``X`` and hand it back unchanged."""
        self._report (X)
        return X

# Find optimal PCA dims and the same no. of NMF features
The PCA does an unsupervised dimensionality reduction, while the logistic regression does the prediction.
We use a GridSearchCV to set the dimensionality of the PCA

In [13]:
class PCA_NMF_FeatureTransformer (BaseEstimator, TransformerMixin):
    """PCA features (size chosen by grid search), optionally stacked with NMF features.

    ``fit`` grid-searches a dense -> PCA -> LogisticRegression pipeline (5-fold CV)
    over ``n_components`` and ``C``, refits a whitened PCA with the winning
    ``n_components`` and, when ``isNMF`` is true, fits an NMF with the same number
    of output components.

    Parameters
    ----------
    n_components : list
        Candidate values handed unchanged to ``PCA(n_components=...)``.
    C : list
        Candidate ``C`` values for the logistic regression used for scoring.
    isNMF : bool
        Whether NMF features are fitted and appended in ``transform``.

    Requires ``PCA`` from ``sklearn.decomposition`` to be imported.
    """

    def __init__(self, n_components=[0.90, 0.95, 0.99], C=[0.1, 1, 10], isNMF=True):

        self.pca = PCA (whiten=True)
        self.nmf = None
        self.n_components = n_components
        self.C   = C
        self.isNMF = isNMF
        return

    def fit (self, X, Y, **fit_params):
        """Select the PCA size by cross-validation, then fit PCA (and NMF) on ``X``.

        Returns ``self``.
        """

        print ('Find optimal PCA dims and the same no. of NMF features for X.shape =', X.shape)
        logistic = LogisticRegression (penalty="l2", class_weight='balanced')
        pipeline = Pipeline (steps=[('to_dense', DenseTransformer()), ('pca', self.pca), ('logistic', logistic)])

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'pca__n_components'       : self.n_components,
            'logistic__C'             : self.C,
            'logistic__fit_intercept' : [True, False]
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        gridSearchCV.fit (X, Y)  # the search works on clones; self.pca is not fitted by it
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)

        # Now use the optimal params to fit the PCA on the full data.
        self.pca = PCA (n_components=gridSearchCV.best_params_['pca__n_components'], whiten=True)
        if scipy.sparse.issparse (X):
            # Any sparse format -> plain ndarray (todense() would give numpy.matrix).
            X = X.toarray ()
        self.pca.fit (X)
        pcaDim = int (self.pca.n_components_)
        print("PCA dimensionality, explainedVarRatio = ", pcaDim, self.pca.n_components)

        # Reset first so a refit with isNMF=False does not keep a stale NMF model.
        self.nmf = None
        if self.isNMF :
            self.nmf = NMF (n_components=pcaDim, random_state=1, alpha=.1, l1_ratio=.5)
            self.nmf.fit (X)
        return self


    def transform (self, X, y=None, **fit_params):
        """Return PCA features of ``X``; NMF features are appended only if NMF was fitted."""

        if scipy.sparse.issparse (X):
            X = X.toarray ()
        X_pca = self.pca.transform (X)
        if self.nmf is None:
            # isNMF=False: there is no NMF model to call.
            return X_pca
        X_nmf = self.nmf.transform (X)
        if X.ndim==1 :
            return np.concatenate ([X_pca, X_nmf])
        return np.hstack ([X_pca, X_nmf])

In [None]:
class SVD_NMF_FeatureTransformer (BaseEstimator, TransformerMixin):
    """TruncatedSVD features (size chosen by grid search), optionally stacked with NMF.

    ``n_components`` holds *fractions of the feature count*. ``fit`` converts them
    to integer candidates without overwriting the hyper-parameter, so the
    estimator can be refitted; the resolved list is exposed as
    ``candidate_n_components_``.

    Parameters
    ----------
    n_components : list of float
        Fractions of ``X.shape[1]`` to try as the SVD output size.
    C : list
        Candidate ``C`` values for the logistic regression used for scoring.
    isNMF : bool
        Whether NMF features are fitted and appended in ``transform``.
    """

    def __init__(self, n_components=[0.001, 0.01, 0.1], C=[0.1, 1, 10], isNMF=True):

        self.svd = TruncatedSVD ()
        self.nmf = None
        self.n_components = n_components
        self.C   = C
        self.isNMF = isNMF
        return

    def fit (self, X, Y, **fit_params):
        """Select the SVD size by 5-fold CV, then fit SVD (and NMF) on ``X``. Returns ``self``."""

        print ('Find optimal SVD dims and the same no. of NMF features for X.shape =', X.shape)
        n_features = X.shape[1]
        candidates = [int (frac * n_features) for frac in self.n_components]
        candidates = [nc for nc in candidates if nc > 0]
        if not candidates:
            # Every fraction rounded down to zero (very narrow X): fall back to one component.
            candidates = [1]
        # if too many components then limit upto 3000 due to memory constraints
        if max (candidates) > 3000:
            candidates = [100, 800, 2000, 3000]
        self.candidate_n_components_ = candidates

        logistic = LogisticRegression (penalty="l2", class_weight='balanced')
        pipeline = Pipeline (steps=[('svd', self.svd), ('logistic', logistic)])

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'svd__n_components'       : candidates,
            'logistic__C'             : self.C,
            'logistic__fit_intercept' : [True, False]
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        gridSearchCV.fit (X, Y)  # the search works on clones; self.svd is not fitted by it
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)

        # Now use the optimal params to fit the SVD
        self.svd = TruncatedSVD (n_components=gridSearchCV.best_params_['svd__n_components'])
        self.svd.fit (X)
        svdDim = self.svd.transform (X[:2,:]).shape[1]
        print("SVD dimensionality, explainedVarRatio = ", svdDim, self.svd.explained_variance_ratio_.sum())

        # Reset first so a refit with isNMF=False does not keep a stale NMF model.
        self.nmf = None
        if self.isNMF :
            self.nmf = NMF (n_components=svdDim, random_state=1, alpha=.1, l1_ratio=.5)
            self.nmf.fit (X)
        return self


    def transform (self, X, y=None, **fit_params):
        """Return SVD features of ``X``; NMF features are appended only if NMF was fitted."""

        X_svd = self.svd.transform (X)
        if self.nmf is None:
            # isNMF=False: there is no NMF model to call.
            return X_svd
        X_nmf = self.nmf.transform (X)
        if X.ndim==1 :
            return np.concatenate ([X_svd, X_nmf])
        return np.hstack ([X_svd, X_nmf])

In [15]:
class ICA_FeatureTransformer (BaseEstimator, TransformerMixin):
    """FastICA features whose dimensionality is chosen by grid search (dense input).

    ``n_components`` holds *fractions of the feature count*. ``fit`` resolves them
    to integer candidates without overwriting the hyper-parameter (so refitting
    works) and exposes the resolved list as ``candidate_n_components_``.

    Parameters
    ----------
    n_components : list of float
        Fractions of ``X.shape[1]`` to try as the ICA output size.
    C : list
        Candidate ``C`` values for the logistic regression used for scoring.
    """

    def __init__(self, n_components=[0.30, 0.40, 0.50, 0.60], C=[0.1, 1, 10]):

        self.ica = FastICA (random_state=0, whiten=True)
        self.n_components = n_components
        self.C = C
        return

    def fit (self, X, Y, **fit_params):
        """Select the ICA size by 5-fold CV, then fit FastICA on densified ``X``. Returns ``self``."""

        print ('ICA_FeatureTransformer: type(X), X.shape =', type(X), X.shape)
        logistic = LogisticRegression (penalty="l2", class_weight='balanced')
        pipeline = Pipeline(steps=[('to_dense', DenseTransformer()), ('ica', self.ica), ('logistic', logistic)])

        n_features = X.shape[1]
        candidates = [int (frac * n_features) for frac in self.n_components]
        candidates = [nc for nc in candidates if nc > 0]
        if not candidates:
            candidates = [2]
        self.candidate_n_components_ = candidates

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'ica__n_components'       : candidates,
            'logistic__C'             : self.C,
            'logistic__fit_intercept' : [True, False]
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        gridSearchCV.fit (X, Y)  # the search works on clones; self.ica is not fitted by it
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)

        # Refit with the winning size. random_state matches the estimator that was
        # cross-validated so the final model is reproducible.
        self.ica = FastICA (n_components=gridSearchCV.best_params_['ica__n_components'],
                            random_state=0, whiten=True)
        if scipy.sparse.issparse (X):
            # Any sparse format -> plain ndarray (todense() would give numpy.matrix).
            X = X.toarray ()
        self.ica.fit (X)
        icaDim = self.ica.transform (X[:2,:]).shape[1]
        print ("ICA dimensionality, explainedVarRaio = ", icaDim, self.ica.n_components)
        return self


    def transform (self, X, y=None, **fit_params):
        """Return the ICA projection of ``X`` (densified first if sparse)."""

        if scipy.sparse.issparse (X):
            X = X.toarray ()
        return self.ica.transform (X)

In [None]:
class ICASparse_FeatureTransformer (BaseEstimator, TransformerMixin):
    """FastICA features sized by grid search; ``X`` is passed to FastICA as-is.

    Unlike ``ICA_FeatureTransformer`` no densification is performed here.
    NOTE(review): confirm that the installed FastICA accepts the container type
    callers actually pass.

    ``n_components`` holds *fractions of the feature count*. ``fit`` resolves them
    to integer candidates without overwriting the hyper-parameter (so refitting
    works) and exposes the resolved list as ``candidate_n_components_``.
    """

    def __init__(self, n_components=[0.30, 0.40, 0.50, 0.60], C=[0.1, 1, 10]):

        self.ica = FastICA (random_state=0, whiten=True)
        self.n_components = n_components
        self.C = C
        return

    def fit (self, X, Y, **fit_params):
        """Select the ICA size by 5-fold CV, then fit FastICA on ``X``. Returns ``self``."""

        # Log label names this class (it was copy-pasted from ICA_FeatureTransformer).
        print ('ICASparse_FeatureTransformer: type(X), X.shape =', type (X), X.shape)
        logistic = LogisticRegression (penalty="l2", class_weight='balanced')
        pipeline = Pipeline(steps=[('ica', self.ica), ('logistic', logistic)])

        n_features = X.shape[1]
        candidates = [int (frac * n_features) for frac in self.n_components]
        candidates = [nc for nc in candidates if nc > 0]
        if not candidates:
            candidates = [1]
        self.candidate_n_components_ = candidates

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'ica__n_components'       : candidates,
            'logistic__C'             : self.C,
            'logistic__fit_intercept' : [True, False]
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        gridSearchCV.fit (X, Y)  # the search works on clones; self.ica is not fitted by it
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)

        # Refit with the winning size. random_state matches the estimator that was
        # cross-validated so the final model is reproducible.
        self.ica = FastICA (n_components=gridSearchCV.best_params_['ica__n_components'],
                            random_state=0, whiten=True)
        self.ica.fit (X)
        icaDim = self.ica.transform (X[:2,:]).shape[1]
        print ("ICA dimensionality, explainedVarRaio = ", icaDim, self.ica.n_components)
        return self


    def transform (self, X, y=None, **fit_params):
        """Return the ICA projection of ``X``."""

        return self.ica.transform (X)

# Find K-best features based on Mutual-Info / F-Score / Chi^2
Use K-best feature selection followed by a logistic-regression classifier inside GridSearchCV to find the optimal value of K (and the best scoring function).

In [2]:
class SelectKBest_feature_selector (BaseEstimator, TransformerMixin):
    """K-best feature selection with ``k`` and the scoring function chosen by CV.

    ``fit`` grid-searches a SelectKBest -> LogisticRegression pipeline (5-fold CV)
    and then fits a SelectKBest with the winning settings.

    Parameters
    ----------
    n_components : list of float
        Fractions of the feature count to try as ``k``.
    score_func : list of callables (a single callable is also accepted)
        Candidate scoring functions.

    Attributes set by ``fit``
    -------------------------
    k : int
        Selected number of features.
    score_func_ : callable
        Selected scoring function.
    candidate_n_components_ : list of int
        The integer ``k`` candidates that were searched.
    shift_k : float
        Constant added to ``X`` before scoring.

    The hyper-parameters ``n_components`` and ``score_func`` are left untouched by
    ``fit`` so the estimator can be refitted.
    """

    def __init__(self, n_components=[0.6, 0.70, 0.80, 0.90, 0.95, 1.0], score_func=[chi2, mutual_info_classif]):

        self.score_func   = score_func
        self.n_components = n_components
        self.selectKBest  = SelectKBest()
        self.shift_k      = 0
        return

    def fit (self, X, Y, **fit_params):
        """Choose ``k`` / scoring function by CV and fit the final selector. Returns ``self``."""

        print ('Find K-best features based on Mutual-Info / Chi^2 for X.shape =', X.shape)
        # this works only when X >= 0, hence shift by a constant so that it is >=0
        # NOTE(review): the scalar addition presumably expects a dense X — confirm
        # that sparse input is never passed here.
        self.shift_k = np.abs (X.min()) * 1.5 + 1
        X = X + self.shift_k

        # convert fractions to int feature counts (kept local: do not clobber the hyper-parameter)
        candidates = (np.array (self.n_components) * X.shape[1]).astype (int).tolist ()
        candidates = [nc for nc in candidates if nc > 0]
        if not candidates:
            candidates = [1]
        self.candidate_n_components_ = candidates

        # The grid needs a list; tolerate a bare callable.
        score_funcs = [self.score_func] if callable (self.score_func) else list (self.score_func)

        logistic = LogisticRegression (penalty="l2", class_weight='balanced', C=0.1)
        pipeline = Pipeline (steps=[('selectKBest', self.selectKBest), ('logistic', logistic)])

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'selectKBest__k'          : candidates,
            'selectKBest__score_func' : score_funcs
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        gridSearchCV.fit (X, Y)  # the search works on clones; self.selectKBest is not fitted by it
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)

        # Now use the optimal params to fit the model
        self.k           = gridSearchCV.best_params_['selectKBest__k']
        self.score_func_ = gridSearchCV.best_params_['selectKBest__score_func']
        print ("SelectKBest: k, score_func =", self.k, self.score_func_)

        # Keyword arguments: `k` is keyword-only in recent scikit-learn releases.
        self.selectKBest = SelectKBest (score_func=self.score_func_, k=self.k)
        self.selectKBest.fit (X, Y)
        return self

    def transform (self, X, y=None, **fit_params):
        """Return the selected columns of ``X`` (no shift is applied here)."""

        return self.selectKBest.transform (X)

NameError: name 'BaseEstimator' is not defined

# Identify the useful sparse features with an L1-regularized linear SVM (analogous to Lasso)

In [34]:
class SparseSVM_feature_selector (BaseEstimator, TransformerMixin):
    """Feature selection with an L1-penalised LinearSVC whose ``C`` is chosen by CV.

    ``fit`` grid-searches a SelectFromModel(L1 LinearSVC) -> L2 LinearSVC pipeline
    (5-fold CV) over ``C`` and then fits a SelectFromModel with the winning value.

    Parameters
    ----------
    C : list of float (a single number is also accepted)
        Candidate regularisation strengths for the L1 selector.

    Attributes set by ``fit``
    -------------------------
    C_ : float
        Selected ``C``. The hyper-parameter ``C`` itself is left untouched so
        the estimator can be refitted.
    sfm : SelectFromModel
        The fitted selector used by ``transform``.
    """

    def __init__(self, C=[10.0, 100.0]): 

        self.C   = C
        self.sfm = None
        return

    def fit (self, X, Y, **fit_params):
        """Choose ``C`` by cross-validation and fit the final selector. Returns ``self``."""

        print ('LinearSVC with L1-based feature selection for X.shape =', X.shape)
        # The smaller C, the stronger the regularization.
        # The more regularization, the more sparsity.
        pipeline = Pipeline ([
          ('feature_selection', SelectFromModel (LinearSVC (penalty="l1", dual=False, tol=1e-3))),
          ('classification', LinearSVC(penalty="l2"))  ])

        # The grid needs a list; tolerate a scalar C.
        C_grid = np.atleast_1d (self.C).tolist ()

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'feature_selection__estimator__C' : C_grid
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        print ('X.shape =', X.shape,  'Y.shape =', Y.shape)
        gridSearchCV.fit (X, Y)  # the search works on clones of the pipeline
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)

        # Now use the optimal params to fit the model
        self.C_  = gridSearchCV.best_params_['feature_selection__estimator__C']
        print ("Best feature_selection__estimator__C =", self.C_)
        svc = LinearSVC (penalty="l1", dual=False, tol=1e-3, C=self.C_)
        self.sfm = SelectFromModel (svc)
        self.sfm.fit (X, Y)
        print ('New #features = ', self.sfm.transform (X[:2,:]).shape[1] )
        return self


    def transform (self, X, y=None, **fit_params):
        """Return the columns of ``X`` kept by the fitted selector."""

        return self.sfm.transform (X)

# Kernel PCA features
The KPCA does an unsupervised dimensionality reduction, while the logistic regression does the prediction. We use a GridSearchCV to set the dimensionality of the KPCA

In [36]:
class KPCA_FeatureTransformer (BaseEstimator, TransformerMixin):
    """Kernel-PCA features whose size and kernel are chosen by grid search (dense input).

    ``n_components`` holds fractions of the number of components produced by a
    probe ``KernelPCA(kernel='sigmoid')`` fit on ``X``. ``fit`` resolves them to
    integer candidates without overwriting the hyper-parameter (so refitting
    works) and exposes the resolved list as ``candidate_n_components_``.
    """

    def __init__(self, n_components=[0.70, 0.75, 0.77], kernel=['poly', 'rbf', 'cosine']):
        """
        A high value for n_components such as 0.80 leads to some errors sometimes. Hence don't exceed 0.77
        """

        self.kpca         = KernelPCA(remove_zero_eig=True)
        self.n_components = n_components
        self.kernel       = kernel 
        return

    def fit (self, X, Y, **fit_params):
        """Choose size/kernel by 5-fold CV, then fit KernelPCA on densified ``X``. Returns ``self``."""

        print ('Kernel PCA features for X.shape =', X.shape)
        # convert fractions to int feature count
        # NOTE: this probe is a full KernelPCA fit performed only to count components.
        probe_dim  = KernelPCA (kernel='sigmoid').fit_transform (X).shape[1]
        candidates = (np.array (self.n_components) * probe_dim).astype (int).tolist ()
        candidates = [nc for nc in candidates if nc > 0]
        if not candidates:
            candidates = [1]
        self.candidate_n_components_ = candidates

        logistic = LogisticRegression (penalty="l2", class_weight='balanced', C=0.1)
        pipeline = Pipeline (steps=[('to_dense', DenseTransformer()), ('kpca', self.kpca), ('logistic', logistic)])

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'kpca__n_components' : candidates,
            'kpca__kernel'       : self.kernel
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        gridSearchCV.fit (X, Y)  # the search works on clones; self.kpca is not fitted by it
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)

        # Now use the optimal params to fit the KPCA
        self.kpca = KernelPCA (n_components=gridSearchCV.best_params_['kpca__n_components'], 
                               kernel=gridSearchCV.best_params_['kpca__kernel'],
                               remove_zero_eig=True)
        if scipy.sparse.issparse (X):
            # Any sparse format -> plain ndarray (todense() would give numpy.matrix).
            X = X.toarray ()
        self.kpca.fit (X)
        return self


    def transform (self, X, y=None, **fit_params):
        """Return the kernel-PCA projection of ``X``; sparse input is densified first."""

        if scipy.sparse.issparse (X):
            X = X.toarray ()
        return self.kpca.transform (X)

In [None]:
class KPCASparse_FeatureTransformer (BaseEstimator, TransformerMixin):
    """Kernel-PCA features sized by grid search; ``X`` is passed to KernelPCA as-is.

    ``n_components`` holds fractions of the number of components produced by a
    probe ``KernelPCA(kernel='sigmoid')`` fit on ``X``. ``fit`` resolves them to
    integer candidates without overwriting the hyper-parameter (so refitting
    works) and exposes the resolved list as ``candidate_n_components_``.
    """

    def __init__(self, n_components=[0.70, 0.75, 0.77], kernel=['poly', 'rbf', 'cosine']):
        """
        A high value for n_components such as 0.80 leads to some errors sometimes. Hence don't exceed 0.77
        """

        self.kpca         = KernelPCA(remove_zero_eig=True)
        self.n_components = n_components
        self.kernel       = kernel 
        return

    def fit (self, X, Y, **fit_params):
        """Choose size/kernel by 5-fold CV, then fit KernelPCA on ``X``. Returns ``self``."""

        print ('Kernel PCA features for X.shape =', X.shape)
        # convert fractions to int feature count
        # NOTE: this probe is a full KernelPCA fit performed only to count components.
        probe_dim  = KernelPCA (kernel='sigmoid').fit_transform (X).shape[1]
        candidates = (np.array (self.n_components) * probe_dim).astype (int).tolist ()
        candidates = [nc for nc in candidates if nc > 0]
        if not candidates:
            candidates = [1]
        self.candidate_n_components_ = candidates

        logistic = LogisticRegression (penalty="l2", class_weight='balanced', C=0.1)
        pipeline = Pipeline (steps=[('kpca', self.kpca), ('logistic', logistic)])

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'kpca__n_components' : candidates,
            'kpca__kernel'       : self.kernel
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        gridSearchCV.fit (X, Y)  # the search works on clones; self.kpca is not fitted by it
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)

        # Now use the optimal params to fit the KPCA
        self.kpca = KernelPCA (n_components=gridSearchCV.best_params_['kpca__n_components'], 
                               kernel=gridSearchCV.best_params_['kpca__kernel'],
                               remove_zero_eig=True)
        self.kpca.fit (X)
        return self


    def transform (self, X, y=None, **fit_params):
        """
        X = sparse matrix

        Returns the kernel-PCA projection of ``X``.
        """

        return self.kpca.transform (X)

# Latent Dirichlet Allocation Features
The LDA does an unsupervised dimensionality reduction, while the logistic regression does the prediction. We use a GridSearchCV to set the dimensionality of the LDA

In [41]:
class LDA_FeatureTransformer (BaseEstimator, TransformerMixin):
    """Latent Dirichlet Allocation features whose topic count is chosen by grid search.

    ``n_components`` holds fractions of ``min(n_samples, n_features)``. ``fit``
    resolves them to integer candidates without overwriting the hyper-parameter
    (so refitting works) and exposes the resolved list as
    ``candidate_n_components_``.
    """

    def __init__(self, n_components=[0.01, 0.1, 0.5, 1.0]):

        self.lda          = LatentDirichletAllocation(max_iter=5,
                                learning_method='online', learning_offset=50., random_state=0)
        self.n_components = n_components
        return

    def fit (self, X, Y, **fit_params):
        """Choose the topic count by 5-fold CV, then fit LDA on ``X``. Returns ``self``."""

        print ('LDA Features for X.shape =', X.shape)
        # convert fractions to int feature count
        upper      = min (X.shape[0], X.shape[1])
        candidates = (np.array (self.n_components) * upper).astype (int).tolist ()
        candidates = [nc for nc in candidates if nc > 0]
        if not candidates:
            candidates = [1]
        self.candidate_n_components_ = candidates

        logistic = LogisticRegression (penalty="l2", class_weight='balanced', C=0.1)
        pipeline = Pipeline (steps=[('LDA', self.lda), ('logistic', logistic)])

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'LDA__n_components' : candidates
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        gridSearchCV.fit (X, Y)  # the search works on clones; self.lda is not fitted by it
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)

        # Now use the optimal params to fit the LDA
        best_n   = gridSearchCV.best_params_['LDA__n_components']
        self.lda = LatentDirichletAllocation (n_components=best_n, max_iter=5,
                                learning_method='online', learning_offset=50., random_state=0)
        self.lda.fit (X) 
        return self


    def transform (self, X, y=None, **fit_params):
        """Return the LDA transform of ``X``."""

        return self.lda.transform (X)

# Feature Selection based on clf.feature_importances_ with CV on threshold
clf = RandomForest / Xgboost / CatBoost
we use CV to determine threshold for SelectFromModel( rf_clf, threshold=? )

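@param threshold: list of candidate importance thresholds to cross-validate. Note: the class below always builds its own `RandomForestClassifier` and fits it inside `fit`; it does not accept a pre-fitted clf.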

In [43]:
class CLF_importance_feature_selector (BaseEstimator, TransformerMixin):
    """Importance-based feature selection with the threshold chosen by CV.

    A ``RandomForestClassifier`` supplies the importances. ``fit`` grid-searches
    a SelectFromModel -> LinearSVC pipeline (5-fold CV) over ``threshold`` and
    then fits a SelectFromModel with the winning threshold.

    Parameters
    ----------
    threshold : list of float
        Candidate values for ``SelectFromModel(threshold=...)``.
    """

    def __init__(self, threshold=[0.0005, 0.001, 0.005]):

        self.clf             = RandomForestClassifier ()
        self.selectFromModel = None
        self.threshold       = threshold
        return

    def fit (self, X, Y, **fit_params):       
        """Choose the threshold by cross-validation and fit the final selector. Returns ``self``."""

        print ('Feature Selection based on rf.feature_importances_ for X.shape =', X.shape)
        pipeline = Pipeline ([
          ('feature_selection', SelectFromModel (self.clf)),
          ('classification', LinearSVC(penalty="l2"))  ])

        # Parameters of pipelines can be set using '__' separated parameter names:
        param_grid = {
            'feature_selection__threshold' : self.threshold
        }

        gridSearchCV = GridSearchCV (pipeline, param_grid, iid=False, cv=5)
        gridSearchCV.fit (X, Y)  
        print ("Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
        print (gridSearchCV.best_params_)        

        # self.clf is never fitted directly in this class; SelectFromModel.fit
        # below trains the estimator it wraps.
        # `threshold` is passed by keyword: it is keyword-only in recent scikit-learn releases.
        best_threshold = gridSearchCV.best_params_['feature_selection__threshold']
        self.selectFromModel = SelectFromModel (self.clf, threshold=best_threshold)
        self.selectFromModel.fit (X, Y)
        clfDim = self.selectFromModel.transform (X[:2,:]).shape[1]        
        print ('clfDim = ', clfDim)                
        return self

    def transform (self, X, y=None, **fit_params):
        """Return the columns of ``X`` kept by the fitted selector."""

        return self.selectFromModel.transform (X)

# Ada-Boosted Classifiers from a base clf after CV over boosting params

# ab_params     = { 'n_estimators'  :  50 }
# ab_param_grid = { 'learning_rate' : [0.5, 0.75, 1.0] }

def get_clf_adaBoosted_cv (X, Y, clf, params=None, param_grid=None):
    """Wrap ``clf`` in an AdaBoostClassifier, optionally tuned by 5-fold grid search.

    ``params`` (if truthy) are applied with ``set_params``. When ``param_grid`` is
    truthy the booster is grid-searched on ``(X, Y)``, the best estimator is
    benchmarked into the global ``RESULTS`` list and returned; otherwise the
    unfitted booster is returned.
    """
    global RESULTS

    boosted = AdaBoostClassifier(base_estimator=clf)
    if params:
        boosted.set_params(**params)

    # Nothing to tune: hand back the configured (unfitted) booster.
    if not param_grid:
        return boosted

    search = GridSearchCV (boosted, param_grid, iid=False, cv=5)
    search.fit (X, Y)
    print ("get_clf_adaBoosted_cv(): Best parameter (CV score=%0.3f):" % search.best_score_)
    print (search.best_params_)
    boosted = search.best_estimator_
    RESULTS.append(benchmark(search.best_estimator_))
    # TODO: Plot scores for each split, and get its' variance
    return boosted

# Get optimal Classifier after CV

In [None]:
class Best_clf_cv_transformer (BaseEstimator, TransformerMixin): 
    """Wrapper that builds a classifier by short name and optionally tunes it by grid search.

    Parameters
    ----------
    params : dict
        ``'name'``       (required) short name of the classifier, see ``_make_clf``.
        ``'cv'``         (optional) number of CV folds, default 5.
        ``'params'``     (optional) dict applied to the classifier via ``set_params``.
        ``'param_grid'`` (optional) grid for ``GridSearchCV``; without it the
                         classifier is simply fitted.

    Most methods delegate to the wrapped classifier ``self.clf``. Note that
    ``get_params`` / ``set_params`` also delegate, i.e. they expose the wrapped
    classifier's parameters rather than this wrapper's constructor arguments.
    """

    @staticmethod
    def _make_clf (name):
        """Instantiate the default classifier registered under ``name``.

        Raises ``ValueError`` for an unknown name instead of silently continuing
        with ``None``.
        """
        # Factories are lazy so only the requested classifier is constructed.
        factories = {
            'Logit'     : lambda: LogisticRegression (),
            'DT'        : lambda: DecisionTreeClassifier (),
            'RidgClf'   : lambda: RidgeClassifier (),
            'Prcpt'     : lambda: Perceptron (),
            'PssAggClf' : lambda: PassiveAggressiveClassifier (),
            'Knn'       : lambda: KNeighborsClassifier (),
            'RF'        : lambda: RandomForestClassifier (),
            'NearCent'  : lambda: NearestCentroid (),
            'MultNB'    : lambda: MultinomialNB (),
            'BernNB'    : lambda: BernoulliNB (),
            'Svc'       : lambda: SVC (probability=True),
            'LSvc'      : lambda: LinearSVC (),
            'Xgb'       : lambda: xgb.XGBClassifier (),   # XGBRFClassifier()
            'Catb'      : lambda: CatBoostClassifier (),  # issues with CV
        }
        if name not in factories:
            raise ValueError ("Best_clf_cv_transformer: invalid params['name'] %r; expected one of %s"
                              % (name, sorted (factories)))
        return factories[name] ()

    def __init__(self, params):

        self.cv = int (params['cv']) if 'cv' in params else 5
        name    = params['name']
        clf     = self._make_clf (name)

        if 'params' in params:
            clf.set_params(**params['params'])
        self.param_grid = params['param_grid'] if 'param_grid' in params else None
        self.clf = clf
        self.cv_score = 0
        self.name = name
        return


    def fit (self, X, Y):
        """Fit the wrapped classifier, grid-searching it when a ``param_grid`` was given.

        With a grid, ``self.clf`` becomes the best estimator and ``cv_score`` the
        best CV score. Without a grid, ``cv_score`` is the accuracy measured on
        the training data itself.
        """

        print ('training', self.name, 'for X.shape =', X.shape)
        if self.param_grid:

            gridSearchCV = GridSearchCV (self.clf, self.param_grid, iid=False, cv=self.cv)
            gridSearchCV.fit (X, Y)  
            print (self.name, ": get_best_clf_cv(): Best parameter (CV score=%0.3f):" % gridSearchCV.best_score_)
            print (gridSearchCV.best_params_)
            self.clf = gridSearchCV.best_estimator_
            self.cv_score = gridSearchCV.best_score_
            # TODO: Plot scores for each split, and get its' variance
        else:

            self.clf.fit (X,Y) 
            predY = self.clf.predict (X)
            self.cv_score = metrics.accuracy_score (Y, predY)
            print(self.name, ": accuracy:   %0.3f" % self.cv_score)
        return self

    def get_cv_score (self):
        """Return the score recorded by the last ``fit`` (0 before fitting)."""
        return self.cv_score

    # --- thin delegations to the wrapped classifier -------------------------

    def transform (self, X, Y=None, **fit_params):
        return self.clf.transform(X, Y)

    def predict (self, X, **fit_params):
        return self.clf.predict(X)

    def predict_proba (self, X):
        return self.clf.predict_proba (X)

    def predict_log_proba (self, X):
        return self.clf.predict_log_proba (X)

    def score (self, X, Y, **fit_params):
        return self.clf.score(X, Y)

    def decision_function (self, X, **fit_params):
        return self.clf.decision_function (X)

    def set_params (self, **params):
        # Delegates: sets parameters on (and returns) the wrapped classifier.
        return self.clf.set_params(**params)

    def get_params(self, deep=None):
        # Delegates: returns the wrapped classifier's parameters.
        return self.clf.get_params(deep)

    def apply (self, X):
        return self.clf.apply(X)

    def decision_path (self, X):
        return self.clf.decision_path (X)

    def staged_decision_function (self, X):
        return self.clf.staged_decision_function (X)

    def staged_predict (self, X):
        return self.clf.staged_predict (X)

    def staged_predict_proba (self, X):
        return self.clf.staged_predict_proba (X)

    def staged_score (self, X, Y=None):
        # Y is optional for backward compatibility, but staged scoring needs the
        # true labels, so callers should supply it.
        return self.clf.staged_score (X, Y)

# Get Bagging Classifier from a base clf, after CV on bagging params

# params     = { 'max_samples'  : 1.0,  'n_estimators' : 10 }
# param_grid = { 'max_features' : [0.7, 0.8, 0.9, 1.0] }

def get_bagging_clf_cv (X, Y, clf, params=None, param_grid=None): 
    """Wrap ``clf`` in a BaggingClassifier, optionally tuned by 5-fold grid search.

    ``params`` (if truthy) are applied with ``set_params``. When ``param_grid`` is
    truthy the bagger is grid-searched on ``(X, Y)``, the best estimator is
    benchmarked into the global ``RESULTS`` list and returned; otherwise the
    unfitted bagger is returned.
    """
    global RESULTS

    bagger = BaggingClassifier(base_estimator=clf)
    if params:
        bagger.set_params(**params)

    # Nothing to tune: hand back the configured (unfitted) bagger.
    if not param_grid:
        return bagger

    search = GridSearchCV (bagger, param_grid, iid=False, cv=5)
    search.fit (X, Y)
    print ("get_bagging_clf_cv(): Best parameter (CV score=%0.3f):" % search.best_score_)
    print (search.best_params_)
    bagger = search.best_estimator_
    RESULTS.append(benchmark(search.best_estimator_))
    # TODO: Plot scores for each split, and get its' variance
    return bagger

def getBaggedXGB_RESULTS(Xtrain, Xtest, Ytrain, num_class=9, num_round=100, n_bags=2, random_state=None):
    """Average the class probabilities of several XGBoost models trained on resampled data.

    For each bag, the training indices are shuffled once and then extended with
    7x as many indices drawn with replacement; a booster is trained on that
    sample and its predictions for ``Xtest`` are accumulated.

    Parameters
    ----------
    Xtrain, Ytrain : training features / labels, indexable by a list of row indices.
    Xtest : test features; must expose ``.shape``.
    num_class : int, default 9
        Number of classes (``num_class`` for xgboost and width of the result).
    num_round : int, default 100
        Boosting rounds per bag.
    n_bags : int, default 2
        Number of bagged models to average.
    random_state : int or None, default None
        Seed for the bootstrap sampling. ``None`` keeps the previous behaviour of
        drawing from the global ``random`` module state.

    Returns
    -------
    numpy.ndarray of shape ``(Xtest.shape[0], num_class)`` with the mean prediction.

    Raises
    ------
    ValueError
        If ``n_bags`` or ``num_class`` is smaller than 1.
    """
    if n_bags < 1:
        # Averaging over zero bags would divide by zero.
        raise ValueError("n_bags must be >= 1, got %r" % (n_bags,))
    if num_class < 1:
        raise ValueError("num_class must be >= 1, got %r" % (num_class,))

    rng = random if random_state is None else random.Random(random_state)

    # NOTE(review): the 'bst:'-prefixed keys follow early xgboost examples —
    # verify that the installed xgboost version still honours them.
    param = {
        'booster'              : 'gbtree',
        'objective'            : 'multi:softprob',
        'num_class'            : num_class,
        'eval_metric'          : 'logloss',
        'scale_pos_weight'     : 1.0,
        'bst:eta'              : 0.3,
        'bst:max_depth'        : 6,
        'bst:colsample_bytree' : 0.5,
        'silent'               : 1,
        'nthread'              : 16,
    }
    watchlist = []

    time0 = time()
    idxTrain = range(len(Ytrain))
    n_train  = len(idxTrain)
    Ytestxg  = np.zeros((Xtest.shape[0], num_class))
    # The test matrix does not change between bags, so build it once.
    Xdatatest = xgb.DMatrix(data = Xtest)

    for bg in range(n_bags):
        param['seed'] = bg + 1
        plst = list(param.items())

        # One full permutation of the rows plus 7x rows drawn with replacement.
        newidxTrain = rng.sample(idxTrain, n_train)
        newidxTrain.extend(rng.choice(idxTrain) for _ in range(n_train * 7))

        Xdatatrain = xgb.DMatrix(data = Xtrain[newidxTrain], label = Ytrain[newidxTrain])

        bst = xgb.train(plst, Xdatatrain, num_round, watchlist)

        curpred = bst.predict(Xdatatest).reshape((Xtest.shape[0], num_class))
        Ytestxg += curpred

        print (bg, (time() - time0) / 60.)

    Ytestxg /= n_bags
    return Ytestxg

# Run the bagged XGBoost ensemble and display the averaged predictions.
# NOTE(review): X_TRAIN, X_TEST and Y_TRAIN are not defined in the visible part
# of this notebook — confirm they are created before this cell runs.
Ytestxg = getBaggedXGB_RESULTS(X_TRAIN, X_TEST, Y_TRAIN)
Ytestxg

In [2]:
# Scratch check: scale each fraction by 10 and truncate with int().
[int (i*10) for i in [0.1, 0.2, 0.5]]

[1, 2, 5]