# In [1]:
# helper modules
from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split as tts

# In [None]:
class SBS:
    """Sequential Backward Selection (SBS) of feature columns.

    Starting from all columns of ``X``, each round scores every subset that
    drops exactly one of the currently selected columns and keeps the
    best-scoring subset.  Rounds repeat until ``k_features`` columns remain.
    Scores are computed on a hold-out split made once at the start of ``fit``.

    Parameters
    ----------
    estimator : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.  It is cloned,
        so the object passed in is never fitted by this class.
    k_features : int
        Number of columns to keep.  Must be at least 1.  If it is not
        smaller than the number of columns in ``X``, no column is removed.
    scoring : callable, default=accuracy_score
        Called as ``scoring(y_true, y_pred)``; larger values are treated as
        better.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default=0
        Forwarded to ``train_test_split`` as ``random_state``.

    Attributes (set by ``fit``)
    ---------------------------
    indices_ : tuple of int
        Column indices of the finally selected subset.
    subsets_ : list of tuple
        The full column set followed by the subset kept after each round.
    scores_ : list
        Hold-out score of each entry of ``subsets_`` (same length/order).
    dim_ : list of int
        Number of columns at the *start* of each elimination round.  It has
        one entry fewer than ``scores_`` and does not contain the final size.
    k_score_ : float
        Score of the finally selected subset (``scores_[-1]``).  Also
        available under the legacy name ``_k_score``.
    """

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.2, random_state=0):
        # Work on a copy so the caller's estimator stays untouched.
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.scoring = scoring
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, Y):
        """Run the backward elimination and record the selected columns.

        Parameters
        ----------
        X : array-like with ``.shape``, indexable as ``X[:, columns]``
            Feature matrix, one row per sample.
        Y : array-like with ``.shape``
            Targets, one entry per row of ``X``.  Also used to stratify the
            train/test split.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``Y`` have a different number of rows, or if
            ``k_features`` is smaller than 1.

        Notes
        -----
        ``self.estimator`` is re-fitted for every candidate subset, so after
        ``fit`` it is left trained on the last subset that was *scored*,
        which is not necessarily the subset in ``indices_``.
        """
        # Explicit exceptions instead of ``assert``: asserts vanish under -O.
        if X.shape[0] != Y.shape[0]:
            raise ValueError(
                "X and Y must have the same number of rows, got "
                f"{X.shape[0]} and {Y.shape[0]}"
            )
        if self.k_features < 1:
            # Otherwise the loop below would reach an empty feature subset
            # and fail inside the estimator with a far less helpful message.
            raise ValueError(
                f"k_features must be at least 1, got {self.k_features}"
            )

        # Single hold-out split, reused to score every candidate subset.
        X_train, X_test, y_train, y_test = tts(
            X, Y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=Y,
        )

        dim = X_train.shape[1]  # current number of selected columns
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        self.scores_ = [
            self._return_score(X_train, y_train, X_test, y_test, self.indices_)
        ]
        self.dim_ = []

        while dim > self.k_features:
            self.dim_.append(dim)

            # Every way of dropping exactly one currently selected column.
            subsets = list(combinations(self.indices_, r=dim - 1))
            scores = [
                self._return_score(X_train, y_train, X_test, y_test, subset)
                for subset in subsets
            ]

            # np.argmax returns the first maximum, so ties are resolved in
            # favour of the earliest candidate produced by combinations().
            best = int(np.argmax(scores))
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            self.scores_.append(scores[best])
            dim -= 1

        # Set outside the loop so the attribute exists even when no column
        # had to be removed (k_features >= number of columns).
        self.k_score_ = self.scores_[-1]
        self._k_score = self.k_score_  # legacy name kept for existing callers
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected by ``fit``.

        Requires ``fit`` to have been called (reads ``self.indices_``).
        """
        return X[:, self.indices_]

    def _return_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit on the given columns of the training data and score on test.

        ``self.estimator`` is fitted in place on ``X_train[:, indices]``;
        the return value is ``self.scoring(y_test, y_pred)`` where ``y_pred``
        are its predictions for ``X_test[:, indices]``.
        """
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        return self.scoring(y_test, y_pred)