# In [1]:
class FeatureSelector():
    """Greedy forward selector over whole features and individual feature values.

    Every candidate is first scored on its own with cross-validation, then
    candidates are added one by one (best individual score first) and kept
    only if they improve the cross-validated score of the accumulated set.

    Parameters:
        estimator: model handed to ``cross_validate`` for every evaluation.
        cv: validation scheme, forwarded as ``cv=``.
        metric: optimised metric, forwarded as ``scoring=``.
        use_recursion: if truthy, rejected candidates are retried in
            additional passes until a pass accepts nothing new.
        fill_value: value written over NaN entries of ``X``; it is also the
            placeholder that replaces non-selected values of a
            value-selected column.
        use_value: ``None``, or a container of column indices whose distinct
            values are evaluated as separate candidates instead of the
            column as a whole.
        show_progress: if truthy, print every new best score.
        baseline_score: score the first accepted candidate has to beat.
            Defaults to 0.5, the value that used to be hard-coded.

    Attributes set by ``fit``:
        best_features: list of column indices kept as whole features.
        D_selected: ``defaultdict(list)`` mapping column index to the list
            of values kept for that column.
        best_score: best cross-validated score reached.
    """

    def __init__(self, estimator, cv, metric, use_recursion, fill_value, use_value,
                 show_progress, baseline_score=.5):
        self.estimator = estimator
        self.cv = cv
        self.use_recursion = use_recursion
        self.metric = metric
        self.fill_value = fill_value
        self.use_value = use_value
        self.show_progress = show_progress
        self.baseline_score = baseline_score
        self._best_score = None

    @property
    def best_score(self):
        """Best cross-validated score found by ``fit`` (``None`` before fitting).

        Exposed as a property: the previous ``best_score()`` method was
        shadowed by the attribute of the same name that ``fit`` assigned, so
        plain attribute access was the only usage that ever worked.
        """
        return getattr(self, '_best_score', None)

    @best_score.setter
    def best_score(self, value):
        self._best_score = value

    def _fill_missing(self, X):
        """Overwrite NaN entries of ``X`` in place with ``fill_value`` (best effort)."""
        try:
            X[np.isnan(X)] = self.fill_value
        except (TypeError, ValueError):
            # e.g. np.isnan is not defined for the dtype, or the array is
            # read-only: leave X untouched, as the original code did.
            pass

    def _cv_score(self, features, y):
        """Return the mean cross-validated test score of the estimator on ``features``."""
        result = cross_validate(self.estimator, features, y,
                                scoring=self.metric, cv=self.cv)
        return result['test_score'].mean()

    def _design_matrix(self, X, features, selected):
        """Assemble the matrix made of whole features and value-filtered columns.

        Parameters:
            X: 2-D array indexed as ``X[:, columns]``.
            features: column indices taken as they are.
            selected: mapping column index -> kept values; every other value
                of that column is replaced by ``fill_value``. Columns with no
                kept values are skipped.

        Returns:
            ``csc_matrix`` when both kinds of columns are present (whole
            features first), a dense array when only one kind is present,
            and an array with zero columns when nothing is selected.
        """
        plain = X[:, list(features)] if len(features) > 0 else None

        encoded_columns = []
        for feat, values in selected.items():
            if len(values) == 0:
                continue
            column = X[:, feat]
            encoded_columns.append(
                np.where(np.isin(column, values), column, self.fill_value))
        encoded = np.column_stack(encoded_columns) if encoded_columns else None

        if plain is not None and encoded is not None:
            return csc_matrix(np.hstack([plain, encoded]))
        if plain is not None:
            return plain
        if encoded is not None:
            return encoded
        return X[:, []]

    def _greedy_pass(self, X, y, candidates, best_features, selected, best_score):
        """Try each candidate once on top of the currently accepted set.

        A trial matrix is rebuilt from the accepted state plus the candidate
        for every evaluation, so a rejected candidate can never leak into
        later trials.

        Returns:
            ``(rejected, best_features, selected, best_score)`` where
            ``rejected`` lists the ``(column, value)`` candidates that did
            not improve the score.
        """
        rejected = []
        for column, value in tqdm_notebook(candidates):
            trial_features = list(best_features)
            trial_selected = {feat: list(vals) for feat, vals in selected.items()}
            if value is None:
                trial_features.append(column)
            else:
                trial_selected.setdefault(column, []).append(value)

            current_score = self._cv_score(
                self._design_matrix(X, trial_features, trial_selected), y)

            if current_score > best_score:
                best_score = current_score
                best_features, selected = trial_features, trial_selected
                if self.show_progress:
                    print('new best score = {}'.format(best_score))
            else:
                rejected.append((column, value))
        return rejected, best_features, selected, best_score

    def fit(self, X, y):
        """Select features and feature values.

        1) every feature (or every distinct value of the columns listed in
           ``use_value``) is cross-validated on its own;
        2) candidates are added greedily in decreasing order of that score;
        3) if ``use_recursion`` is set, rejected candidates are retried until
           a pass accepts nothing new.

        NaN entries of ``X`` are replaced in place by ``fill_value``.
        Returns ``self``.
        """
        self._fill_missing(X)

        # Stage 1: individual score of every candidate.
        scores = []
        for col_idx in tqdm_notebook(range(X.shape[1])):
            series = X[:, col_idx]
            if self.use_value is not None and col_idx in self.use_value:
                unique_values = np.unique(series[~np.isnan(series)])
                for val in tqdm_notebook(unique_values):
                    indicator = (series == val).astype(int).reshape(-1, 1)
                    scores.append((col_idx, val, self._cv_score(indicator, y)))
            else:
                scores.append((col_idx, None,
                               self._cv_score(series.reshape(-1, 1), y)))

        ranked = sorted(scores, key=lambda row: row[-1], reverse=True)
        order = [(col_idx, val) for col_idx, val, _ in ranked]

        # Stage 2: greedy forward pass.
        rejected, best_features, selected, best_score = self._greedy_pass(
            X, y, order, [], {}, self.baseline_score)

        # Stage 3: retry rejected candidates until nothing new is accepted.
        if self.use_recursion:
            while rejected:
                still_rejected, best_features, selected, best_score = self._greedy_pass(
                    X, y, rejected, best_features, selected, best_score)
                if len(still_rejected) == len(rejected):
                    break
                rejected = still_rejected

        self.D_selected = defaultdict(list, selected)
        self.best_features = best_features
        self.best_score = best_score
        return self

    def transform(self, X):
        """Keep only the selected features and feature values of ``X``.

        NaN entries of ``X`` are replaced in place by ``fill_value``; values
        of a value-selected column that were not kept are replaced by
        ``fill_value`` in the output. See ``_design_matrix`` for the return
        type.
        """
        self._fill_missing(X)
        return self._design_matrix(X, self.best_features, self.D_selected)