<a id='toc'></a>
* [1. FeatureBase](#feature_base)
    * [1.2 Tests](#fb_tests)
* [2. NumericalFeature](#numerical_feature)
* [3. AggregatedFeature](#aggregated_feature)
* [4. CategoricalFeature](#categorical_feature)
* [5. CategoricalCombiner](#categorical_combiner)
* [6. FeaturesStorage](#features_storage)

In [1]:
import sys
import numpy as np
import pandas as pd
import scipy
_add_to_path = True

In [6]:
if _add_to_path:
    sys.path.append('../')
from ml.feature import *

# Name prefixes keyed by feature kind. CategoricalFeature._label_encode skips
# the renaming step when the 'LE' prefix is empty, and the filtered-name check
# in the same method expects the 'FIL' prefix followed by the threshold.
FEATURE_PREFIXES = \
{'CAT': '',    # Categorical feature
 'NUM': '',    # presumably numerical feature -- TODO confirm against ml.feature
 'LE' : '',    # LabelEncoded feature
 'OHE': 'Ohe', # OneHotEncoded feature
 'CTR': 'Ctr', # Counter feature
 'LOO': 'Loo', # LeaveOneOut feature
 'FIL': 'Fil'} # Filtered feature (rare categories merged into one)

def print_columns(*args):
    """
    Print labelled sequences as aligned rows, one row per (label, values) pair.

    Every value is left-justified to the width of the longest value over all
    rows, so equal positions line up in columns. Labels are padded to the
    longest label and followed by ':'; the label column is omitted when every
    label is an empty string.

    Arguments:
        :param args - (label, values) pairs; label is a str, values is a
                      sequence (possibly empty) of objects printable via str()
    """
    label_width = 0
    value_width = 0
    for label, values in args:
        label_width = max(label_width, len(label))
        # default=0 keeps an empty column from raising ValueError inside max()
        value_width = max(value_width, max((len(str(v)) for v in values), default=0))
    for label, values in args:
        cells = []
        if label_width > 0:
            cells.append(label.ljust(label_width) + ':')
        cells.extend(str(v).ljust(value_width) for v in values)
        print(' '.join(cells))

In [7]:
# Show the class docstring of FeatureKernel (presumably provided by the
# star import from ml.feature above -- it is not defined in this notebook).
print(FeatureKernel.__doc__)


    FeatureKernel - класс, реализующий общий функционал класса FeatureBase. 
    Является одним из аттрибутов экземпляров класса FeatureBase (SparseFeatureBase и DenseFeatureBase).
    Реализуемые операции включают в себя: 
        1) проверку корректности значений признака 
        2) вывод сообщений о некорректности значений
        3) предобработку и постобработку признаков
        4) получение характеристик признаков (размера, формата и т.п.)
    


<a id='feature_base'></a>
## 1. FeatureBase [[toc](#toc)] [[up](#toc)] [[down](#fb_tests)]

In [4]:
%run test_feature_base.py

.....
----------------------------------------------------------------------
Ran 5 tests in 0.019s

OK


<a id='fb_tests'></a>
### 1.2 Tests<sup>[toc](#toc)</sup>

In [5]:
# Build the same FeatureBase from sparse and from dense input and print
# its public/private values, both representations, shape, name and type flag.
for sparse in (True, False):
    values = np.array([0, 1, 0, 0, 0, 1, 1.1, 1, 0, 4, 1, 7, 0, 0, 0])
    if sparse:
        values = csc_matrix(values)
        values.eliminate_zeros()
    name = 'f'
    feature = FeatureBase(values, name)

    fvalues = feature.values
    _fvalues = feature._values
    # Sparse storage is shown in CSR form, dense storage as a flat array
    as_printable = (lambda m: m.tocsr()) if sparse else (lambda m: m.flatten())
    # The dense run is visually separated from the sparse one by blank lines
    print('SPARSE =' if sparse else '\n\n\nSPARSE =', sparse)
    print('str(feature) =', feature)
    print('feature.values.shape = {}\nfeature.values = {}'.format(fvalues.shape, as_printable(fvalues)))
    print('feature._values.shape = {}\nfeature._values = {}'.format(_fvalues.shape, as_printable(_fvalues)))
    print('[Dense ]:', feature.get_values(sparse=False).flatten())
    print('[Sparse]:', feature.get_values(sparse=True).tocsr())
    print('feature.shape = {}, feature.name = {}'.format(feature.shape, feature.name))
    print('is_numeric = ', feature.is_numeric())

SPARSE = True
str(feature) = [FeatureBase: f, (1, 15)]
feature.values.shape = (15, 1)
feature.values =   (1, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.1
  (7, 0)	1.0
  (9, 0)	4.0
  (10, 0)	1.0
  (11, 0)	7.0
feature._values.shape = (1, 15)
feature._values =   (0, 1)	1.0
  (0, 5)	1.0
  (0, 6)	1.1
  (0, 7)	1.0
  (0, 9)	4.0
  (0, 10)	1.0
  (0, 11)	7.0
[Dense ]: [[ 0.   1.   0.   0.   0.   1.   1.1  1.   0.   4.   1.   7.   0.   0.   0. ]]
[Sparse]:   (1, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.1
  (7, 0)	1.0
  (9, 0)	4.0
  (10, 0)	1.0
  (11, 0)	7.0
feature.shape = (1, 15), feature.name = f
is_numeric =  True



SPARSE = False
str(feature) = [FeatureBase: f, (15,)]
feature.values.shape = (15, 1)
feature.values = [ 0.   1.   0.   0.   0.   1.   1.1  1.   0.   4.   1.   7.   0.   0.   0. ]
feature._values.shape = (15,)
feature._values = [ 0.   1.   0.   0.   0.   1.   1.1  1.   0.   4.   1.   7.   0.   0.   0. ]
[Dense ]: [ 0.   1.   0.   0.   0.   1.   1.1  1.   0.   4.   1.   7.   0.   0.   0. ]
[Sparse]:   (1

<a id='numerical_feature'></a>
## 2. NumericalFeature<sup>[toc](#toc)</sup>

In [6]:
%run test_numerical_feature.py

.....
----------------------------------------------------------------------
Ran 5 tests in 0.021s

OK


<a id='aggregated_feature'></a>
## 3. AggregatedFeature<sup>[toc](#toc)</sup> <sup>[down](#categorical_feature)</sup>

In [16]:
class AggregatedFeature(Checker):
    """
    Stores the values of several features under one name, for example the
    one-hot-encoded columns of a categorical feature.
    """
    DELETE_FEATURE = 4

    def __init__(self, features, name, copy=True, verbose=0, treat_const='none'):
        """
        Arguments:
            :param features    - list of FeatureBase objects
            :param name        - name of the aggregated feature
            :param copy        - if True, every feature is deep-copied before being stored
            :param verbose     - verbosity level (nonnegative int)
            :param treat_const - what to do with constant sub-features:
                                 'none' keeps them, 'delete' drops them,
                                 'assert' raises ValueError when one is found
        """
        self._set_treat_const(treat_const)
        self._set_features(features, name, copy)
        self._verbose = verbose

    def _set_treat_const(self, treat_const):
        """
        Validates and stores the policy for constant sub-features.
        Raises ValueError for anything other than 'none', 'delete' or 'assert'.
        """
        treat_vals = ['none', 'delete', 'assert']
        if treat_const not in treat_vals:
            raise ValueError('treat_const must be one of the following: {}.'.format(treat_vals))
        self._treat_const = treat_const

    def _set_features(self, features, name, copy=True):
        """
        Validates the sub-features and stores them keyed by their names.
            :param features - list of FeatureBase objects
            :param name - name of the aggregated feature
            :param copy - if True, copies of the features are stored
        Raises ValueError if treat_const == 'assert' and a sub-feature is constant.
        """
        self._check_features(features, name)
        self._name = name
        self._feature_names = [feature._name for feature in features]
        if copy:
            _features = {feature._name: feature.deepcopy() for feature in features}
        else:
            _features = {feature._name: feature for feature in features}
        if self._treat_const == 'delete':
            # Iterate over items: iterating the dict itself yields only the names
            _features = {feature_name: feature for feature_name, feature in _features.items()
                         if not feature.is_constant()}
            # Keep the ordered name list in sync, as exclude_constant() does
            self._feature_names = [feature_name for feature_name in self._feature_names
                                   if feature_name in _features]
        elif self._treat_const == 'assert':
            for feature in _features.values():
                if feature.is_constant():
                    raise ValueError('Constant feature "{}" for treat_const="{}"'.format(
                        feature._name, self._treat_const))
        self._features = _features

    def _check_features(self, features, name):
        """
        Checks that the features come in a list or np.ndarray, that the container is
        not empty, that every element is a FeatureBase and that all lengths are equal
        and positive. Raises TypeError or ValueError otherwise.
        """
        if not isinstance(features, (np.ndarray, list)):
            raise TypeError('Wrong format of "features" with name "{}" for "{}".'.format(name, type(self).__name__))
        if len(features) == 0:
            # Without this guard min() below fails with an unrelated-looking message
            raise ValueError('No features provided for "{}".'.format(name))
        if not all([isinstance(feature, FeatureBase) for feature in features]):
            raise TypeError('One of subfeatures of feature "{}" is not an object of FeatureBase.'.format(name))
        lengths = [len(feature) for feature in features]
        if min(lengths) != max(lengths):
            raise ValueError('Provided features with name "{}" have different lengths'.format(name))
        if min(lengths) == 0:
            raise ValueError('Features with name "{}" have zero length. Must have positive length.'.format(name))

    def exclude_constant(self, verbose=False):
        """
        Removes constant sub-features from the aggregate.
            :param verbose - if True, prints the name of every removed sub-feature
        """
        to_delete = [feature_name for feature_name, feature in self._features.items()
                     if feature.is_constant()]
        for feature_name in to_delete:
            if verbose: print('Deleting constant feature "{}"'.format(feature_name))
            del self._features[feature_name]
        self._feature_names = [feature_name for feature_name in self._feature_names
                               if feature_name in self._features]

    def is_constant(self):
        """
        Returns True when no sub-features are left or every remaining one is constant.
        """
        if len(self._features) == 0: # In case if all feature are excluded due to constant values
            return True
        return all([feature.is_constant() for feature in self._features.values()])

    def get_values(self, feature_names=None, sparse=False, as_dataframe=False, **kwargs):
        """
        Returns the horizontally stacked values of the selected sub-features.
        Arguments:
            :param feature_names - names of sub-features to return (all stored ones if None);
                                   names that are not stored are silently skipped
            :param sparse        - stack with scipy.sparse.hstack if True, np.concatenate otherwise
            :param as_dataframe  - wrap the dense result into a pd.DataFrame; ignored when sparse=True
        Raises ValueError if no sub-feature is selected.
        """
        if feature_names is None:
            feature_names = self._feature_names
        present_names = [feature_name for feature_name in feature_names
                         if feature_name in self._features]
        # TODO: input checks
        X = [self._features[feature_name].get_values(sparse=sparse) for feature_name in present_names]
        if len(X) == 0: # Nothing left to stack
            raise ValueError("All values are constant. Senseless feature!")
        if sparse:
            X = scipy.sparse.hstack(X)
        else:
            X = np.concatenate(X, axis=1)
            if as_dataframe:
                # Label columns with the names actually stacked, not with the raw request
                X = pd.DataFrame(X, columns=present_names)
        return X

    def __repr__(self):
        """Concatenates the string forms of the stored sub-features in their original order."""
        parts = [str(self._features[feature_name]) for feature_name in self._feature_names
                 if feature_name in self._features]
        return 'AggregatedFeature[' + ''.join(parts) + ']'
    
# Demo: aggregate five random binary features and print their stacked values.
n_features, size = 5, 10
sparse = True
as_dataframe = True
features, feature_names = [], []
for n_feature in range(n_features):
    values = np.random.randint(low=0, high=2, size=size)
    new_feature = NumericalFeature(values, 'F{}'.format(n_feature), verbose=0)
    features.append(new_feature)
    feature_names.append(new_feature.get_name())
aggr_feature = AggregatedFeature(features, 'AGGR', copy=False)
print(aggr_feature)
# First every sub-feature (feature_names=None), then only the middle three
for selected_names in (None, feature_names[1:4]):
    values = aggr_feature.get_values(feature_names=selected_names, sparse=sparse, as_dataframe=as_dataframe)
    print(values)
    if sparse:
        print(values.todense())

AggregatedFeature[[NumericalFeature: F0, (10,)][NumericalFeature: F1, (10,)][NumericalFeature: F2, (10,)][NumericalFeature: F3, (10,)][NumericalFeature: F4, (10,)]]
  (2, 0)	1
  (4, 0)	1
  (9, 0)	1
  (2, 1)	1
  (4, 1)	1
  (5, 1)	1
  (7, 1)	1
  (9, 1)	1
  (3, 2)	1
  (6, 2)	1
  (8, 2)	1
  (0, 3)	1
  (5, 3)	1
  (9, 3)	1
  (0, 4)	1
  (3, 4)	1
  (4, 4)	1
  (8, 4)	1
  (9, 4)	1
[[0 0 0 1 1]
 [0 0 0 0 0]
 [1 1 0 0 0]
 [0 0 1 0 1]
 [1 1 0 0 1]
 [0 1 0 1 0]
 [0 0 1 0 0]
 [0 1 0 0 0]
 [0 0 1 0 1]
 [1 1 0 1 1]]
  (2, 0)	1
  (4, 0)	1
  (5, 0)	1
  (7, 0)	1
  (9, 0)	1
  (3, 1)	1
  (6, 1)	1
  (8, 1)	1
  (0, 2)	1
  (5, 2)	1
  (9, 2)	1
[[0 0 1]
 [0 0 0]
 [1 0 0]
 [0 1 0]
 [1 0 0]
 [1 0 1]
 [0 1 0]
 [1 0 0]
 [0 1 0]
 [1 0 1]]


<a id='categorical_feature'></a>
## 4. CategoricalFeature<sup>[toc](#toc)</sup>

In [10]:
import numpy as np
import unittest
import copy
import numbers
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from itertools import product, chain

class CategoricalFeature(FeatureBase):
    """
    Класс для хранения категориальных признаков. На данный момент доступна только реализация с 
    dense хранением данных. 
    
    Список методов:
        deepcopy
        set_label2cat
        set_cat2label
        get_cat_values
        get_filter_feature
        get_counter_feature
        get_loo_feature
        get_le_feature
        get_ohe_feature
    """
    ################################################################################### 

    CAT_FEATURE_INIT = 8
    OHE = 9
    def _is_label_encoded(self, values, name=None):
        """
        Returns True if values are label encoded, i.e. their distinct values are exactly
        the consecutive numbers 0, 1, ..., K-1; returns False otherwise. A feature that
        is not numeric (according to self.is_numeric()) is reported as not encoded.
        Arguments:
            :param values - categorical values (np.ndarray)
            :param name - name of the categorical feature (str); not used here
        """
        if not self.is_numeric():
            return False
        distinct = sorted(set(values))
        if distinct[0] != 0:
            return False
        # Every next distinct value must exceed the previous one by exactly 1
        return all(following == previous + 1
                   for previous, following in zip(distinct, distinct[1:]))
    
    def is_label_encoded(self):
        """Tells whether the stored values are currently label encoded."""
        stored_values, stored_name = self._values, self._name
        return self._is_label_encoded(stored_values, stored_name)

    def _check_label_encoded(self, values, name, throw=True):
        """
        Returns True if the values are label encoded. Otherwise either raises
        ValueError (throw=True) or returns False (throw=False).
        Arguments:
            :param values - feature values (np.ndarray)
            :param name   - name of the categorical feature (str), used in the error message
            :param throw  - raise an exception on failure? (bool)
        """
        if self._is_label_encoded(values):
            return True
        if throw:
            raise ValueError(self._error_msg('Feature "{}" is not label-encoded'.format(name)))
        return False
            
    def _check_cat2label(self, values, name, cat2label, throw=True):
        """
        Checks that the category-to-label mapping is consistent with the values: the set
        of values must equal the set of labels in the mapping, and the mapping must be
        one-to-one. Returns True when both hold. Otherwise either raises ValueError
        (throw=True) or returns False (throw=False).
        Arguments:
            :param values    - feature values (np.ndarray)
            :param name      - feature name (str)
            :param cat2label - mapping from categories to labels (dict)
            :param throw     - raise an exception on failure? (bool)
        """
        mapped_labels = set(cat2label.values())
        if set(values) != mapped_labels:
            if not throw:
                return False
            # Both sets are dumped to stdout before raising to ease debugging
            print(set(values), set(cat2label.values()))
            raise ValueError(self._error_msg('Num of values != number of labels for feature "{}"'.format(name)))
        # Dict keys are unique, so fewer distinct labels than keys means two categories share a label
        if len(cat2label) != len(mapped_labels):
            if not throw:
                return False
            raise ValueError(self._error_msg('There is no one-to-one correspondance in cat2label for feature "{}"'.format(name)))
        return True
    
    ###################################################################################
    def deepcopy(self):
        """
        Returns a new CategoricalFeature built from a deep copy of the stored labels, with
        the same name, verbosity and category-to-label mapping. The filtering state
        (_threshold, _unique_label) is not carried over: the constructor resets it to None.
        """
        copied_values = copy.deepcopy(self._values)
        clone = CategoricalFeature(copied_values, self._name, verbose=self._verbose)
        clone.set_cat2label(self._cat2label)
        return clone
        
    def __repr__(self):
        """Formats the feature as 'ClassName(name; shape of values)'."""
        return '{}({}; {})'.format(type(self).__name__, self.name, self.values.shape)
    
    def __init__(self, values, name, cat2label=None, verbose=0):
        """
        Builds a categorical feature. By the end of the constructor the values are stored
        as consecutive integer labels 0..N-1, where N is the number of distinct values
        of the feature.
        Arguments:
            :param values    - values of the categorical variable (np.ndarray, list)
            :param name      - name of the categorical variable (str)
            :param cat2label - mapping used to convert categories into labels (dict)
            :param verbose   - verbosity level, forwarded to the base class constructor
        """
        assert isinstance(values, (np.ndarray, list))
        super().__init__(values, name, verbose)
        self._name = self._get_categorical_name(name)
        msg_init = self._info_msg('__init__({})'.format(name))

        # These values are used for filtering rare values. They must exist before
        # _label_encode() is called below, because that method reads _unique_label.
        self._threshold = None
        self._unique_label = None

        self._cat2label = None
        self._label2cat = None
        if cat2label is not None:
            self._printers[self.CAT_FEATURE_INIT](msg_init + ': applying mapping "cat2label" to values')
            self._values = np.array(list(map(lambda cat: cat2label[cat], self._values)))
            self._check_cat2label(self._values, self._name, cat2label)
            self._cat2label = copy.deepcopy(cat2label)
            self._label2cat = {label:cat for cat, label in cat2label.items()}

        self._properties = {}
        self._properties['is_numeric'] = self.is_numeric()
        self._properties['is_label_encoded'] = self.is_label_encoded()
        self._properties['is_constant'] = self.is_constant()
        if self._properties['is_label_encoded']:
            self._printers[self.CAT_FEATURE_INIT](msg_init + ': feature "{}" is already label encoded'.format(name))
        else:
            self._printers[self.CAT_FEATURE_INIT](msg_init + ': label encoding feature "{}"'.format(name))
        # No-op for already encoded values; otherwise re-encodes and refreshes _properties
        self._label_encode()

        assert self._properties['is_label_encoded'], 'By the end of constructor feature "{}" is not label encoded. Something is wrong.'.format(self.name)
        assert self._properties['is_numeric'], 'By the end of the constructor feature "{}" is not numeric. Something is wrong'.format(self.name)
        
    ##################################################################################
    def set_label2cat(self, label2cat=None):
        """
        Installs a label-to-category mapping (and its inverse). Passing None clears both
        mappings. The inverse mapping is validated against the stored values first;
        an inconsistent mapping raises ValueError.
        """
        if label2cat is None:
            self._label2cat = None
            self._cat2label = None
            return
        inverse = {cat: label for label, cat in label2cat.items()}
        self._check_cat2label(self._values, self._name, inverse, True)
        self._label2cat = copy.deepcopy(label2cat)
        self._cat2label = inverse
            
    def set_cat2label(self, cat2label=None):
        """
        Installs a category-to-label mapping (and its inverse). Passing None clears both
        mappings. Assumes that self._values currently holds labels; the mapping is
        validated against them and an inconsistent mapping raises ValueError.
        """
        if cat2label is None:
            self._cat2label = None
            self._label2cat = None
            return
        self._check_cat2label(self._values, self._name, cat2label, True)
        self._cat2label = copy.deepcopy(cat2label)
        self._label2cat = {label: cat for cat, label in cat2label.items()}
        
    def get_cat_values(self):
        """
        Returns the values as the original categories rather than in the label-encoded
        form in which they are stored inside CategoricalFeature.
        """
        if self._label2cat is not None:
            return np.array([self._label2cat[label] for label in self._values])
        # No mapping is only possible when the feature was passed in already encoded
        assert self._properties['is_label_encoded'], 'Expected encoded feature.'
        return np.array(self._values)
        
    ##################################################################################
        
    def _filter_feature(self, threshold):
        """
        Filters out, in place, the categories that occur no more than threshold times and
        replaces them with a single new category. Before re-encoding, the new category
        gets the label max(values) + 1. The feature name is rewritten by
        _get_filtered_name even when no value changes.

        Arguments:
            :param threshold - a category is filtered out if its number of occurrences
                                does not exceed threshold (int, float)
        Raises ValueError if the feature is not label encoded.
        """
        
        # Checking if the feature is label encoded
        if not self._properties['is_label_encoded']:
            raise ValueError('Cannot filter feature "{}" as it is not label encoded.'.format(self.name))
        # Even if the filtration does not change feature values, we change its name and threshold parameters
        self._name = self._get_filtered_name(self._name, threshold)
        self._threshold = threshold
        
        # Checking if there are rare values present in the feature
        counts = Counter(self._values)
        for label, n_occurences in counts.items():
            if n_occurences <= threshold:
                # One rare label is enough: reserve a fresh label above all existing ones
                self._unique_label = self._values.max() + 1
                # Merging labels leaves gaps, so the values are re-encoded at the end
                self._properties['is_label_encoded'] = False
                break
        if self._unique_label is None: 
            # There are no labels which occur less or equal to threshold times
            return
        
        # Some features occur less or equal threshold times. Let us find them
        rare_labels = set()
        rare_categories = set()
        # Changing rare labels to the chosen unique_label
        for n, label in enumerate(self._values):
            if counts[label] <= threshold:
                if self._cat2label is not None:
                    rare_labels.add(label)
                    rare_categories.add(self._label2cat[label])
                self._values[n] = self._unique_label # setting rare label to new value

        # Forming new categories names
        if self._cat2label is not None:    
            if len(rare_categories) > 1:
                # Several rare categories are merged under the name "(a|b|...)"
                new_cat = '(' + '|'.join(sorted(list(rare_categories))) + ')'
            elif len(rare_categories) == 1:
                new_cat = list(rare_categories)[0]
            else:
                assert False, '"rare_categories" must not be empty at this point. Something is wrong.'

            # Replace the entries of the rare categories with the merged one in both mappings
            for label in rare_labels:
                del self._label2cat[label]
            self._label2cat[self._unique_label] = new_cat
            for cat in rare_categories:
                del self._cat2label[cat]
            self._cat2label[new_cat] = self._unique_label
            
        self._properties['is_constant'] = self.is_constant()
        # Re-encode to consecutive labels; this also remaps _unique_label and both mappings
        self._label_encode()
        assert self._properties['is_label_encoded']
        assert self._properties['is_numeric']
        
    def get_filtered_feature(self, threshold):
        """
        Returns a new feature obtained from this one by filtering categories with the given
        threshold: every category occurring no more than threshold times is filtered out by
        _filter_feature, and all filtered categories become one new category. This feature
        itself is left untouched, since the filtering is applied to a copy.

        Arguments:
            :param threshold - a category is filtered out if its number of occurrences
                               does not exceed threshold (int, float)
        """
        filtered = self.deepcopy()
        filtered._filter_feature(threshold)
        return filtered
    
    def get_counter_feature(self):
        """
        Returns a NumericalFeature in which every value is replaced by the number of
        occurrences of its category in this feature.
        """
        occurrences = Counter(self._values)
        counted = np.zeros_like(self._values)
        # Slice assignment keeps the dtype chosen by zeros_like, like per-item assignment
        counted[:] = [occurrences[label] for label in self._values]
        return NumericalFeature(counted, self._get_counter_name(self._name))

    def get_loo_feature(self, Y_train, cv, alpha=0.01, seed=1234, scale=0.01):
        """
        Returns a NumericalFeature with a smoothed per-category mean of the target.
        Assumes that the first len(Y_train) examples belong to the training set.

        For training rows the statistic of a row is computed on the other folds of cv;
        for the remaining (test) rows it is computed on the whole training part:
            (sum of y over rows with the label + alpha * mean(Y_train) * N_all)
            / (max(N_label, 1) + alpha * N_all)
        Finally every value is multiplied by noise drawn from N(1, scale) if scale > 0.

        Arguments:
            :param Y_train - target values of the training part (np.ndarray, list)
            :param cv      - splitter object providing split(X, y)
            :param alpha   - smoothing strength towards the global mean
            :param seed    - seed passed to np.random.seed (resets the global NumPy RNG)
            :param scale   - standard deviation of the multiplicative noise; no noise if <= 0
        """
        assert isinstance(Y_train, (np.ndarray, list))
        assert len(Y_train) <= len(self._values)
        # A plain list cannot be indexed with the index arrays produced by cv.split
        Y_train = np.asarray(Y_train)
        train_size = len(Y_train)
        test_size = len(self._values) - train_size

        np.random.seed(seed)
        X_train = self._values[:train_size]
        mean_y = np.mean(Y_train)
        all_labels = set(self._values)
        X_new_train = np.zeros(len(X_train))

        for n_split, (train_indices, test_indices) in enumerate(cv.split(X_train, Y_train)):
            x_train, y_train = X_train[train_indices], Y_train[train_indices]
            x_test = X_train[test_indices]
            N_all = x_train.shape[0]
            for label in all_labels:
                train_mask = x_train == label
                N_label = np.sum(train_mask)
                # NOTE(review): debugging output; the printed "den" omits the max(N_label, 1) guard used below
                print('n_split = {}, label = {}, den = {}'.format(n_split, label, N_label + alpha * N_all))
                X_new_train[test_indices[x_test == label]] = \
                    (np.sum(y_train[train_mask]) + alpha * mean_y * N_all) / (max(N_label, 1) + alpha * N_all)
        if scale > 0:
            multipliers = np.random.normal(loc=1.0, scale=scale, size=len(self._values))
        else:
            multipliers = np.ones(len(self._values))
        if test_size > 0:
            X_test = self._values[train_size:]
            X_new_test = np.zeros(test_size)
            N_all = train_size
            for label in all_labels:
                train_mask = X_train == label
                N_label = np.sum(train_mask)
                X_new_test[X_test == label] = (np.sum(Y_train[train_mask]) +
                                               alpha * mean_y * train_size) / (max(N_label, 1) + alpha * N_all)

            X_new = np.concatenate([X_new_train, X_new_test]) * multipliers
        else:
            X_new = X_new_train * multipliers
        new_name = self._get_loo_name(self._name)
        return NumericalFeature(X_new, new_name)
        
    ############################################################
    ##                       Кодировщики                      ##
    ############################################################
    def _label_encode(self):
        """
        Label-encodes the feature in place.

        If the values are already label encoded, only the name is adjusted (and only when a
        non-empty 'LE' prefix is configured). Otherwise the values are re-encoded with
        sklearn's LabelEncoder, the cached properties are refreshed, and _unique_label,
        _cat2label and _label2cat are translated to the new labels.
        """
        if self._properties['is_label_encoded']:
            if len(FEATURE_PREFIXES['LE']) > 0:
                if not self._name.startswith(FEATURE_PREFIXES['LE']):
                    # Write to _name, like every other rename in this class
                    self._name = self._get_label_encoded_name(self._name)
            return

        label_encoder = LabelEncoder()
        self._values = label_encoder.fit_transform(self._values)
        classes = label_encoder.classes_
        # classes[i] is the old value that received the new label i
        old_label2new_label = {old_label:new_label for new_label, old_label in enumerate(classes)}
        new_label2old_label = {new_label:old_label for new_label, old_label in enumerate(classes)}

        self._name = self._get_label_encoded_name(self.name)
        self._properties['is_label_encoded'] = self.is_label_encoded()
        self._properties['is_numeric'] = self.is_numeric()
        self._properties['is_constant'] = self.is_constant()

        if self._unique_label is not None:
            # This place can be reached when _label_encode() is invoked from _filter_feature()
            self._unique_label = old_label2new_label[self._unique_label]
            # The merged category had the largest old label, so it must get the largest new one
            assert self._unique_label == len(old_label2new_label) - 1
            assert (FEATURE_PREFIXES['FIL'] + '{}_'.format(self._threshold)) in self._name

        if self._label2cat is None:
            # No user mapping: the original raw values play the role of categories
            self._cat2label = old_label2new_label
            self._label2cat = new_label2old_label
        else:
            # Re-key the existing mapping from old labels to new labels
            new_label2cat = {}
            for old_label in self._label2cat:
                new_label = old_label2new_label[old_label]
                new_label2cat[new_label] = self._label2cat[old_label]
            cat2new_label = {cat:new_label for new_label, cat in new_label2cat.items()}
            self._cat2label = cat2new_label
            self._label2cat = new_label2cat

        assert self._properties['is_label_encoded']
        assert self._properties['is_numeric']

    def get_le_feature(self):
        """
        Returns a label-encoded feature derived from this one. The current implementation
        of CategoricalFeature keeps an invariant: the internal state of the feature is
        always label encoded. Hence the _label_encode() call below is essentially
        redundant; this may change in future versions.
        """
        encoded = self.deepcopy()
        encoded._label_encode()
        return encoded
    
    def get_ohe_feature(self, sparse=True, omit_uniques=False):
        """
        Returns the one-hot-encoded representation of the feature: a single
        NumericalFeature for a constant or binary feature, otherwise an AggregatedFeature
        with one NumericalFeature per category, named after the category.
        Arguments:
            :param sparse       - build a sparse or a dense representation? (bool)
            :param omit_uniques - if True, the category produced by filtering is left out
                                  of the OHE feature (bool)
        """
        assert self._properties['is_label_encoded']
        assert self._properties['is_numeric']
        msg_base = self._method_msg('get_ohe_feature(): ')

        # -1 never matches a real label, i.e. "nothing to omit"
        if omit_uniques and self._unique_label is not None:
            unique_label = self._unique_label
        else:
            unique_label = -1

        ohe_name = self._get_ohe_name(self._name)
        n_categories = len(Counter(self._values))

        if self._properties['is_constant']:
            # No sense of OHE for constant feature
            assert np.sum(self._values) == 0
            assert len(self._cat2label) == 1
            assert list(self._cat2label.values())[0] == 0
            self._printers[self.OHE](msg_base + 'OHE of constant feature "{}".'.format(self._name))
            return NumericalFeature(self._values, ohe_name)

        if n_categories == 2:
            # In case of binary feature one column of OHE representation can be omitted.
            # (An earlier variant returned a constant here when unique_label >= 0; it is
            # unclear why, so the labels are returned as they are.)
            assert set(self._cat2label.values()) == set([0, 1])
            self._printers[self.OHE](msg_base + 'OHE senseless for binary feature "{}".'.format(self._name))
            return NumericalFeature(self._values, ohe_name)

        encoder = OneHotEncoder(sparse=sparse)
        ohe_values = encoder.fit_transform(self._values[:, np.newaxis])
        if sparse:
            ohe_values = ohe_values.tocsc()
        if unique_label >= 0:
            # The filtered category carries the largest label, hence the last column
            assert unique_label == n_categories - 1
            mask = (self._values == unique_label)
            last_column = ohe_values[:, unique_label]
            if sparse:
                last_column = last_column.toarray().flatten()
            assert np.all(last_column == mask), 'Last column of ohe feature must correspond to unique_label.'
            ohe_values = ohe_values[:, :unique_label]

        # Collect (category name, column) pairs first, then wrap each into a NumericalFeature
        kept_labels = [label for label in sorted(self._label2cat.keys()) if label != unique_label]
        named_columns = [(self._label2cat[label], ohe_values[:, label]) for label in kept_labels]
        features = [NumericalFeature(column, column_name) for column_name, column in named_columns]
        return AggregatedFeature(features, ohe_name, verbose=self._verbose, copy=False)
        
    def get_properties(self):
        """Returns a deep copy of the cached properties dict, so callers cannot alter the cache."""
        snapshot = copy.deepcopy(self._properties)
        return snapshot


# Manual checks for CategoricalFeature. Every section is guarded by a flag below;
# with all flags False this cell only constructs the feature `f`.
name = 'f'
cat_values = ['A', 'A', 'B', 'A', 'B', 'C', 'A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', 'E']
# Labels expected after applying cat2label to cat_values
values     = [0,   0,    1,   0,   1,   2,   0,   1,   2,   3,   0,   1,   2,   3,   4]
cat2label  = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
f = CategoricalFeature(cat_values, name, cat2label, verbose=0)

# Switches for the individual sections
test_initial = False
test_counter = False
test_filtered = False
test_ohe = False
test_ohe_filtered = False
test_loo = False

# Stored labels and restored categories versus the expected ones
if test_initial:
    print('INITIAL FEATURE:')
    print('True values    : ', values)
    print('Obtained values: ', list(f.get_values().flatten()))
    print('\nTrue CAT values: ', cat_values)
    print('Obtained CATs  : ', list(f.get_cat_values()))

# Occurrence counts next to the original labels
if test_counter:
    print('\n\nCOUNTER FEATURE')
    print('Initial feature: ', f)
    counter_f = f.get_counter_feature()
    print('Counter feature: ', counter_f)
    print(counter_f.get_values().flatten())
    print('Values of initial and counter features:')
    args = [('initial', f.get_values().flatten()), ('counter', counter_f.get_values().flatten())]
    print_columns(*args)

# Filtering with thresholds 0..5
if test_filtered:
    print('\n\nFILTERED FEATURES')
    ffs = {n:f.get_filtered_feature(n) for n in range(6)}
    for n in range(6):
        fil_feature = ffs[n].get_values().flatten()
        cat_feature = ffs[n].get_cat_values().flatten()
        ctr_feature = ffs[n].get_counter_feature().get_values().flatten()
        print('fil_feature props: ', ffs[n].get_properties())
        print('fil feature name: ', ffs[n])
        print('ctr feature name: ', ffs[n].get_counter_feature())
        args = [('fil_feature', fil_feature), ('cat_feature', cat_feature), ('ctr_feature', ctr_feature)]
        print_columns(*args)
        print('\n\n')

# All four sparse/dense combinations of building and reading OHE must agree
if test_ohe:   
    print('\n\nOHE FEATURES')
    ohe_feature = f.get_ohe_feature()
    a = f.get_ohe_feature(sparse=False).get_values(sparse=True).toarray()
    b = f.get_ohe_feature(sparse=False).get_values(sparse=False)
    c = f.get_ohe_feature(sparse=True).get_values(sparse=False)
    d = f.get_ohe_feature(sparse=True).get_values(sparse=True).toarray()
    assert np.allclose(a, b)
    assert np.allclose(b, c)
    assert np.allclose(c, d)
    assert np.allclose(d, a)
    # NOTE(review): this line prints ohe_feature under the label "Initial feature" -- f may have been intended
    print('Initial feature:', ohe_feature)
    print('\tvalues:\n', f.get_values().flatten())
    print('OHE feature:', ohe_feature)
    print('\tOHE values:\n', a)

# OHE of filtered features for every (threshold, omit_uniques) pair
if test_ohe_filtered:
    print('\n\nOHE FEATURES + FILTRATION')
    for threshold, omit_uniques in product([1, 2, 3, 4, 5], [False, True]):
        ff = f.get_filtered_feature(threshold=threshold)
        print('OHE feature with omit_uniques = {} and threshold = {}'.format(omit_uniques, threshold))
        print('Initial  feature name:', f)
        print('Filtered feature name:', ff)
        print('Initial feature values:', f.get_values().flatten())
        print('Filtered fature values:', ff.get_values().flatten())
        print('threhold =', ff._threshold, '  unique_label =', ff._unique_label)
        ff_ohe = ff.get_ohe_feature(omit_uniques=omit_uniques)
        print('FilOHE feature name:  ', ff_ohe)
        print('FilOHE feature values:\n', ff_ohe.get_values(sparse=False))
        print('FilOHE is constant:   ', ff_ohe.is_constant())
        print('\n\n')

# Leave-one-out feature on a small binary example, without smoothing or noise
if test_loo:
    print('\n\nLEAVE ONE OUT')
    X = np.array([0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0])
    y = np.array([0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0])
    cat_feature = CategoricalFeature(X, 'f')
    random_state = 345
    n_splits = 2
    cv = StratifiedKFold(n_splits, shuffle=True, random_state=random_state)
    print_columns(('ind', np.arange(len(X))), ('X', X), ('y', y))
    for n_split, (train_indices, test_indices) in enumerate(cv.split(X, y)):
        print('\n\nn_split =', n_split)
        X_tr, y_tr = X[train_indices], y[train_indices]
        X_ts, y_ts = X[test_indices],  y[test_indices]
        # Expected result; does not depend on the split, but is only defined inside this loop
        X_loo_true = np.array([1/4., 2/3., 0, 2/3., 0, 2/3., 2/3., 2/3., 2/3., 0, 0, 1/4.])
        print()
        print_columns(('ind', train_indices), ('X_tr', X_tr), ('y_tr', y_tr))
        print()
        print_columns(('ind', test_indices), ('X_ts', X_ts), ('y_ts', y_ts))
        
    loo_feature = cat_feature.get_loo_feature(y, cv, alpha=0, scale=0.0)
    X_loo_found = loo_feature.get_values(sparse=False).flatten()
    print('LOO:\n')
    print_columns(('True', X_loo_true), ('Found', X_loo_found))
    # NOTE(review): the result of allclose is discarded, so a mismatch goes unnoticed --
    # presumably an assert was intended; confirm the expected values before adding one
    np.allclose(X_loo_found, X_loo_true)
    #print(cat_feature, cat_feature)
    #print(loo_feature, loo_feature.values

<a id='categorical_combiner'></a>
## 5. CategoricalCombiner<sup>[toc](#toc)</sup>

In [8]:
from itertools import product, chain, combinations

class CategoricalCombiner(Checker):
    """
    Builds new categorical features by combining existing ones.

    Public methods:
        get_all_combinations - combine every group of `degree` features
        get_combined_feature - combine one specific group of features
    """
    METHOD = 4  # index into self._printers used for method-level messages

    def __init__(self, verbose=0):
        """
            :param verbose - message verbosity level
        """
        super().__init__()
        self._verbose = verbose

    def get_all_combinations(self, features, degree=None, hash=hash):
        """
        Returns all combinations of the given degree built from the features.
        Arguments:
            :param features - features to combine; all must be CategoricalFeature
            :param degree   - size of every combination; each new feature is a
                              combination of `degree` features. If None, all
                              degrees from 1 to len(features) are produced.
            :param hash     - function turning a tuple of feature values into
                              the value of the new feature
        Returns:
            dict {new_feature_name: new_feature}
        """
        features = list(features)  # iterated more than once below
        feature_names = [feature.name for feature in features]

        # The formatted message used to be assigned to a misspelled variable,
        # so the arguments never reached the printer.
        method_msg = self._method_msg('get_all_combinations')
        method_msg = method_msg + '({}, degree={})'.format(feature_names, degree)
        self._printers[self.METHOD](method_msg)

        if degree is None:
            degree_range = range(1, len(feature_names) + 1)
        else:
            degree_range = [degree]

        combined_features = {}
        for current_degree in degree_range:
            for some_features in combinations(features, current_degree):
                new_feature = self.get_combined_feature(some_features, hash)
                combined_features[new_feature.get_name()] = new_feature
        return combined_features

    def get_combined_feature(self, features, hash=hash):
        """
        Combines several features into a single categorical feature.
        Arguments:
            :param features - features to combine; must all have the same length
            :param hash     - function turning a tuple of feature values into
                              the value of the new feature
        Returns:
            A deep copy of the feature if only one is given; otherwise a
            CategoricalFeature named 'name1+name2+...' whose values are the
            label-encoded results of `hash`. Distinct value tuples that collide
            under `hash` receive the same label.
        Raises:
            ValueError if no features are given or their sizes differ.
        """
        features = list(features)
        # Emptiness must be checked first: the size check also fails on an
        # empty input and would otherwise hide this more precise message.
        if len(features) < 1:
            raise ValueError('At least one feature name must be given')
        self.check_sizes_(features)
        if len(features) == 1:
            return features[0].deepcopy()

        feature_values = [feature.get_values(False).flatten() for feature in features]
        feature_names = [feature.get_name() for feature in features]

        # One hashed value per sample, built from that sample's tuple of values.
        new_values = [hash(hyper_value) for hyper_value in zip(*feature_values)]
        new_values = LabelEncoder().fit_transform(new_values)
        new_name = '+'.join(feature_names)
        return CategoricalFeature(new_values, new_name)

    def check_sizes_(self, features):
        """
        Raises ValueError unless all features have one common length
        (an empty collection of features is rejected as well).
        """
        if len({len(feature) for feature in features}) != 1:
            raise ValueError('Features must have equal sizes!')

            
test = True
if test:
    # Three toy categorical columns of equal length.
    features = {
        'f1': [0, 1, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0],
        'f2': [0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0],
        'f3': [1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
    }
    cat_features = [CategoricalFeature(features[name], name) for name in sorted(features)]
    cat_combiner = CategoricalCombiner()

    # Combination of all three features, then its filtered version.
    new_feature = cat_combiner.get_combined_feature(cat_features)
    print('comb_feature:', new_feature)
    print('name = {}, values = {}'.format(new_feature._name, new_feature._values))
    fil_feature = new_feature.get_filtered_feature(1)
    print('fil_feature: ', fil_feature)
    print('name = {}, values = {}'.format(fil_feature.name, fil_feature._values))

    # Every combination of a fixed degree, and finally of all degrees (None).
    for degree in (1, 2, 3, None):
        print('\ndegree = {}'.format(degree))
        new_features = cat_combiner.get_all_combinations(cat_features, degree=degree)
        print('new_features:', new_features)
        # Rows are ordered by label so the printout does not depend on dict order.
        args = sorted(
            (('  ' + f_name, feature.get_values(False).flatten())
             for f_name, feature in new_features.items()),
            key=lambda row: row[0])
        print_columns(*args)

comb_feature: [CategoricalFeature: f1+f2+f3, (12,)]
name = f1+f2+f3, values = [0 2 6 1 4 7 7 1 3 5 6 0]
fil_feature:  [CategoricalFeature: Fil1_f1+f2+f3, (12,)]
name = Fil1_f1+f2+f3, values = [0 4 2 1 4 3 3 1 4 4 2 0]

degree = 1
new_features: {'f3': CategoricalFeature(f3; (12, 1)), 'f2': CategoricalFeature(f2; (12, 1)), 'f1': CategoricalFeature(f1; (12, 1))}
  f1: 0 1 2 0 1 2 2 0 1 2 2 0
  f2: 0 1 0 1 0 1 1 1 1 0 0 0
  f3: 1 0 1 1 1 1 1 1 1 0 1 1

degree = 2
new_features: {'f1+f3': CategoricalFeature(f1+f3; (12, 1)), 'f2+f3': CategoricalFeature(f2+f3; (12, 1)), 'f1+f2': CategoricalFeature(f1+f2; (12, 1))}
  f1+f2: 0 2 4 1 3 5 5 1 2 4 4 0
  f1+f3: 0 2 4 0 1 4 4 0 1 3 4 0
  f2+f3: 1 3 1 2 1 2 2 2 2 0 1 1

degree = 3
new_features: {'f1+f2+f3': CategoricalFeature(f1+f2+f3; (12, 1))}
  f1+f2+f3: 0 2 6 1 4 7 7 1 3 5 6 0

degree = None
new_features: {'f1+f3': CategoricalFeature(f1+f3; (12, 1)), 'f2+f3': CategoricalFeature(f2+f3; (12, 1)), 'f3': CategoricalFeature(f3; (12, 1)), 'f1+f2+f3': Ca

<a id='features_storage'></a>
# 6. CategoricalFeaturesManager<sup>[toc](#toc)</sup>

In [48]:
import collections

class CategoricalFeaturesManager(Checker):
    """
    Storage for categorical features, keyed by feature name. All stored
    features must have the same number of samples. Main operations:
        is_present
        add_feature
        set_feature
        del_feature
        get_feature
        get_list_of_features
    """
    METHOD = 4  # index into self._printers used for method-level messages

    def __init__(self, features=None, verbose=0):
        """
            :param features - optional iterable of CategoricalFeature objects
                              to put into the storage right away
            :param verbose  - message verbosity level
        """
        super().__init__()
        self._features = collections.OrderedDict()
        self._n_samples = None  # common length of stored features; None while empty
        self._verbose = verbose
        self._categorical_combiner = CategoricalCombiner(verbose)
        if features is not None:
            self.set_features(features)

    def __contains__(self, feature_name):
        return feature_name in self._features

    ###################################################################
    def is_present(self, name):
        """
        Returns True if a feature with this name is in the storage, else False.
        """
        return name in self._features

    def _check_if_present(self, *args):
        """Raises ValueError for the first of the given names that is not stored."""
        for name in args:
            if not self.is_present(name):
                raise ValueError(self._error_msg("unknown feature \"{}\"".format(name)))

    def _is_binary(self, values):
        """Returns True if `values` contains exactly two distinct values."""
        return len(set(values)) == 2

    def _check_feature(self, feature):
        """
        Checks that `feature` is a CategoricalFeature whose length matches the
        features already stored. Raises ValueError on a size mismatch.
        """
        self._check_type(feature, str(feature), CategoricalFeature)
        if self._n_samples is not None and len(feature) != self._n_samples:
            raise ValueError("Given feature vector has size {} while must have size {}.".format(
                        len(feature), self._n_samples))

    ###################################################################
    def add_feature(self, feature, copy=True, replace=False):
        """Alias of set_feature."""
        self.set_feature(feature, copy, replace)

    def set_feature(self, feature, copy=True, replace=False):
        """
        Puts a feature into the storage under its own name. By default raises
        an exception if a feature with the same name is already stored, and by
        default stores a copy of the feature.
        Arguments:
            :param feature - the feature to store. (CategoricalFeature)
            :param copy    - if True, a deep copy of the feature is stored. (bool)
            :param replace - if True, a stored feature with the same name is replaced;
                             if False, such a name clash raises ValueError. (bool)
        """
        self._check_feature(feature)  # right size and categorical type
        name = feature.get_name()
        if not replace and self.is_present(name):
            error_msg = self._method_msg('set_feature: ') +\
                'feature "{}" cannot be replaced. Check "replace" parameter'.format(name)
            raise ValueError(error_msg)

        self._n_samples = len(feature)
        # Assignment overwrites an existing entry in place (its position in the
        # ordered mapping is kept). The former del_feature(feature, ...) call
        # received the object instead of its name and therefore never deleted
        # anything, so it has been dropped.
        self._features[name] = feature.deepcopy() if copy else feature

    def set_features(self, features, copy=True, replace=False):
        """Stores every feature of the iterable `features` via set_feature."""
        for feature in features:
            self.set_feature(feature, copy=copy, replace=replace)

    def del_feature(self, feature_name, throw=True):
        """
        Removes a feature from the storage.
        Arguments:
            :param feature_name - name of the feature to remove
            :param throw        - if True, a missing feature raises KeyError;
                                  if False, it is silently ignored
        Returns:
            True if the feature was removed, False if it was not present.
        """
        if not self.is_present(feature_name):
            if throw:
                error_msg = self._method_msg('del_feature') +\
                    'feature "{}" is not present in storage. Cannot be deleted.'.format(feature_name)
                raise KeyError(error_msg)
            return False
        del self._features[feature_name]
        if len(self._features) == 0:
            # An empty storage accepts features of any length again.
            self._n_samples = None
        return True

    def get_feature(self, feature_name, copy=True):
        """
        Returns the stored feature (a deep copy by default).
        Raises ValueError if the name is unknown.
        """
        self._check_if_present(feature_name)
        if copy:
            return self._features[feature_name].deepcopy()
        return self._features[feature_name]

    def get_list_of_features(self):
        """Returns the names of the stored features in storage order."""
        return list(self._features.keys())

    ###################################################################
    #    Combination of categorical features                          #
    ###################################################################
    def add_all_combinations(self, feature_names, degree, hash=hash):
        """Builds all combinations (see get_all_combinations) and stores them without copying."""
        self.get_all_combinations(feature_names, degree, hash=hash, store=True, copy=False)

    def get_all_combinations(self, feature_names, degree, hash=hash, store=True, copy=True):
        """
        Builds every combination of `degree` of the named features.
        Arguments:
            :param feature_names - names of stored features to combine
            :param degree        - size of each combination; None means all sizes
            :param hash          - function mapping a tuple of values to a new value
            :param store         - if True, the new combinations are put into the storage
            :param copy          - passed to set_feature when storing
        Returns:
            dict {new_feature_name: new_feature}
        """
        method_msg = self._method_msg('get_all_combinations')
        self._printers[self.METHOD](method_msg + '(names={}, degree={}, store={})'.format(feature_names, degree, store))
        feature_names = list(feature_names)
        self._check_if_present(*feature_names)

        # The combiner iterates over feature objects, so it needs a list, not a dict.
        features = [self._features[name] for name in feature_names]
        combined_features = self._categorical_combiner.get_all_combinations(features, degree=degree, hash=hash)
        if store:
            # Degree-1 "combinations" are copies of the source features and keep
            # their names; they are already stored, so only real combinations
            # are added. This also covers degree=None.
            source_names = set(feature_names)
            for name, combined_feature in combined_features.items():
                if name in source_names:
                    continue
                self.set_feature(combined_feature, copy=copy, replace=False)
        return combined_features

    def add_combined_feature(self, feature_names, hash=hash):
        """Builds one combined feature (see get_combined_feature) and stores it without copying."""
        self.get_combined_feature(feature_names, hash=hash, store=True, copy=False)

    def get_combined_feature(self, feature_names, hash=hash, store=True, copy=True):
        """
        Combines the named features into a single feature.
        Arguments:
            :param feature_names - names of stored features to combine
            :param hash          - function mapping a tuple of values to a new value
            :param store         - if True and more than one name is given,
                                   the result is put into the storage
            :param copy          - passed to set_feature when storing
        Returns:
            The combined feature.
        """
        method_msg = self._method_msg('get_combined_feature')
        self._printers[self.METHOD](method_msg + '(names={}, store={})'.format(feature_names, store))
        self._check_if_present(*feature_names)

        features = [self._features[name] for name in feature_names]
        combined_feature = self._categorical_combiner.get_combined_feature(features, hash=hash)
        # A single-name "combination" is a copy of a feature that is already stored.
        if store and len(feature_names) > 1:
            self.set_feature(combined_feature, copy=copy, replace=False)
        return combined_feature

    ############################################################
    ##       Assembling the final feature representation      ##
    ############################################################
    def assemble_data_frame(self, feature_names=None):
        """
        Returns a pandas DataFrame built from the dense values of the named
        features (all stored features if `feature_names` is None).
        """
        if feature_names is None:
            feature_names = list(self._features.keys())
        self._check_if_present(*feature_names)
        feature_values = [self._features[feature_name].get_values(sparse=False)
                          for feature_name in feature_names]
        return pd.DataFrame(np.hstack(feature_values), columns=feature_names)

    def assemble(self, feature_names, sparse=False):
        """
        Stacks the values of the named features horizontally.
        Arguments:
            :param feature_names - names of stored features, in output order
            :param sparse        - if True, values are requested in sparse form
                                   and stacked with scipy.sparse.hstack;
                                   otherwise numpy.hstack is used
        """
        self._check_if_present(*feature_names)
        X = [self._features[feature_name].get_values(sparse=sparse)
             for feature_name in feature_names]
        if sparse:
            # A bare `import scipy` does not guarantee the submodule is loaded.
            import scipy.sparse
            return scipy.sparse.hstack(X)
        return np.hstack(X)

    def filter_features(self, names=None, threshold=1):
        """
        Filters the named features (all stored ones if `names` is None) in
        place with the given threshold and re-keys the storage afterwards.
        """
        if names is None:
            names = list(self._features.keys())
        for name in names:
            self._features[name]._filter_feature(threshold)
        self._update_dict()

    def _update_dict(self):
        """Rebuilds the storage so each key equals its feature's current name."""
        new_features = collections.OrderedDict()
        for feature in self._features.values():
            new_features[feature.get_name()] = feature
        self._features = new_features

    def add_filtered(self, name, threshold):
        """Builds the filtered version of a feature and stores it without copying."""
        self.get_filtered(name, threshold, store=True, copy=False)

    def get_filtered(self, name, threshold, store=True, copy=True):
        """
        Returns the filtered version of the named feature; stores it if `store` is True.
        """
        self._check_if_present(name)
        new_feature = self._features[name].get_filtered_feature(threshold)
        if store:
            self.set_feature(new_feature, copy=copy, replace=False)
        return new_feature

    def add_counter(self, name):
        """Builds the counter feature of the named feature and stores it without copying."""
        # Used to call get_filtered without its required threshold argument.
        self.get_counter(name, store=True, copy=False)

    def get_counter(self, name, store=True, copy=True):
        """
        Returns the counter feature of the named feature; stores it if `store` is True.
        """
        self._check_if_present(name)
        # The previous call passed an undefined `threshold` and always failed.
        # NOTE(review): assumes get_counter_feature needs no arguments - confirm.
        new_feature = self._features[name].get_counter_feature()
        if store:
            self.set_feature(new_feature, copy=copy, replace=False)
        return new_feature

    def get_loo(self, name, y_train, cv, seed=1234):
        """
        Returns the leave-one-out feature of the named feature (never stored).
        """
        self._check_if_present(name)
        new_feature = self._features[name].get_loo_feature(y_train, cv=cv, seed=seed)
        return new_feature
        
    

test = True
if test:
    # --- fill an empty manager with three features of 14 samples each ---
    manager = CategoricalFeaturesManager(verbose=0)
    features = [
        CategoricalFeature(np.array([0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4]), 'f1'),
        CategoricalFeature(np.array([1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1]), 'f2'),
        CategoricalFeature(np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]), 'f3'),
    ]
    for feature in features:
        manager.set_feature(feature)
        assert feature.shape == (14, )
        assert len(feature) == 14
    fnames = manager.get_list_of_features()
    print('SET UP: {}\n'.format(fnames))

    for fname in fnames:
        print('\t{} in manager = {}'.format(fname, fname in manager))

    print()
    df = manager.assemble_data_frame(fnames)
    print(df)

    # --- remove the features one by one, then put them back ---
    print('\nDELETING AND SETTING FEATURES:')
    for doomed, label in (('f1', 'del f1:'), ('f2', 'del f2:'), ('f3', 'del f3:')):
        manager.del_feature(doomed)
        print(label, manager.get_list_of_features())

    for feature in features:
        manager.set_feature(feature)
        print('set {}:'.format(feature.get_name()), manager.get_list_of_features())

    # --- singles, pairs and the triple, in that order ---
    print('\nCOMBINING FEATURES:')
    for group in (['f1'], ['f2'], ['f3'],
                  ['f1', 'f2'], ['f1', 'f3'], ['f2', 'f3'],
                  ['f1', 'f2', 'f3']):
        new_feature = manager.get_combined_feature(group)
        print(new_feature, new_feature.values.flatten())
    print('all:', manager.get_list_of_features())

    # --- filter every stored feature in place and compare before / after ---
    print('\nFILTERED FEATURES:')
    print('\tbefore filtration')
    print('feature names:', manager.get_list_of_features())
    df = manager.assemble_data_frame()
    print(df)
    thr = 2
    manager.filter_features(threshold=thr)
    print('\tafter filtration')
    print('feature names:', manager.get_list_of_features())
    df = manager.assemble_data_frame()
    print(df)

    # --- the constructor also accepts an iterable of features ---
    manager = CategoricalFeaturesManager(features)

    """print('\nObtaining counters:')
    for name in ['f1', 'f2', 'f3']:
        FStest.add_counter(name)
        feature = FStest.get_feature('CTR_' + name)
        print(feature.name, feature.values)
    print('all:', FStest.get_list_of_features())
    print('cat:', FStest.get_list_of_features('CAT'))
    print('num:', FStest.get_list_of_features('NUM'))
    print('\nAssembling features')
    print(FStest.assemble(['f1', 'f2', 'f3', 'CTR_f3', 'f1+f2', 'f1+f3', 'f2+f3', 'f1+f2+f3', 'f4'], sparse=False))
    print(FStest.assemble(['FA2_f1', 'f2', 'f3', 'f1+f2', 'f1+f3', 'f2+f3'], 
                          {'FA2_f1': ['def', 'ohe'],
                           'f2': ['def', 'ohe'],
                           'f3': ['def', 'ohe']},
                          sparse=False))"""

SET UP: ['f1', 'f2', 'f3']

	f1 in manager = True
	f2 in manager = True
	f3 in manager = True

    f1  f2  f3
0    0   1   0
1    1   1   1
2    0   0   0
3    1   1   1
4    2   1   0
5    0   0   1
6    1   1   0
7    2   1   1
8    3   0   0
9    0   1   1
10   1   1   0
11   2   0   1
12   3   1   0
13   4   1   1

DELETING AND SETTING FEATURES:
del f1: ['f2', 'f3']
del f2: ['f3']
del f3: []
set f1: ['f1']
set f2: ['f1', 'f2']
set f3: ['f1', 'f2', 'f3']

COMBINING FEATURES:
[CategoricalFeature: f1, (14,)] [0 1 0 1 2 0 1 2 3 0 1 2 3 4]
[CategoricalFeature: f2, (14,)] [1 1 0 1 1 0 1 1 0 1 1 0 1 1]
[CategoricalFeature: f3, (14,)] [0 1 0 1 0 1 0 1 0 1 0 1 0 1]
[CategoricalFeature: f1+f2, (14,)] [1 2 0 2 4 0 2 4 6 1 2 3 5 7]
[CategoricalFeature: f1+f3, (14,)] [0 2 0 2 4 1 3 5 6 1 3 5 6 7]
[CategoricalFeature: f2+f3, (14,)] [3 2 0 2 3 1 3 2 0 2 3 1 3 2]
[CategoricalFeature: f1+f2+f3, (14,)] [ 3  5  0  5  8  1  4  7 10  2  4  6  9 11]
all: ['f1', 'f2', 'f3', 'f1+f2', 'f1+f3', 'f2+f3', 'f1

In [45]:
# Show the names currently held by the manager (reads its private `_features` mapping directly).
list(manager._features.keys())

['Fil2_f1',
 'Fil2_f2',
 'Fil2_f3',
 'Fil2_f1+f2',
 'Fil2_f1+f3',
 'Fil2_f2+f3',
 'Fil2_f1+f2+f3']

In [None]:
    """def add_categorized(self, name, bins, right=True):
        self.check_if_present_(name)
        self.check_type_(name, 'NUM')
        new_feature = self.features[name].get_categorized_feature(bins, right)
        self.features[new_feature.name] = new_feature
        self.types[new_feature.name] = 'CAT'
        return new_feature.name"""
    