<a id='toc'></a>
* [1. FeatureBase](#feature_base)
    * [1.2 Tests](#fb_tests)
* [2. NumericalFeature](#numerical_feature)
* [3. AggregatedFeature](#aggregated_feature)
* [4. CategoricalFeature](#categorical_feature)
* [5. CategoricalCombiner](#categorical_combiner)
* [6. FeaturesStorage](#features_storage)

In [1]:
import sys
import numpy as np
import pandas as pd
import scipy

In [2]:
%run feature_base.py

.....
----------------------------------------------------------------------
Ran 5 tests in 0.021s

OK


In [46]:
FEATURE_PREFIXES = \
{'CAT': '',
 'NUM': '',
 'LE' : '',    # LabelEncoded feature
 'OHE': 'Ohe', # OneHotEncoded feature
 'CTR': 'Ctr', # Counter feature
 'LOO': 'Loo', # LeaveOneOut feature
 'FIL': 'Fil'}

<a id='feature_base'></a>
## 1. FeatureBase<sup>[toc](#toc)</sup> <sup>[up](#toc)</sup> <sup>[down](#fb_tests)</sup>

In [50]:
# %load feature_base.py
import numpy as np
import unittest
import numbers
import copy
from scipy.sparse import csr_matrix, csc_matrix
from collections import Counter
from helpers import Checker, Printer
      
class FeatureKernel:
    """
    FeatureKernel - класс, реализующий базовые операции, класса FeatureBase. 
    Данные операции включают в себя: 
        1) проверку корректности значений признака 
        2) вывод сообщений о некорректности значений
        3) предобработку и постобработку признаков
        4) получение характеристик признаков (размера, формата и т.п.)
    """
    def __init__(self, owner):
        self._owner = owner
        self._printers = owner._printers
        self._info_msg = owner._info_msg
        self._error_msg = owner._error_msg
        self._warning_msg = owner._warning_msg
        self._method_msg = owner._method_msg
        
    def _is_numeric(self, values, name=None):
        """
        Возвращает True, если тип признак числовой. Используется только в конструкторе NumericalFeature.
        Аргументы:
            :param values - значения признака (np.ndarray, csr_matrix, csc_matrix)
            :param name   - имя признака (str)
        """
        return isinstance(values.dtype.type(), numbers.Number)
    
    def _check_numeric(self, values, name=None, throw=True):
        """
        Возвращает True, если тип признак числовой. Иначе возвращает False или вызывает исключение
        (в зависимости от параметра throw) .
        Аргументы:
            :param values - значения признака (np.ndarray, csr_matrix, csc_matrix)
            :param name   - имя признака (str)
            :param throw  - если True, то вызывает исключение, если признак не числовой (bool)
        """
        if not self._is_numeric(values):
            if throw:
                error_msg = 'Feature values are not numerical! Their type is {}'.format(values.dtype)
                raise TypeError(self._error_msg(error_msg))
            return False
        return True
    
    def _is_constant(self, values, name=None):
        """
        Проверяет значения признака на константность. 
        Аргументы:
            :param values - значения признака в виде np.ndarray
            :param name   - имя признака (str)
        """
        assert isinstance(values, np.ndarray)
        counter = Counter()
        for v in values:
            counter[v] += 1
            if len(counter) != 1:
                return False  
        return True
    
    def _check_constant(self, values, name=None, throw=True):
        """
        Возвращает True, если значения признака идентичны для всех объетов. Иначе возращает False или 
        вызывает исключение (в зависимости от параметра throw)
        Аргументы:
            :param values - значения признака (np.ndarray, csr_matrix, csc_matrix)
            :param name   - имя признака (str)
            :param throw  - если True, то вызывает исключение, если признак не константный (bool)
        """
        if self._is_constant(values):
            if throw:
                raise ValueError(self._error_msg("Given feature {} is constant".format(name)))
            return False
        return True
    
    def _is_shaped(self, values, name=None):
        self._undefined_method('_is_shapd')
    def _check_shaped(self, values, name=None, throw=True):
        self._undefined_method('_check_shaped')
    
    def _get_length(self):
        self._undefined_method('_get_length')
    def _get_values(self):
        self._undefined_method('_get_values')
    def _get_dense(self):
        self._undefined_method('_get_dense')
    def _get_sparse(self):
        self._undefined_method('_get_sparse')
        
    def _preprocess(self, values, name):
        self._undefined_method('_preprocess')
        
    def _undefined_method(self, method_name):
        error_msg = 'Method "{}" of the abstract class "{}"'\
            'must be redefined in derivative classes.'.format(method_name, type(self).__name__)
        assert False, error_msg

class SparseFeatureKernel(FeatureKernel):
    def __init__(self, owner):
        super().__init__(owner)

    ########################################################################
    def _is_shaped(self, values, name=None):
        return (len(values.shape) == 2) & (values.shape[0] == 1) & (values.shape[1] > 1)
    def _check_shaped(self, values, name=None, throw=True):
        if not self._is_shaped(values, name):
            if throw:
                error_msg = "Given sparse feature vector must have shape of type (1, size), but has {}".format(
                    values.shape)
                raise ValueError(self._error_msg(error_msg))
            return False
        return True
    
    ################
    def _is_constant(self, values, name=None):
        values = values.toarray().flatten()
        return super()._is_constant(values, name)
    
    ########################################################################
    def _preprocess(self, values, name):
        self._printers[self._owner.METHODS](self._method_msg('_preprocess'))
        assert isinstance(values, (csr_matrix, csc_matrix))
        init_shape = values.shape
        if len(init_shape) != 2:
            error_msg = "Feature \"{}\" has shape {} but must have a shape of length 2.".format(
                name, init_shape)
            raise ValueError(self._error_msg(error_msg))
        if (init_shape[0] == 1) & (init_shape[1] > 1):
            new_values = values.tocsr() 
        elif (init_shape[0] > 1) & (init_shape[1] == 1):
            new_values = values.transpose().tocsr()
        else:
            error_msg = "Feature \"{}\" has incorrect shape {}. Must be either (1, n) or (n, 1)".format(
                name, shape, tuple(reversed(shape)))
            raise ValueError(self._error_msg(error_msg))
        info_msg = "Feature \"{}\" is transformed from shape {} to csr_matrix of shape {}".format(
            name, init_shape, new_values.shape)
        self._printers[self._owner.FORMAT_CHANGE_LEVEL](self._info_msg(info_msg))
        info_msg = '_preprocess returns {}'.format(type(new_values))
        self._printers[self._owner.METHODS](self._info_msg(info_msg))
        return new_values
    
    ########################################################################
    def _get_length(self):
        return self._owner._values.shape[1]
    def _get_values(self, sparse=True, *args, **kwargs):
        if sparse:
            return self._owner._values.transpose().tocsc()
        else:
            return self._owner._values.todense().T
    def _get_dense(self):
        values = self._owner._values
        name = self._owner._name
        verbose = self._owner._verbose
        return DenseFeatureBase(values.toarray().flatten(), name, verbose)
    def _get_sparse(self):
        values = self._owner._values
        name = self._owner._name
        verbose = self._owner._verbose
        return SparseFeatureBase(values, name, verbose)


class DenseFeatureKernel(FeatureKernel):
    def __init__(self, owner):
        super().__init__(owner)
        
    ########################################################################
    def _is_shaped(self, values, name=None):
        return len(values.shape) == 1
    def _check_shaped(self, values, name=None, throw=True):
        if not self._is_shaped(values, name):
            if throw:
                raise ValueError(self._error_msg("Given dense feature vector must have shape of type (size, ), but has {}".format(values.shape)))
            return False
        return True
    
    ########################################################################
    def _preprocess(self, values, name):
        self._printers[self._owner.METHODS](type(self).__name__ + '._preprocess')
        assert isinstance(values, (np.ndarray, list))
        values = np.array(values)
        init_shape = values.shape
        if len(init_shape) not in [1, 2]:
            error_msg = "Feature \"{}\" has shape {} but must have a shape of length either 1 or 2.".format(
                name, init_shape)
            raise ValueError(self._error_msg(error_msg))
        if len(init_shape) == 1:
            new_values = values
        elif (init_shape[0] == 1) & (init_shape[1] > 1):
            new_values = values[0]
        elif (init_shape[0] > 1) & (init_shape[1] == 1):
            new_values = values[:, 0]
        else:
            error_msg = "Feature \"{}\" has incorrect shape {}. Must be either (n, ) or (1, n) or (n, 1)".format(
                name, shape, tuple(reversed(shape)))
            raise ValueError(self._error_msg(error_msg))
        info_msg = "Feature \"{}\" is transformed from shape {} to shape {}".format(
            name, init_shape, new_values.shape)
        self._printers[self._owner.FORMAT_CHANGE_LEVEL](self._info_msg(info_msg))
        return new_values
    
    ########################################################################
    def _get_length(self):
        return self._owner._values.shape[0]
    def _get_values(self, sparse=False, *args, **kwargs):
        if sparse:
            return csc_matrix(self._owner._values[:, np.newaxis])
        else:
            return self._owner._values[:, np.newaxis]
    def _get_dense(self):
        values = self._owner._values
        name = self._owner._name
        verbose = self._owner._verbose
        return DenseFeatureBase(values, name, verbose)
    def _get_sparse(self):
        values = csr_matrix(self._owner._values[np.newaxis, :])
        values.eliminate_zeros()
        name = self._owner._name
        verbose = self._owner._verbose
        return SparseFeatureBase(values, name, verbose)
    
    
class FeatureBase(Checker):
    """
    FeatureBase - базовый класс, контейнер для одного признака. 
    Признак хранится в виде либо разряженном, либо в плотном формате.
    Признак обладает своим именем. При проведении преобразований признака, его имя преобразуется.
    
    Уровни печати (значение verbose):
    * FORMAT_CHANGE_LEVEL - уровень, выше которого выводятся в печать сообщения о преобразованиях формата признака при инициализации
    """

    FORMAT_CHANGE_LEVEL = 10
    METHODS = 11
    
    def is_constant(self):
        """
        Возвращает True, если значения признака идентичны для всех объетов. Иначе - False.
        """
        return self._kernel._is_constant(self._values, self._name)

    def is_numeric(self):
        """
        Возвращает True, если признак числовой. Иначе - False.
        """
        return self._kernel._is_numeric(self._values, self._name)
    
    ########################################################################
    def __init__(self, values, name, verbose=0):
        attributes = ['name', 'values', 'shape']
        setters = {}; getters = {}
        for attr in attributes:
            setters[attr] = self.__getattribute__('set_' + attr)
            getters[attr] = self.__getattribute__('get_' + attr)
        super().__setattr__('_setters', setters)
        super().__setattr__('_getters', getters)
        super().__init__()
        self._verbose = verbose
        if isinstance(values, (csr_matrix, csc_matrix)):
            self._sparse = True
            self._kernel = SparseFeatureKernel(self)
        elif isinstance(values, (np.ndarray, list)):
            self._sparse = False
            self._kernel = DenseFeatureKernel(self)
        else:
            raise TypeError('Type of "values" is unacceptable!')
        self.set_name(name)
        self.set_values(values)

    def __str__(self):
        return '[{}: {}, {}]'.format(type(self).__name__, self._name, self._shape)
    
    def __repr__(self):
        return str(self)
        
    def __len__(self):
        return self._kernel._get_length()
        
    ########################################################################
    def __getattr__(self, name):
        if name in self._getters:
            return self._getters[name]()
        raise AttributeError('Attribute "{}" not found!'.format(name))
    def __setattr__(self, name, value):
        if name in self._setters:
            return self._setters[name](value)
        return super().__setattr__(name, value)
    
    def set_name(self, name):
        self._check_type(name, 'feature_name', str)
        self._name = name
    def set_values(self, values):
        """
        Всегда сохраняется копия values.
        """
        self._printers[self.METHODS](self._method_msg('set_values({})'.format(type(values))))
        _values = self._kernel._preprocess(values, self._name)
        self._kernel._check_shaped(_values, self._name, throw=True)
        self._values = _values
        self._shape  = _values.shape
        self._printers[self.METHODS](self._info_msg('set_values setted {}'.format(type(self._values))))
    def set_shape(self, shape):
        assert False, 'Setting "shape" is not allowed'
    def get_name(self):
        return self._name
    def get_values(self, *args, **kwargs):
        return self._kernel._get_values(*args, **kwargs)
    def get_shape(self):
        return self._shape
    
    ########################################################################
    def _get_categorical_name(self, name):
        return FEATURE_PREFIXES['CAT'] + name
    def _get_numerical_name(self, name):
        return FEATURE_PREFIXES['NUM'] + name
    def _get_counter_name(self, name):
        return FEATURE_PREFIXES['CTR'] + name
    def _get_loo_name(self, name):
        return FEATURE_PREFIXES['LOO'] + name
    def _get_filtered_name(self, name, threshold):
        return FEATURE_PREFIXES['FIL'] + '{}_'.format(threshold) + name
    def _get_label_encoded_name(self, name):
        return FEATURE_PREFIXES['LE'] + name
    def _get_ohe_name(self, name):
        return FEATURE_PREFIXES['OHE'] + name

    ########################################################################
    def to_dense(self):
        if self._sparse:
            self._values = self._kernel._get_dense()._values
            self._kernel = DenseFeatureKernel(self)
        assert isinstance(self._values, np.ndarray)
        assert isinstance(self._kernel, DenseFeatureKernel)
    def to_sparse(self):
        if not self._sparse:
            self._values = self._kernel._get_sparse()._values
            self._kernel = SparseFeatureKernel(self)
        assert isinstance(self._values, csr_matrix)
        assert isinstance(self._kernel, SparseFeatureKernel)
    ########################################################################
    def deepcopy(self):
        """
        Функция глубокого копирования объекта. При необходимости копирования должна вызываться ТОЛЬКО она.
        Функция copy.deepcopy выполняет некорректное копирование из-за переопределения операции работы с атрибутами
        в классе FeatureBase/
        """
        return FeatureBase(copy.deepcopy(self._values), self._name, self._verbose)    
    
        
class TestFeatureBase(unittest.TestCase):
    def setUp(self):
        self.values1 = [0, 1, 0, 0, 0, 1, 1.1, 1, 0, 4, 1, 7, 0, 0, 0]
        
    def test_init(self):
        self.dense_values = np.array(self.values1)     # (L, )
        
        self.sparse_values1 = csc_matrix(self.values1) # (1, L)
        self.sparse_values1.eliminate_zeros()          # (1, L)
        
        self.sparse_values2 = csc_matrix(np.array(self.values1)[:, np.newaxis]) # (L, 1)
        self.sparse_values2.eliminate_zeros()          # (L, 1)
        
        self.sparse_values3 = csr_matrix(self.values1) # (1, L)
        self.sparse_values3.eliminate_zeros()          # (1, L)
        
        self.sparse_values4 = csr_matrix(np.array(self.values1)[:, np.newaxis]) # (1, L)
        self.sparse_values4.eliminate_zeros()          # (1, L)
        
        feature  = FeatureBase(self.dense_values, name='f')
        feature1 = FeatureBase(self.sparse_values1, name='f1')
        feature2 = FeatureBase(self.sparse_values2, name='f2')
        feature3 = FeatureBase(self.sparse_values3, name='f3')
        feature4 = FeatureBase(self.sparse_values4, name='f4')
        
        self.assertTrue(feature.shape == (15,))
        self.assertTrue(feature1.shape == (1, 15))
        self.assertTrue(feature2.shape == (1, 15))
        self.assertTrue(feature3.shape == (1, 15))
        self.assertTrue(feature4.shape == (1, 15))
        
        self.assertTrue(isinstance(feature._values, np.ndarray))
        self.assertTrue(isinstance(feature1._values, csr_matrix))
        self.assertTrue(isinstance(feature2._values, csr_matrix))
        self.assertTrue(isinstance(feature3._values, csr_matrix))
        self.assertTrue(isinstance(feature4._values, csr_matrix))
        
        self.assertTrue(isinstance(feature.values, np.ndarray))
        self.assertTrue(isinstance(feature1.values, csc_matrix))
        self.assertTrue(isinstance(feature2.values, csc_matrix))
        self.assertTrue(isinstance(feature3.values, csc_matrix))
        self.assertTrue(isinstance(feature4.values, csc_matrix))
        
        self.assertEqual(feature.name, 'f')
        self.assertEqual(feature1.name, 'f1')
        self.assertEqual(feature2.name, 'f2')
        self.assertEqual(feature3.name, 'f3')
        self.assertEqual(feature4.name, 'f4')
        
        
        self.assertEqual(len(feature), 15)
        self.assertEqual(len(feature1), 15)
        self.assertEqual(len(feature2), 15)
        self.assertEqual(len(feature3), 15)
        self.assertEqual(len(feature4), 15)

    def test_get_values(self):
        self.dense_values = np.array(self.values1)
        self.sparse_values1 = csc_matrix(self.values1)
        self.sparse_values1.eliminate_zeros()
        self.sparse_values2 = csc_matrix(np.array(self.values1)[:, np.newaxis])
        self.sparse_values2.eliminate_zeros()
        self.sparse_values3 = csr_matrix(self.values1)
        self.sparse_values3.eliminate_zeros()
        self.sparse_values4 = csr_matrix(np.array(self.values1)[:, np.newaxis])
        self.sparse_values4.eliminate_zeros()
        
        feature = FeatureBase(self.dense_values, name='f')
        feature1 = FeatureBase(self.sparse_values1, name='f1')
        feature2 = FeatureBase(self.sparse_values2, name='f2')
        feature3 = FeatureBase(self.sparse_values3, name='f3')
        feature4 = FeatureBase(self.sparse_values4, name='f4')
        
        features = [feature, feature1, feature2, feature3, feature4]
        for sparse in [False, True]:
            for feature in features:
                returned_values = feature.get_values(sparse=sparse)
                if sparse:
                    self.assertTrue(isinstance(returned_values, csc_matrix))
                    self.assertTrue((returned_values.shape[0] > 1) & (returned_values.shape[1] == 1))
                    returned_values = returned_values.toarray()
                self.assertTrue(np.allclose(self.dense_values[:, np.newaxis], returned_values))
                
    def test_check_shaped(self):
        with self.assertRaises(ValueError):
            feature = FeatureBase(np.array(self.values1)[:, np.newaxis, np.newaxis], name='f', verbose=1)
      
    def test_check_constant(self):
        values = np.array([1, 1, 1, 1, 1, 1])
        with self.assertRaises(ValueError):
            feature = FeatureBase(values, name='feature')
            feature._kernel._check_constant(feature._values)
             
        values = csc_matrix(np.array([2, 2, 2, 2, 2, 2])) 
        with self.assertRaises(ValueError):
            feature = FeatureBase(values, name='feature')
            feature._kernel._check_constant(feature._values)      
        

    def test_check_numeric(self):
        f_values = np.array([1, 'sd', 34, True, -1])
        with self.assertRaises(TypeError):
            feature_base = FeatureBase(f_values, name='feature')
            feature_base._check_numeric(feature_base._values)
            
if __name__ == '__main__':
    unittest.main()

E
ERROR: /run/user/1000/jupyter/kernel-9c4ca024-daee-4b4b-bb3e-43a3ab305ed1 (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/run/user/1000/jupyter/kernel-9c4ca024-daee-4b4b-bb3e-43a3ab305ed1'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


<a id='fb_tests'></a>
### 1.2 Tests<sup>[toc](#toc)</sup>

In [48]:
for sparse in [True, False]:
    values = [0, 1, 0, 0, 0, 1, 1.1, 1, 0, 4, 1, 7, 0, 0, 0]
    values = np.array(values)
    if sparse:
        values = csc_matrix(values)
        values.eliminate_zeros()
    name = 'f'
    feature = FeatureBase(values, name)

    fvalues = feature.values
    _fvalues = feature._values
    if sparse:
        print('SPARSE =', sparse)
        print('str(feature) =', feature)
        print('feature.values.shape = {}\nfeature.values = {}'.format(fvalues.shape, fvalues.tocsr()))
        print('feature._values.shape = {}\nfeature._values = {}'.format(_fvalues.shape, _fvalues.tocsr()))
        print('[Dense ]:', feature.get_values(sparse=False).flatten())
        print('[Sparse]:', feature.get_values(sparse=True).tocsr())
    else:
        print('\n\n\nSPARSE =', sparse)
        print('str(feature) =', feature)
        print('feature.values.shape = {}\nfeature.values = {}'.format(fvalues.shape, fvalues.flatten()))
        print('feature._values.shape = {}\nfeature._values = {}'.format(_fvalues.shape, _fvalues.flatten()))
        print('[Dense ]:', feature.get_values(sparse=False).flatten())
        print('[Sparse]:', feature.get_values(sparse=True).tocsr())
    print('feature.shape = {}, feature.name = {}'.format(feature.shape, feature.name))
    print('is_numeric = ', feature.is_numeric())

SPARSE = True
str(feature) = [FeatureBase: f, (1, 15)]
feature.values.shape = (15, 1)
feature.values =   (1, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.1
  (7, 0)	1.0
  (9, 0)	4.0
  (10, 0)	1.0
  (11, 0)	7.0
feature._values.shape = (1, 15)
feature._values =   (0, 1)	1.0
  (0, 5)	1.0
  (0, 6)	1.1
  (0, 7)	1.0
  (0, 9)	4.0
  (0, 10)	1.0
  (0, 11)	7.0
[Dense ]: [[ 0.   1.   0.   0.   0.   1.   1.1  1.   0.   4.   1.   7.   0.   0.   0. ]]
[Sparse]:   (1, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.1
  (7, 0)	1.0
  (9, 0)	4.0
  (10, 0)	1.0
  (11, 0)	7.0
feature.shape = (1, 15), feature.name = f
is_numeric =  True



SPARSE = False
str(feature) = [FeatureBase: f, (15,)]
feature.values.shape = (15, 1)
feature.values = [ 0.   1.   0.   0.   0.   1.   1.1  1.   0.   4.   1.   7.   0.   0.   0. ]
feature._values.shape = (15,)
feature._values = [ 0.   1.   0.   0.   0.   1.   1.1  1.   0.   4.   1.   7.   0.   0.   0. ]
[Dense ]: [ 0.   1.   0.   0.   0.   1.   1.1  1.   0.   4.   1.   7.   0.   0.   0. ]
[Sparse]:   (1

<a id='numerical_feature'></a>
## 2. NumericalFeature<sup>[toc](#toc)</sup>

In [51]:
# %load numerical_feature.py
# from feature_base import *
import numpy as np
import unittest
import copy
import numbers
from scipy.sparse import csc_matrix, csr_matrix

class NumericalFeature(FeatureBase):
    def __init__(self, values, name, verbose=0):
        super().__init__(values, name, verbose)
        self._name = self._get_numerical_name(name)
        self._kernel._check_numeric(self._values, self._name, True)

    def get_categorical_feature(self, bins, right=True, include_lowest=False):
        """
        Создает категориальный признак из числового.
        TODO include_lowest
        Аргументы:
            :param bins - 
            :param right -  
            :param include_lowest - на данный момент не используется
        """
        if self._sparse:
            values = self._values.toarray().flatten()
        else:
            values = self._values
        cat_values = np.array(pd.cut(values, bins, right=right))
        cat_name = self._get_categorical_name(self._name)
        return CategorialFeature(cat_values, cat_name)


class TestNumericalFeature(unittest.TestCase):
    def setUp(self):
        self.dense_values1 = [0, 4, 5.5, 9, 3.7, 0, 1, 0, 0, 0, 0]
        self.df1 = NumericalFeature(self.dense_values1, name='df1')
        self.sparse_values1 = csc_matrix(self.dense_values1)
        self.sparse_values1.eliminate_zeros()
        self.sf1 = NumericalFeature(self.sparse_values1, name='sf1')
        
        self.dense_values2 = np.array([1, 2, 3, 4, 5, 6, 7]).reshape((-1, 1))
        self.df2 = NumericalFeature(self.dense_values2, name='df2')
        self.sparse_values2 = csc_matrix(self.dense_values2)
        self.sparse_values2.eliminate_zeros()
        self.sf2 = NumericalFeature(self.sparse_values2, name='sf2')

    def test_init_shape(self):
        self.assertTrue(self.df1.shape == (len(self.dense_values1),))
        self.assertTrue(self.df2.shape == (self.dense_values2.shape[0],))
        self.assertTrue(self.sf1.shape == (1, len(self.dense_values1)))
        self.assertTrue(self.sf2.shape == (1, self.dense_values2.shape[0]))
        
    def test_init_array(self):
        self.assertTrue(isinstance(self.df1._values, np.ndarray))
        self.assertTrue(isinstance(self.df2._values, np.ndarray))
        self.assertTrue(isinstance(self.sf1._values, csr_matrix))
        self.assertTrue(isinstance(self.sf2._values, csr_matrix))
      
    def test_init_shape_error(self):
        with self.assertRaises(ValueError):
            feature = NumericalFeature(np.array(self.dense_values1)[:, None, None], name='f2')
            
    def test_init_numeric_error(self):
        feature = NumericalFeature(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), name='feature2')
        feature = NumericalFeature(np.array([1.4, 8.2, 82.2]), name='feature3')
        with self.assertRaises(TypeError):
            feature = NumericalFeature(np.array(list('jskdfjdskfjsd')), name='feature4')
            
    def test_categorical(self): # TODO
        values = [0.3, 0.5, 0.8, 0.2, 0.6, 0.1]
        name = 'feature2'
        feature = NumericalFeature(values, name)
        # bins = 
        #cat_feature = 
    
if __name__ == '__main__':
    unittest.main()

E
ERROR: /run/user/1000/jupyter/kernel-9c4ca024-daee-4b4b-bb3e-43a3ab305ed1 (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/run/user/1000/jupyter/kernel-9c4ca024-daee-4b4b-bb3e-43a3ab305ed1'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


<a id='aggregated_feature'></a>
## 3. AggregatedFeature<sup>[toc](#toc)</sup> <sup>[down](#categorical_feature)</sup>

In [52]:
class AggregatedFeature(Checker):
    DELETE_FEATURE = 4
    """
    Позволяет хранить значения нескольких признаков, например, OHE представления категориальных признаков.
    """
    def __init__(self, features, name, copy=True, verbose=0):
        """
        Аргументы:
            :param features - список объектов FeatureBase
            :param name     - имя агрегированного признака
            :param exclude_const - исключить константные признаки из множества?
            :param copy     - если True, то каждый каждый признак будет скопирован
            :param verbose  - уровень печати (nonnegative int)
        """
        
        self.set_features(features, name)
        self._verbose = verbose
        
    def set_features(self, features, name, copy=True):
        self._check_features(features, name)
        self._name = name
        self._feature_names = [feature._name for feature in features]
        if copy:
            self._features = {feature._name: feature.deepcopy() for feature in features}
        else:
            self._features = {feature._name: feature for feature in features}
        
    def _check_features(self, features, name):
        if not isinstance(features, (np.ndarray, list)):
            raise TypeError('Wrong format of "features" with name "{}" for "{}".'.format(name, type(self).__name__))
        if not all([isinstance(feature, FeatureBase) for feature in features]):
            raise TypeError('One of subfeatures of feature "{}" is not an object of FeatureBase.'.format(name))
        lengths = [len(feature) for feature in features]
        if min(lengths) != max(lengths):
            raise ValueError('Provided features with name "{}" have different lengths'.format(name))
        if min(lengths) == 0:
            raise ValueError('Features with name "{}" have zero length. Must have positive length.'.format(name))

    def exclude_constant(self):
        """
        Исключает константные подпризнаки из рассмотрения.
        """
        to_delete = []
        for feature_name in self._features:
            if self._features[feature_name].is_constant():
                to_delete.append(feature_name)
        for feature_name in to_delete:
            self._printers[self.DELETE_FEATURE]('Deleting constant feature "{}"'.format(feature_name))
            del self._features[feature_name]
        self._feature_names = [feature_name for feature_name in self._feature_names 
                               if feature_name in self._features]

    def is_constant(self):
        if len(self._features) == 0: # In case if all feature are excluded due to constant values
            return True
        return all([feature.is_constant() for feature in self._features.values()])
            
    def get_values(self, feature_names=None, sparse=False, as_dataframe=False, **kwargs):
        if feature_names is None:
            feature_names = self._feature_names
        features = [self._features[feature_name] for feature_name in feature_names 
                    if feature_name in self._features.keys()]
        # TODO проверки
        X = []
        for feature in features:
            X.append(feature.get_values(sparse=sparse))
        if len(X) == 0: # Если вдруг все пусто
            raise ValueError("All values are constant. Senseless feature!")
        if sparse:
            X = scipy.sparse.hstack(X)
        else:
            X = np.concatenate(X, axis=1)
            if as_dataframe:
                X = pd.DataFrame(X, columns=feature_names)
        return X
    
    def __repr__(self):
        s = 'AggregatedFeature['
        for feature_name in [feature_name for feature_name in self._feature_names 
                             if feature_name in self._features]:
            feature = self._features[feature_name]
            s += str(feature)
        s += ']'
        return s
    
n_features = 5
features = []
feature_names = []
size = 10
sparse = True; as_dataframe=True
for n_feature in range(n_features):
    values = np.random.randint(low=0, high=2, size=size)
    features.append(NumericalFeature(values, 'F' + str(n_feature), verbose=0))
    feature_names.append(features[-1].get_name())
aggr_feature = AggregatedFeature(features, 'AGGR', copy=False)
print(aggr_feature)
values = aggr_feature.get_values(sparse=sparse, as_dataframe=as_dataframe)
print(values)
if sparse:
    print(values.todense())
values = aggr_feature.get_values(feature_names=feature_names[1:4], sparse=sparse, as_dataframe=as_dataframe) 
print(values)
if sparse:
    print(values.todense())

AggregatedFeature[[FeatureBase: F0, (10,)][FeatureBase: F1, (10,)][FeatureBase: F2, (10,)][FeatureBase: F3, (10,)][FeatureBase: F4, (10,)]]
  (0, 0)	1
  (1, 0)	1
  (3, 0)	1
  (5, 0)	1
  (6, 0)	1
  (8, 0)	1
  (9, 0)	1
  (0, 1)	1
  (1, 1)	1
  (2, 1)	1
  (3, 1)	1
  (4, 1)	1
  (8, 1)	1
  (2, 2)	1
  (3, 2)	1
  (8, 2)	1
  (5, 3)	1
  (7, 3)	1
  (8, 3)	1
  (9, 3)	1
  (2, 4)	1
  (3, 4)	1
  (6, 4)	1
  (7, 4)	1
  (8, 4)	1
[[1 1 0 0 0]
 [1 1 0 0 0]
 [0 1 1 0 1]
 [1 1 1 0 1]
 [0 1 0 0 0]
 [1 0 0 1 0]
 [1 0 0 0 1]
 [0 0 0 1 1]
 [1 1 1 1 1]
 [1 0 0 1 0]]
  (0, 0)	1
  (1, 0)	1
  (2, 0)	1
  (3, 0)	1
  (4, 0)	1
  (8, 0)	1
  (2, 1)	1
  (3, 1)	1
  (8, 1)	1
  (5, 2)	1
  (7, 2)	1
  (8, 2)	1
  (9, 2)	1
[[1 0 0]
 [1 0 0]
 [1 1 0]
 [1 1 0]
 [1 0 0]
 [0 0 1]
 [0 0 0]
 [0 0 1]
 [1 1 1]
 [0 0 1]]


<a id='categorical_feature'></a>
## 4. CategoricalFeature<sup>[toc](#toc)</sup>

In [91]:
# %load categorical_feature.py
# from feature_base import *
import numpy as np
import unittest
import copy
import numbers
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from itertools import product, chain

class CategoricalFeature(FeatureBase):
    """
    Класс для хранения категориальных признаков. На данный момент доступна только реализация с 
    dense хранением данных. 
    """
    ################################################################################### 

    CAT_FEATURE_INIT = 8
    OHE = 9
    def _is_label_encoded(self, values, name=None):
        """
        Возвращает True, если значения признака label encoded. Иначе возвращает False или вызывает исключение
        (в зависимости от параметра throw).
        Аргументы:
            :param values - категориальные значения (np.ndarray)
            :param name - имя категориального признака (str)
        """
        if not self.is_numeric():
            return False
        labels = sorted(list(set(values)))
        prev_value = labels[0]
        if prev_value != 0:
            return False
        for value in labels[1:]:
            if value != prev_value + 1:
                return False
            prev_value = value
        return True
    
    def is_label_encoded(self):
        return self._is_label_encoded(self._values, self._name)

    def _check_label_encoded(self, values, name, throw=True):
        """
        Возвращает True, если значения признака label encoded. Иначе возвращает False или вызывает исключение
        (в зависимости от параметра throw).
        Аргументы:
            :param values - значения признака (np.ndarray)
            :param name   - имя категориального признака (str)
            :param throw  - вызывать исключение? (bool)
        """
        if not self._is_label_encoded(values):
            if throw: 
                raise ValueError(self._error_msg('Feature "{}" is not label-encoded'.format(name)))
            return False
        return True
            
    def _check_cat2label(self, values, name, cat2label, throw=True):
        """
        Проверяет, что преобразование категорий в метки корректно. Возвращает True в случае корректности.
        Иначе возвращает False или вызывает исключение (в зависимости от параметра throw).
        Аргументы:
            :param values    - значения признака (np.ndarray)
            :param name      - имя признака (cat)
            :param cat2label - преобразование в метки (dict)
            :param throw     - вызывать исключение? (bool)
        """
        if not set(values) == set(cat2label.values()):
            if throw: 
                print(set(values), set(cat2label.values()))
                raise ValueError(self._error_msg('Num of values != number of labels for feature "{}"'.format(name)))
            else: 
                return False
        if not len(set(cat2label.keys())) == len(set(cat2label.values())):
            if throw: 
                raise ValueError(self._error_msg('There is no one-to-one correspondance in cat2label for feature "{}"'.format(name)))
            else: 
                return False
        return True
    
    ###################################################################################
    def deepcopy(self):
        new_feature = CategoricalFeature(copy.deepcopy(self._values), self._name, verbose=self._verbose)
        new_feature.set_cat2label(self._cat2label)
        return new_feature
        
    def __repr__(self):
        return type(self).__name__ + '({}; {})'.format(self.name, self.values.shape)
    
    def __init__(self, values, name, cat2label=None, verbose=0):
        """
        По завершении работы конструктора признаки оказываются закодированы метками от 1 до N, где
        N - число различных значений признака.
        Аргументы:
            :param values - значения категориальной переменной (np.ndarray, list)
            :param name   - имя категориальной переменной (str)
            :param cat2label - mapping для преобразования категорий в метки (dict)
        """
        assert isinstance(values, (np.ndarray, list))
        super().__init__(values, name, verbose)
        self._name = self._get_categorical_name(name)
        msg_init = self._info_msg('__init__({})'.format(name))
        
        self._cat2label = None
        self._label2cat = None
        if cat2label is not None:
            self._printers[self.CAT_FEATURE_INIT](msg_init + ': applying mapping "cat2label" to values')
            self._values = np.array(list(map(lambda cat: cat2label[cat], self._values)))
            self._check_cat2label(self._values, self._name, cat2label)
            self._cat2label = copy.deepcopy(cat2label)
            self._label2cat = {label:cat for cat, label in cat2label.items()}
            
        self._properties = {}
        self._properties['is_numeric'] = self.is_numeric()
        self._properties['is_label_encoded'] = self.is_label_encoded()
        self._properties['is_constant'] = self.is_constant()
        if self._properties['is_label_encoded']:
            self._printers[self.CAT_FEATURE_INIT](msg_init + ': feature "{}" is already label encoded'.format(name))
        else:
            self._printers[self.CAT_FEATURE_INIT](msg_init + ': label encoding feature "{}"'.format(name))
        self._label_encode()
  
        # These values are used for filtering rare values
        self._threshold = None
        self._unique_label = None
    
        assert self._properties['is_label_encoded'], 'By the end of constructor feature "{}" is not label encoded. Something is wrong.'.format(self.name)
        assert self._properties['is_numeric'], 'By the end of the constructor feature "{}" is not numeric. Something is wrong'.format(self.name)
        
    ##################################################################################
    def set_label2cat(self, label2cat=None):
        if label2cat is None:
            self._label2cat = None
            self._cat2label = None
        else:
            cat2label = {cat:label for label, cat in label2cat.items()}
            self._check_cat2label(self._values, self._name, cat2label, True)
            self._label2cat = copy.deepcopy(label2cat)
            self._cat2label = cat2label
            
    def set_cat2label(self, cat2label=None):
        """
        Подразумевает, что сейчас в self._values хранятся метки
        """
        if cat2label is None:
            self._cat2label = None
            self._label2cat = None
        else:
            self._check_cat2label(self._values, self._name, cat2label, True)
            self._cat2label = copy.deepcopy(cat2label)
            self._label2cat = {label:cat for cat, label in cat2label.items()}
        
    def get_cat_values(self):
        """
        Возвращает признаки в виде изначальных категорий, а не в LE-закодированном виде, в котором 
        они хранятся внутри класса CategoricalFeature.
        """
        if self._label2cat is None:
            # Такое возможно только если признак изначально был передан в закодированном виде
            assert self._properties['is_label_encoded'], 'Expected encoded feature.'
            return np.array(self._values)
        return np.array(list(map(lambda label: self._label2cat[label], self._values)))
        
    ##################################################################################
        
    def _filter_feature(self, threshold):
        """
        Отфильтровывает те категории, которые встречаются не более threshold раз. Заменяет их на новую 
        категорию. Данная категория будет иметь максимальное значение метки. Применение данной функции 
        ведет к преобразованию имени признака: добавляется приставка FIL_
        
        Аргументы:
            :param threshold - если число появлений категории не превосходит threshold, 
                                то она отсеивается (int, float)
        """
        
        # Checking if the feature is label encoded
        if not self._properties['is_label_encoded']:
            raise ValueError('Cannot filter feature "{}" as it is not label encoded.'.format(self.name))
        # Even if the filtration does not change feature values, we change its name and threshold parameters
        self._name = self._get_filtered_name(self._name, threshold)
        self._threshold = threshold
        
        # Checking if there are rare values present in the feature
        counts = Counter(self._values)
        for label, n_occurences in counts.items():
            if n_occurences <= threshold:
                self._unique_label = self._values.max() + 1
                self._properties['is_label_encoded'] = False
                break
        if self._unique_label is None: 
            # There are no labels which occur less or equal to threshold times
            return
        
        # Some features occur less or equal threshold times. Let us find them
        rare_labels = set()
        rare_categories = set()
        # Changing rare labels to the chosen unique_label
        for n, label in enumerate(self._values):
            if counts[label] <= threshold:
                if self._cat2label is not None:
                    rare_labels.add(label)
                    rare_categories.add(self._label2cat[label])
                self._values[n] = self._unique_label # setting rare label to new value

        # Forming new categories names
        if self._cat2label is not None:    
            if len(rare_categories) > 1:
                new_cat = '(' + '|'.join(sorted(list(rare_categories))) + ')'
            elif len(rare_categories) == 1:
                new_cat = list(rare_categories)[0]
            else:
                assert False, '"rare_categories" must not be empty at this point. Something is wrong.'

            for label in rare_labels:
                del self._label2cat[label]
            self._label2cat[self._unique_label] = new_cat
            for cat in rare_categories:
                del self._cat2label[cat]
            self._cat2label[new_cat] = self._unique_label
            
        self._properties['is_constant'] = self.is_constant()
        self._label_encode()
        assert self._properties['is_label_encoded']
        assert self._properties['is_numeric']
        
    def get_filtered_feature(self, threshold):
        """ 
        Возвращает признак, полученный из данного фильтрацией категорий по порогу threshold: 
        все категории, встречающиеся не чаще чем threshold, отфильтровываются функцией _filter_feature.
        Все отфильтрованные категории становятся новой категорией.
       
        Аргументы:
            :param - если число появлений категории не превосходит threshold, то она отсеивается (int, float)
        """
        new_feature = self.deepcopy()
        new_feature._filter_feature(threshold)
        return new_feature
    
    def get_counter_feature(self):
        """
        Возвращает признак NumericalFeature, равный числу появления каждой из категорий.
        """
        counts = Counter(self._values)
        new_values = np.zeros_like(self._values)
        for n, value in enumerate(self._values):
            new_values[n] = counts[value]
        new_name = self._get_counter_name(self._name)
        return NumericalFeature(new_values, new_name)

    def get_loo_feature(self, Y_train, cv, alpha=0.01, seed=1234, scale=0.01):
        """
        Предполагает, что первые len(Y_train) примеров принадлежат обучающей выборке
        """
        assert isinstance(Y_train, (np.ndarray, list))
        assert len(Y_train) <= len(self._values)
        train_size = len(Y_train)
        test_size = len(self._values) - train_size
        
        np.random.seed(seed)
        X_train = self._values[:train_size]
        mean_y = np.mean(Y_train)
        all_labels = set(self._values)
        X_new_train = np.zeros(len(X_train))

        for n_split, (train_indices, test_indices) in enumerate(cv.split(X_train, Y_train)):
            x_train, y_train = X_train[train_indices], Y_train[train_indices]
            x_test, y_test = X_train[test_indices], Y_train[test_indices]
            for label in all_labels:
                N_all = x_train.shape[0]
                train_mask = x_train == label
                N_label = np.sum(train_mask)
                print('n_split = {}, label = {}, den = {}'.format(n_split, label, N_label + alpha * N_all))
                X_new_train[test_indices[x_test == label]] = \
                    (np.sum(y_train[train_mask]) + alpha * mean_y * N_all) / (max(N_label, 1) + alpha * N_all)
        if scale > 0:
            multipliers = np.random.normal(loc=1.0, scale=scale, size=len(self._values))
        else:
            multipliers = np.ones(len(self._values))
        if test_size > 0:
            X_test = self._values[train_size:]
            X_new_test = np.zeros(test_size)
            for label in all_labels:
                train_mask = X_train == label
                N_all = train_size
                N_label = np.sum(train_mask)
                X_new_test[X_test == label] = (np.sum(Y_train[train_mask]) +
                                               alpha * mean_y * train_size) / (max(N_label, 1) + alpha * N_all)

            X_new = np.concatenate([X_new_train, X_new_test]) * multipliers
        else:
            X_new = X_new_train * multipliers
        new_name = self._get_loo_name(self._name)
        return NumericalFeature(X_new, new_name)
        
    ############################################################
    ##                       Кодировщики                      ##
    ############################################################
    def _label_encode(self):
        """
        Выполняет label-кодирование признака.
        """
        if self._properties['is_label_encoded']:
            if len(FEATURE_PREFIXES['LE']) > 0:
                if not self._name.startswith(FEATURE_PREFIXES['LE']):
                    self.name = self._get_label_encoded_name(self._name)
            return
        
        label_encoder = LabelEncoder()
        self._values = label_encoder.fit_transform(self._values)
        classes = label_encoder.classes_
        old_label2new_label = {old_label:new_label for new_label, old_label in enumerate(classes)}
        new_label2old_label = {new_label:old_label for new_label, old_label in enumerate(classes)}

        self._name = self._get_label_encoded_name(self.name)
        self._properties['is_label_encoded'] = self.is_label_encoded()
        self._properties['is_numeric'] = self.is_numeric()
        self._properties['is_constant'] = self.is_constant()
        
        if self._unique_label is not None:
            # This placed can be reached when _label_encode() is invoked from _filter_feature()
            self._unique_label = old_label2new_label[self._unique_label]
            assert self._unique_label == len(old_label2new_label) - 1
            assert (FEATURE_PREFIXES['FIL'] + '{}_'.format(self._threshold)) in self._name
            
        if self._label2cat is None:
            self._cat2label = old_label2new_label
            self._label2cat = new_label2old_label
        else:
            new_label2cat = {}
            for old_label in self._label2cat:
                new_label = old_label2new_label[old_label]
                new_label2cat[new_label] = self._label2cat[old_label]
            cat2new_label = {cat:new_label for new_label, cat in new_label2cat.items()}
            self._cat2label = cat2new_label
            self._label2cat = new_label2cat
            
        assert self._properties['is_label_encoded']
        assert self._properties['is_numeric']

    def get_le_feature(self):
        """
        Возвращает LE-закодированный признак, полученный на основе данного. В данной реализации CategoricalFeature
        поддерживается инваринат: внутреннее состояние признака всегда LE-закодированное. Поэтому вызов
        _label_encode() в реализации функции по сути бесполезен. Возможно что-то измениться в будущих версиях.
        """
        new_feature = self.deepcopy()
        new_feature._label_encode()
        return new_feature
    
    def get_ohe_feature(self, sparse=True, omit_uniques=False):
        """
        Аргументы:
            :param sparse       - вернуть sparse или dense представление? (bool)
            :param omit_uniques - если True, то отфильтрованная категория не войдет в состав OHE-признака (bool)
        """
        assert self._properties['is_label_encoded']
        assert self._properties['is_numeric']
        msg_base = self._method_msg('get_ohe_feature(): ')
        
        if (not omit_uniques) or (self._unique_label is None):
            unique_label = -1
        else:
            unique_label = self._unique_label
        
        ohe_name = self._get_ohe_name(self._name)
        counter = Counter(self._values)
        
        if self._properties['is_constant']:
            # No sense of OHE for constant feature
            assert np.sum(self._values) == 0
            assert len(self._cat2label) == 1
            assert list(self._cat2label.values())[0] == 0
            self._printers[self.OHE](msg_base + 'OHE of constant feature "{}".'.format(self._name))
            return NumericalFeature(self._values, ohe_name)
        
        if (len(counter) == 2):
            # In case of binary feature one column of OHE representation can be omitted
            assert set(self._cat2label.values()) == set([0, 1])
            self._printers[self.OHE](msg_base + 'OHE senseless for binary feature "{}".'.format(self._name))
            return NumericalFeature(self._values, ohe_name)
        
            # На данный момент непонятно, почему при unique_label >= 0 возвращали константу
            """if unique_label >= 0:
                assert unique_label == 1
                self._printers[self.OHE](msg_base + 'omiting unique label for "{}" turns it constant.'.format(self._name))
                return NumericalFeature(np.zeros(len(self._values)), ohe_name)
            else:
                self._printers[self.OHE](msg_base + 'OHE senseless for binary feature "{}".'.format(self._name))
                return NumericalFeature(self._values, ohe_name)"""
        
        ohe_values = OneHotEncoder(sparse=sparse).fit_transform(self._values[:, np.newaxis])
        if sparse:
            ohe_values = ohe_values.tocsc()
        if unique_label >= 0:
            assert unique_label == len(counter) - 1
            mask = (self._values == unique_label)
            if sparse:
                last_column = ohe_values[:, unique_label].toarray().flatten()
            else:
                last_column = ohe_values[:, unique_label]
            assert np.all(last_column == mask), 'Last column of ohe feature must correspond to unique_label.'
            ohe_values = ohe_values[:, :unique_label]       
        
        feature_names = []
        feature_values = []
        for label in sorted(self._label2cat.keys()):
            if label != unique_label:
                feature_names.append(self._label2cat[label])
                feature_values.append(ohe_values[:, label])
        features = [NumericalFeature(fvalues, fname) for fvalues, fname in zip(feature_values, feature_names)]
        return AggregatedFeature(features, ohe_name, verbose=self._verbose, copy=False)
        
    def get_properties(self):
        return copy.deepcopy(self._properties)
        
def print_columns(*args):
    all_labels = []
    all_values = []
    v_length = 0
    m_length = 0
    for label, values in args:
        m_length = max(m_length, len(label))
        all_labels.append(label)
        all_values.append(values)
        v_length = max(v_length, max([len(str(v)) for v in values]))
    for label, values in args:
        s = []
        if m_length > 0:
            s.append(label.ljust(m_length) + ':')
        for v in values:
            s.append(str(v).ljust(v_length))
        print(' '.join(s))
    
        
name = 'f'
cat_values = ['A', 'A', 'B', 'A', 'B', 'C', 'A', 'B', 'C', 'D', 'A', 'B', 'C', 'D', 'E']
values     = [0,   0,    1,   0,   1,   2,   0,   1,   2,   3,   0,   1,   2,   3,   4]
cat2label  = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
f = CategoricalFeature(cat_values, name, cat2label, verbose=0)

test_initial = False
test_counter = False
test_filtered = False
test_ohe = False
test_ohe_filtered = False
test_loo = False

if test_initial:
    print('INITIAL FEATURE:')
    print('True values    : ', values)
    print('Obtained values: ', list(f.get_values().flatten()))
    print('\nTrue CAT values: ', cat_values)
    print('Obtained CATs  : ', list(f.get_cat_values()))

if test_counter:
    print('\n\nCOUNTER FEATURE')
    print('Initial feature: ', f)
    counter_f = f.get_counter_feature()
    print('Counter feature: ', counter_f)
    print(counter_f.get_values().flatten())
    print('Values of initial and counter features:')
    args = [('initial', f.get_values().flatten()), ('counter', counter_f.get_values().flatten())]
    print_columns(*args)

if test_filtered:
    print('\n\nFILTERED FEATURES')
    ffs = {n:f.get_filtered_feature(n) for n in range(6)}
    for n in range(6):
        fil_feature = ffs[n].get_values().flatten()
        cat_feature = ffs[n].get_cat_values().flatten()
        ctr_feature = ffs[n].get_counter_feature().get_values().flatten()
        print('fil_feature props: ', ffs[n].get_properties())
        print('fil feature name: ', ffs[n])
        print('ctr feature name: ', ffs[n].get_counter_feature())
        args = [('fil_feature', fil_feature), ('cat_feature', cat_feature), ('ctr_feature', ctr_feature)]
        print_columns(*args)
        print('\n\n')

if test_ohe:   
    print('\n\nOHE FEATURES')
    ohe_feature = f.get_ohe_feature()
    a = f.get_ohe_feature(sparse=False).get_values(sparse=True).toarray()
    b = f.get_ohe_feature(sparse=False).get_values(sparse=False)
    c = f.get_ohe_feature(sparse=True).get_values(sparse=False)
    d = f.get_ohe_feature(sparse=True).get_values(sparse=True).toarray()
    assert np.allclose(a, b)
    assert np.allclose(b, c)
    assert np.allclose(c, d)
    assert np.allclose(d, a)
    print('Initial feature:', ohe_feature)
    print('\tvalues:\n', f.get_values().flatten())
    print('OHE feature:', ohe_feature)
    print('\tOHE values:\n', a)

if test_ohe_filtered:
    print('\n\nOHE FEATURES + FILTRATION')
    for threshold, omit_uniques in product([1, 2, 3, 4, 5], [False, True]):
        ff = f.get_filtered_feature(threshold=threshold)
        print('OHE feature with omit_uniques = {} and threshold = {}'.format(omit_uniques, threshold))
        print('Initial  feature name:', f)
        print('Filtered feature name:', ff)
        print('Initial feature values:', f.get_values().flatten())
        print('Filtered fature values:', ff.get_values().flatten())
        print('threhold =', ff._threshold, '  unique_label =', ff._unique_label)
        ff_ohe = ff.get_ohe_feature(omit_uniques=omit_uniques)
        print('FilOHE feature name:  ', ff_ohe)
        print('FilOHE feature values:\n', ff_ohe.get_values(sparse=False))
        print('FilOHE is constant:   ', ff_ohe.is_constant())
        print('\n\n')

if test_loo:
    print('\n\nLEAVE ONE OUT')
    X = np.array([0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0])
    y = np.array([0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0])
    cat_feature = CategoricalFeature(X, 'f')
    random_state = 345
    n_splits = 2
    cv = StratifiedKFold(n_splits, shuffle=True, random_state=random_state)
    print_columns(('ind', np.arange(len(X))), ('X', X), ('y', y))
    for n_split, (train_indices, test_indices) in enumerate(cv.split(X, y)):
        print('\n\nn_split =', n_split)
        X_tr, y_tr = X[train_indices], y[train_indices]
        X_ts, y_ts = X[test_indices],  y[test_indices]
        X_loo_true = np.array([1/4., 2/3., 0, 2/3., 0, 2/3., 2/3., 2/3., 2/3., 0, 0, 1/4.])
        print()
        print_columns(('ind', train_indices), ('X_tr', X_tr), ('y_tr', y_tr))
        print()
        print_columns(('ind', test_indices), ('X_ts', X_ts), ('y_ts', y_ts))
        
    loo_feature = cat_feature.get_loo_feature(y, cv, alpha=0, scale=0.0)
    X_loo_found = loo_feature.get_values(sparse=False).flatten()
    print('LOO:\n')
    print_columns(('True', X_loo_true), ('Found', X_loo_found))
    np.allclose(X_loo_found, X_loo_true)
    #print(cat_feature, cat_feature)
    #print(loo_feature, loo_feature.values

<a id='categorical_combiner'></a>
# 5. CategoricalCombiner<sup>[toc](#toc)</sup>

In [107]:
from itertools import product, chain, combinations

class CategoricalCombiner(Checker):
    METHOD = 4
    def __init__(self, verbose=0):
        super().__init__()
        self._verbose = verbose

    def get_all_combinations(self, features, degree, hash=hash):
        """
        Аргументы:
            :param degree
            :param features
            :param hash
        """
        feature_names = [feature.name for feature in features]
        method_msg = self._method_msg('get_all_combinations')
        methdo_msg = method_msg + '({}, degree={})'.format(feature_names, degree)
        self._printers[self.METHOD](method_msg)
        combined_features = {}
        for some_features in combinations(features, degree):
            new_feature = self.get_combined_feature(some_features, hash) 
            combined_features[new_feature.get_name()] = new_feature
        return combined_features

    def get_combined_feature(self, features, hash=hash):
        self.check_sizes_(features)
        if len(features) < 1:
            raise ValueError('At least one feature name must be given')
        if len(features) == 1:
            return features[0].deepcopy()
                             
        feature_values = []
        feature_names = []
        for feature in features:
            values = feature.get_values(False).flatten()
            feature_values.append(values)
            feature_names.append(feature.get_name())
            
        new_values = []
        for hyper_value in zip(*feature_values):
            new_values.append(hash(hyper_value))
        new_values = LabelEncoder().fit_transform((new_values))
        new_name = '+'.join(feature_names)
        return CategoricalFeature(new_values, new_name)

    def check_sizes_(self, features):
        if len(Counter([len(feature) for feature in features])) != 1:
            raise ValueError('Features must have equal sizes!')
            
test = True
if test:
    features = {'f1': [0, 1, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0],
                'f2': [0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0],
                'f3': [1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1]}
    cat_features = [CategoricalFeature(features[name], name) for name in sorted(features.keys())]
    cat_combiner = CategoricalCombiner()
    new_feature = cat_combiner.get_combined_feature(cat_features)
    print('comb_feature:', new_feature)
    print('name = {}, values = {}'.format(new_feature._name, new_feature._values))
    fil_feature = new_feature.get_filtered_feature(1)
    print('fil_feature: ', fil_feature)
    print('name = {}, values = {}'.format(fil_feature.name, fil_feature._values))
    for degree in range(1, 4):
        print('\ndegree = {}'.format(degree))
        new_features = cat_combiner.get_all_combinations(cat_features, degree=degree)
        print('new_features:', new_features)
        args = []
        for f_name, feature in new_features.items():
            args.append(('  ' + f_name, feature.get_values(False).flatten()))
        args = sorted(args, key=lambda x: x[0])
        print_columns(*args)

comb_feature: [CategoricalFeature: f1+f2+f3, (12,)]
name = f1+f2+f3, values = [0 2 6 1 4 7 7 1 3 5 6 0]
fil_feature:  [CategoricalFeature: Fil1_f1+f2+f3, (12,)]
name = Fil1_f1+f2+f3, values = [0 4 2 1 4 3 3 1 4 4 2 0]

degree = 1
new_features: {'f1': CategoricalFeature(f1; (12, 1)), 'f3': CategoricalFeature(f3; (12, 1)), 'f2': CategoricalFeature(f2; (12, 1))}
  f1: 0 1 2 0 1 2 2 0 1 2 2 0
  f2: 0 1 0 1 0 1 1 1 1 0 0 0
  f3: 1 0 1 1 1 1 1 1 1 0 1 1

degree = 2
new_features: {'f1+f3': CategoricalFeature(f1+f3; (12, 1)), 'f2+f3': CategoricalFeature(f2+f3; (12, 1)), 'f1+f2': CategoricalFeature(f1+f2; (12, 1))}
  f1+f2: 0 2 4 1 3 5 5 1 2 4 4 0
  f1+f3: 0 2 4 0 1 4 4 0 1 3 4 0
  f2+f3: 1 3 1 2 1 2 2 2 2 0 1 1

degree = 3
new_features: {'f1+f2+f3': CategoricalFeature(f1+f2+f3; (12, 1))}
  f1+f2+f3: 0 2 6 1 4 7 7 1 3 5 6 0


<a id='features_storage'></a>
# 6. FeaturesStorage<sup>[toc](#toc)</sup>

In [None]:
class CategoricalFeaturesManager(Checker):
    def __init__(self, verbose):
        super().__init__()
        self._features = {}
        self._n_samples = None
        self._verbose = verbose
        self._categorical_combiner = CategoricalCombiner(verbose)
        
    def __contains__(self, feature_name):
        return feature_name in self._features
 
    ###################################################################
    def is_present(self, name):
        return name in self._features
    def _check_if_present(self, *args):
        for name in args:
            if not self.is_present(name):
                raise ValueError(self._error_msg("unknown feature \"{}\"".format(name)))  
    def _is_binary(self, values):
        if len(Counter(values)) == 2:
            return True
        return False
    def _check_feature(self, feature):
        self._check_type(feature, str(feature), CategoricalFeature)
        if (self._n_samples is not None) & (len(feature) != self._n_samples):
            raise ValueError("Given feature vector has size {} while must have size {}.".format(
                        len(feature), self._n_samples))

    ################################################################### 
    def set_feature(self, feature, copy=True, replace=True):
        """
        Помещает признак в хранилище.
        Аргументы:
            :param feature - словарь из {имя_признака: признак}. (dict)
            :param copy    - если True, то в хранилище будет помещена копия признака. (bool)
            :parma replace - если True, то признак с таким же именем будет заменен;
                             если False, то наличине признака с таким же именем вызывает исключение. (bool)
        """
        
        self._check_feature(feature) # новый признак имеет правильный размер и категориальный тип
        if not replace:              # если замена признака запрещена ...
            if self.is_present(feature.get_name()): # и уже есть признак с таким именем, то ...
                error_msg = self._method_msg('set_feature') +\
                    'feature "{}" cannot be replaced. Check "replace" parameter'.format(feature.get_name())
                raise ValueError(error_msg)

        self._n_samples = len(feature)
        self.del_feature(feature, throw=False)
        if copy:
            self._features[feature.get_name()] = feature.deepcopy()
        else:
            self._features[feature.get_name()] = feature
            
    def del_feature(self, feature_name, throw=True):
        """
        Аргументы:
            :param feature_name
            :param throw
        """
        if not self.is_present(feature_name):
            if throw:
                error_msg = self._method_msg('del_feature') +\
                    'feature "{}" is not present in storage. Cannot be deleted.'.format(feature_name)
                raise KeyError(error_msg)
            return False
        else:
            del self._features[feature_name]
            if len(self._features) == 0:
                self._n_samples = None
            return True
    
    def get_feature(self, feature_name, copy=True):
        self._check_if_present(feature_name)
        if copy:
            return self._features[feature_name].deepcopy()
        return self._features[feature_name]
    
    def get_list_of_features(self):
        return sorted(list(self._features.keys()))


    ###################################################################
    #    Функции комбинирования категориальных признаков              #
    ###################################################################
    def add_all_combinations(self, feature_names, degree, hash=hash):
        self.get_all_combinations(feature_names, degree, hash=hash, store=True, copy=False)
        
    def get_all_combinations(self, feature_names, degree, hash=hash, store=True, copy=True):
        method_msg = self._method_msg('get_all_combinations')
        self._printers[self.METHOD](method_msg + '(names={}, degree={}, store={})'.format(feature_names, degree, store))
        self._check_if_present(feature_names)
        
        features = {name: self._features[name] for name in feature_names}
        combined_features = self._categorical_combiner.get_all_combinartions(features, degree=degree, hash=hash)
        if store:
            if degree > 1:
                for name, combined_feature in combined_features:
                    self.set_feature(combined_feature, copy=copy, replace=False)
        return combined_features

    def add_combined_feature(self, feature_names, hash=hash):
        self.get_combined_feature(feature_names, hash=hash, store=True, copy=False)
        
    def get_combined_feature(self, feature_names, hash=hash, store=True, copy=True):
        method_msg = self._method_msg('get_combined_feature')
        self._printers[self.METHOD](method_msg + '(names={}, store={})'.format(feature_names, store))
        self._check_if_present(feature_names)
        
        features = [self.features[name] for name in feature_names]
        combined_feature = CategorialCombiner().get_combined_feature(features, hash=hash)
        if store:
            if len(feature_names) > 1:
                self.set_feature(combined_feature, copy=copy, replace=False)
        return combined_feature


    ############################################################
    ##       Сборка итогового признакового представления      ##
    ############################################################
    def assemble_data_frame(self, feature_names):
        self._check_if_present(feature_names)
        feature_values = []
        for feature_name in feature_names:
            feature_values.append(self._features[feature_name].get_values())
        return pd.DataFrame(np.hstack(feature_values), columns=feature_names)
    
    def assemble(self, feature_names, sparse=False):
        """
        Аргументы:
            :param feature_names
            :param sparse
        """
        self._check_if_present(feature_names)
        X = []
        feature_map = copy.deepcopy(feature_map)
        for feature_name in feature_names:
            feature = self._features[feature_name]
            X.append(feature.get_values(sparse=sparse))
        if sparse:
            return scipy.sparse.hstack(X)
        return np.hstack(X)

    def add_filtered(self, name, threshold):
        self.get_filtered(name, threshold, store=True, copy=False)
    def get_filtered(self, name, threshold, store=True, copy=True):
        self._check_if_present(name)
        new_feature = self._features[name].get_filtered_feature(threshold)
        if store:
            self.set_feature(new_feature, copy=copy, replace=False)
        return new_feature
    
    def add_counter(self, name):
        self.get_filtered(name, store=True, copy=False)
    def get_counter(self, name, store=True, copy=True):
        self._check_if_present(name)
        new_feature = self._features[name].get_counter_feature(threshold)
        if store:
            self.set_feature(new_feature, copy=copy, replace=False)
        return new_feature
    def get_loo(self, name, y_train, cv, seed=1234):
        self._check_if_present(name)
        new_feature = self._features[name].get_loo_feature(y_train, cv=cv, seed=seed)
        return new_feature
        
    

test = False
if test:      
    FStest = FeaturesStorage(verbose=0)
    features = [CategorialFeature(np.array([0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4]), 'f1'),
                CategorialFeature(np.array([1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1]), 'f2'),
                CategorialFeature(np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]), 'f3'),
                NumericFeature(np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4]), 'f4')]
    for feature in features:
        FStest.set_feature(feature)
        assert feature.shape == (14, )
        assert len(feature) == 14

    #temp = features_storage.get_feature('f5')
    print('\nLists of features:')
    names = FStest.get_list_of_features()
    print('all:', names)
    print(FStest.assemble_data_frame(names))
    print('f1' in features_storage)
    print('f5' in features_storage)
    
    print('\nDeleting and setting features:')
    FStest.del_feature('f1')
    print('del f1:', FStest.get_list_of_features()) 
    FStest.del_feature('f4')
    print('del f4:', FStest.get_list_of_features())
    FStest.del_feature('f3')
    print('del f3:', FStest.get_list_of_features())
    FStest.del_feature('f2')
    print('del f2:', FStest.get_list_of_features())
    
    for feature in features:
        FStest.set_feature(feature)    
    
    num_feature = FStest.get_feature('f4')
    new_feature = num_feature.get_categorized_feature(np.linspace(0, 2, 3), right=False)
    FStest.set_feature(new_feature)
    print(new_feature, new_feature.values)
    
    print('\nCombining features:')
    new_feature = FStest.get_combined_feature(['f1'])
    print(new_feature, new_feature.values)
    new_feature = FStest.get_combined_feature(['f2'])
    print(new_feature, new_feature.values)
    new_feature = FStest.get_combined_feature(['f3'])
    print(new_feature, new_feature.values)
    new_feature = FStest.get_combined_feature(['f1', 'f2'])
    print(new_feature, new_feature.values)
    new_feature = FStest.get_combined_feature(['f1', 'f3'])
    print(new_feature, new_feature.values)
    new_feature = FStest.get_combined_feature(['f2', 'f3'])
    print(new_feature, new_feature.values)
    new_feature = FStest.get_combined_feature(['f1', 'f2', 'f3'])
    print(new_feature, new_feature.values)
    print('all:', FStest.get_list_of_features())
    
    print('\nFiltering values:')
    thr = 2
    FStest.add_filtered('f1', thr, True)
    FStest.add_filtered('f2', thr, False)
    FStest.add_filtered('f3', thr, False)
    print('all:', FStest.get_list_of_features())
    feature = FStest.get_feature('FA{}_f1'.format(thr))
    print(feature.name, feature.values)
    feature = FStest.get_feature('FN{}_f2'.format(thr))
    print(feature.name, feature.values)
    feature = FStest.get_feature('FN{}_f3'.format(thr))
    print(feature.name, feature.values)
    
    print('\nObtaining counters:')
    for name in ['f1', 'f2', 'f3']:
        FStest.add_counter(name)
        feature = FStest.get_feature('CTR_' + name)
        print(feature.name, feature.values)
    print('all:', FStest.get_list_of_features())
    print('cat:', FStest.get_list_of_features('CAT'))
    print('num:', FStest.get_list_of_features('NUM'))
    print('\nAssembling features')
    print(FStest.assemble(['f1', 'f2', 'f3', 'CTR_f3', 'f1+f2', 'f1+f3', 'f2+f3', 'f1+f2+f3', 'f4'], sparse=False))
    print(FStest.assemble(['FA2_f1', 'f2', 'f3', 'f1+f2', 'f1+f3', 'f2+f3'], 
                          {'FA2_f1': ['def', 'ohe'],
                           'f2': ['def', 'ohe'],
                           'f3': ['def', 'ohe']},
                          sparse=False))

In [None]:
    """def add_categorized(self, name, bins, right=True):
        self.check_if_present_(name)
        self.check_type_(name, 'NUM')
        new_feature = self.features[name].get_categorized_feature(bins, right)
        self.features[new_feature.name] = new_feature
        self.types[new_feature.name] = 'CAT'
        return new_feature.name"""
    