In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
from numpy import log as ln
import math

# Utility Functions

In [2]:
class SparseVectors():
    """Statistics on bit positions that are identical across a set of sparse HVs.

    A position counts as "common" when every hypervector in the set holds a 0
    there, or every hypervector holds a 1 there.  All methods are static and
    take an optional ``hv_generator`` (a zero-argument callable returning one
    binary hypervector); when omitted, ``SparseHDC.generate_random_sparse_HV``
    is used with its default arguments.
    """

    @staticmethod
    def common_bits(no_of_class, hv_generator=None):
        """Count positions where ``no_of_class`` generated HVs all agree.

        Parameters:
            no_of_class: number of hypervectors to generate (must be >= 1).
            hv_generator: optional zero-argument callable returning one HV.

        Returns:
            int: positions that are 0 in every HV plus positions that are 1
            in every HV.

        Raises:
            ValueError: if ``no_of_class`` is smaller than 1.
        """
        if no_of_class < 1:
            raise ValueError("no_of_class must be at least 1")
        if hv_generator is None:
            # Resolved at call time, so SparseHDC may be defined after this class.
            hv_generator = SparseHDC.generate_random_sparse_HV

        sparse_HVs = np.array([hv_generator() for _ in range(no_of_class)])
        column_sums = sparse_HVs.sum(axis=0)

        # A column sum of 0 means all zeros; a sum equal to the number of
        # vectors means all ones (previously hard-coded to 26).
        all_zero = np.count_nonzero(column_sums == 0)
        all_one = np.count_nonzero(column_sums == no_of_class)
        return int(all_zero + all_one)

    @staticmethod
    def average_common_bits(no_of_class, iterations, hv_generator=None):
        """Mean of ``common_bits(no_of_class)`` over ``iterations`` trials.

        Raises:
            ValueError: if ``iterations`` is smaller than 1.
        """
        if iterations < 1:
            raise ValueError("iterations must be at least 1")
        total = sum(
            SparseVectors.common_bits(no_of_class, hv_generator)
            for _ in range(iterations)
        )
        return total / iterations

    @staticmethod
    def redundant_bits_histogram(no_of_classes, sample_size=100, hv_generator=None):
        """Draw a seaborn box plot of ``sample_size`` ``common_bits`` samples.

        Despite the name, the plot produced is a box plot.  The object
        returned by ``sns.boxplot`` is passed back to the caller.
        """
        common_bits_dist = [
            SparseVectors.common_bits(no_of_classes, hv_generator)
            for _ in range(sample_size)
        ]
        return sns.boxplot(common_bits_dist)
        
class SparseHDC():
    """Class-level helpers for creating and comparing sparse binary hypervectors."""

    # Cyclic shifts the input hypervector arr by shift_count
    @classmethod
    def cyclic_shift(cls, arr, shift_count=1):
        """Return ``arr`` rolled by ``shift_count`` positions (``np.roll``)."""
        return np.roll(arr, shift_count)

    @classmethod
    def hamming_distance(cls, hv1, hv2):
        """Return the number of positions at which ``hv1`` and ``hv2`` differ.

        The previous version lacked the ``cls`` parameter (so it could not be
        called with two vectors) and summed ``hv1 & hv2``, which is the count
        of shared 1-bits; that quantity is available as ``overlap``.
        """
        return int(np.count_nonzero(np.asarray(hv1) != np.asarray(hv2)))

    @classmethod
    def overlap(cls, hv1, hv2):
        """Return the number of positions where both binary HVs hold a 1."""
        return int(np.sum(np.asarray(hv1) & np.asarray(hv2)))

    # Generate a random sparse HV with dimension and sparsity
    @classmethod
    def generate_random_sparse_HV(cls, dimension = 10000, sparsity=0.3):
        """Return a random 0/1 integer array of length ``dimension``.

        Each element is 1 with probability ``round(100 * sparsity) / 100``,
        i.e. the density is resolved to whole percents.
        """
        # round() instead of int(): 100*0.29 is 28.999... and would truncate to 28.
        percent_sparsity = int(round(100 * sparsity))
        draws = np.random.randint(100, size=dimension)
        # Whole-array comparison; also works for dimension == 0, where
        # np.vectorize would raise.
        return (draws < percent_sparsity).astype(int)

    # Generate count number of sparse HVs with dimension and sparsity
    @classmethod
    def generate_random_sparse_HVs(cls, count=10, dimension = 10000, sparsity=0.3):
        """Return a list of ``count`` independently generated sparse HVs."""
        return [cls.generate_random_sparse_HV(dimension, sparsity) for _ in range(count)]

    # PRIVATE METHODS

    # Returns 1 if num < percent_sparsity, else 0 (scalar form of the
    # comparison used by generate_random_sparse_HV; kept for compatibility)
    @classmethod
    def _generation_threshold(cls, num, percent_sparsity = 30):
        return 1 if num<percent_sparsity else 0

# Datasets

In [3]:
class ISOLET():
    """Loads header-less ISOLET train/test CSV files and splits features from labels.

    Attributes:
        train, test: the raw DataFrames as read from disk.
        train_X, test_X: the first ``feature_count`` columns of each frame.
        train_y: column ``feature_count`` of the training frame.
        test_y: column ``feature_count`` of the test frame, or None when the
            test file has no such column.
    """
    def __init__ (self, train_filepath = 'isolet1+2+3+4.csv', test_filepath = 'isolet5.csv', feature_count = 617):
        self.feature_count = feature_count
        # With header=None the column labels are the integers 0..n-1.
        feature_columns = list(range(feature_count))

        self.train = pd.read_csv(train_filepath, header=None)
        self.train_X = self.train[feature_columns]
        self.train_y = self.train[feature_count]

        # Split the test frame the same way as the training frame; previously
        # test_X was the whole frame (label column included) and no test_y existed.
        self.test = pd.read_csv(test_filepath, header=None)
        self.test_X = self.test[feature_columns]
        self.test_y = self.test[feature_count] if feature_count in self.test.columns else None

# Continuous Item Memory Generators

In [4]:
class LinearCIM():
    """Continuous item memory built from cyclic shifts of a single seed vector.

    The seed holds int(sparsity * dimensions) ones followed by zeros; the key
    at index i is mapped to that seed rolled by i positions.
    """

    def __init__(self, sparsity=0.3, dimensions=10000):
        self.dimensions = dimensions
        self.sparsity = sparsity

    def generate(self, keys):
        """Return a dict mapping each entry of ``keys`` to its level vector."""
        ones_count = int(self.sparsity * self.dimensions)
        zeros_count = self.dimensions - ones_count
        seed = np.concatenate((
            np.ones(ones_count, dtype=int),
            np.zeros(zeros_count, dtype=int),
        ))

        # Key at index `position` gets the seed shifted by `position`.
        return {
            keys[position]: np.roll(seed, position)
            for position in range(len(keys))
        }

# Binders

In [5]:
# Binding Methods

class AdditiveCDTBinder(): # (Rachkovskij & Kussul, 2001)
    """Binds component hypervectors by OR-ing them and thinning the result.

    ``bind`` forms the disjunction z of all components, builds a mask from
    the OR of z cyclically shifted by 1..K positions, and returns z AND mask.
    K is derived from ``sparsity`` and ``component_count`` in the constructor.
    """
    def __init__(self, sparsity=0.3, component_count=2):
        # Guard the logarithms below: component_count == 1 gives log(0) in the
        # numerator, sparsity * component_count >= 1 gives log(<= 0) in the
        # denominator; both previously surfaced as obscure errors or K == 0.
        if component_count < 2:
            raise ValueError("component_count must be at least 2")
        if not 0 < sparsity * component_count < 1:
            raise ValueError("sparsity * component_count must lie strictly between 0 and 1")

        self.sparsity = sparsity
        self.component_count = component_count
        # Number of shifted copies of z that are OR-ed into the thinning mask.
        self.K = math.ceil(ln(1-(1/component_count))/ln(1-(sparsity*component_count)))

    def bind(self, components):
        """Return the bound hypervector as a 0/1 integer array.

        Parameters:
            components: sequence of exactly ``component_count`` equal-length
                binary vectors.

        Raises:
            ValueError: if the number of components is not ``component_count``.
        """
        if len(components)!=self.component_count:
            # Previously concatenated str + int (TypeError) and reported K.
            raise ValueError("Number of components must be " + str(self.component_count))

        # Disjunction of all components
        z = np.logical_or.reduce(components)

        # PERMUTE OR operation: OR together z shifted by 1..K positions
        z_tilde = np.zeros(len(components[0]), dtype=bool)
        for shift in range(1, self.K + 1):
            z_tilde = np.logical_or(z_tilde, np.roll(z, shift))

        # Keep only the bits of z that are also set in the shifted mask.
        return np.logical_and(z, z_tilde).astype(int)

# Sparsifiers

In [6]:
# Sparsifying Method

class ThresholdingSparsifier():
    """Turns an accumulated (integer-count) hypervector into a 0/1 vector.

    The cut-off is ``int(sparsity * max_val)``; elements strictly greater than
    it become 1, all others 0.
    """
    def __init__(self, sparsity=0.3, max_val=617):
        self.threshold = int(sparsity*max_val)

    def sparsify(self, hv):
        """Return a 0/1 integer array marking the elements of ``hv`` above the threshold.

        Uses one whole-array comparison instead of ``np.vectorize``, which
        looped in Python per element and raised on empty input.
        """
        return (np.asarray(hv) > self.threshold).astype(int)

    def _threshold(self, num):
        # Scalar form of the comparison; kept for backward compatibility.
        return 1 if num>self.threshold else 0

# Encoding Algorithms

## 1. Sparse Feature Encoder
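   Based on feature encoding with the operation $$X = [B_1 * L_1 + B_2 * L_2 + \dots + B_N * L_N]$$ where $B_i$ is the base hypervector of feature $i$, $L_i$ is the level hypervector of its quantized value, $*$ denotes binding and $+$ denotes accumulation.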

   ### Constructor Parameters: <br />
   <ul>
       <li><b>cim_generator</b> : Algorithm that generates the continuous item memory (CIM) level vectors <br /></li>
       <li><b>binder</b> : Algorithm for binding two vectors <br /></li>
       <li><b>sparsifier</b> : Algorithm to convert accumulation hypervector back to sparse vector <br /></li>
   </ul>
   <br />
   Default parameters are set for the ISOLET dataset <br />
   <br />
   #TODO: injected sparsity, implemented across all the injected algorithms

In [7]:
# ENCODING ALGORITHMS

class Sparse_FeatureEncoder():
    """Encodes a feature vector into one sparse hypervector.

    Each feature value is quantized to the nearest level, the level's CIM
    vector is bound with that feature's base hypervector, the bound vectors
    are summed element-wise, and the sum is passed to the sparsifier.

    Parameters:
        cim_generator: object with ``generate(levels)`` returning a mapping
            from each level to its hypervector.
        binder: object with ``bind([a, b])`` returning a bound hypervector.
        sparsifier: object with ``sparsify(hv)``.
        feature_count: number of features expected by ``encode``.
        qlevel_count: number of quantization intervals; there are
            ``qlevel_count + 1`` level values.
        dimensions: length of the hypervectors created by this encoder.
        sparsity: density of the random base hypervectors.

    NOTE(review): ``cim_generator`` and ``binder`` carry their own
    dimension/sparsity settings; they are not synchronised with the values
    given here, so callers must keep them consistent.
    """
    def __init__(self, cim_generator, binder, sparsifier, feature_count=617, qlevel_count=10, dimensions=10000, sparsity=0.3):
        # Keep the generator separate from the generated memory so that
        # setup_CIM can be called more than once.
        self.cim_generator = cim_generator
        self.cim = None
        self.binder = binder
        self.sparsifier = sparsifier
        self.feature_count = feature_count
        self.qlevel_count = qlevel_count
        self.dimensions = dimensions
        # Base HVs must have the encoder's dimension (previously always 10000).
        self.base_hvs = SparseHDC.generate_random_sparse_HVs(count=feature_count, dimension=dimensions, sparsity=sparsity)

        #Setup functions
        self.qlevels = self.quantization_levels()
        self.setup_CIM()

    def encode(self, features):
        """Return the sparsified hypervector for one sample.

        Parameters:
            features: sequence of ``feature_count`` numeric values.

        Raises:
            ValueError: if ``len(features)`` differs from ``feature_count``
                (previously an error string was returned instead).
        """
        if len(features)!=self.feature_count:
            raise ValueError("Invalid number of features")

        #Quantize: index of the nearest level for every feature at once
        # (argmin keeps the first level on ties, like min() in quantize).
        values = np.asarray(features, dtype=float)
        nearest = np.abs(values[:, np.newaxis] - self.qlevels[np.newaxis, :]).argmin(axis=1)

        # Bind and Accumulate (Summation of Base*Level)
        accumulated_hv = np.zeros(self.dimensions, dtype=int)
        for base_hv, level_idx in zip(self.base_hvs, nearest):
            #Map to CIM
            level_hv = self.cim[self.qlevels[level_idx]]
            accumulated_hv += self.bind(base_hv, level_hv)

        return self.sparsify(accumulated_hv)

    # ENCAPSULATED DEPENDENCY METHODS
    def bind(self, feature, value):
        """Bind a base hypervector with a level hypervector via the injected binder."""
        return self.binder.bind([feature, value])

    def setup_CIM(self):
        """(Re)build ``self.cim`` from the injected generator and current levels."""
        self.cim = self.cim_generator.generate(self.qlevels)

    def sparsify(self, hv):
        """Delegate to the injected sparsifier."""
        return self.sparsifier.sparsify(hv)

    # ENCODING HELPERS
    def quantization_levels(self, min_val=-1, max_val=1, precision=5):
        """Return ``qlevel_count + 1`` evenly spaced levels in [min_val, max_val].

        ``np.linspace`` fixes the number of points and includes the endpoint,
        unlike a float-step ``np.arange`` whose length depends on rounding.
        """
        return np.linspace(min_val, max_val, self.qlevel_count + 1).round(precision)

    def quantize(self, value):
        """Return the level closest to ``value`` (first one on ties)."""
        return min(self.qlevels, key=lambda x:abs(x-value))

In [8]:
# END-TO-END

class HDC_Classifier():
    """Skeleton of the end-to-end classifier; training and querying are not implemented yet."""
    def __init__(self, class_count=26):
        # `self` was missing from every method, so the class could not be instantiated.
        self.class_count = class_count

    # TODO
    def train(self):
        """Placeholder for training; raises until implemented."""
        raise NotImplementedError("HDC_Classifier.train is not implemented yet")

    # TODO
    def query(self, hv):
        """Placeholder for classifying hypervector ``hv``; raises until implemented."""
        raise NotImplementedError("HDC_Classifier.query is not implemented yet")

# Sample Usage

Import the isolet dataset and get the first row of the training features

In [17]:
# Read the train/test CSV files from their default paths.
isolet = ISOLET()

# First training sample: row 0 of the feature columns.
isolet_first_row = isolet.train_X.iloc[0]

Create a feature encoder that uses a linear continuous item memory, Additive CDT binding, and thresholding-based sparsification. <br/>
(All defaults are set to: dimension = 10000, sparsity = 0.3, class_count = 26)

In [18]:
# Assemble the encoder from its three injected strategies, each built with
# its own default arguments.
feature_encoder = Sparse_FeatureEncoder(
    cim_generator=LinearCIM(),
    binder=AdditiveCDTBinder(),
    sparsifier=ThresholdingSparsifier(),
)

Encode the first row of the isolet dataset

In [19]:
# Encode the sample; as the cell's last expression, the result is displayed as its output.
feature_encoder.encode(isolet_first_row)

array([0, 0, 0, ..., 0, 0, 0])