In [2]:
# Matrix Manipulation
import numpy as np
import pandas as pd

# Utility operations
from numpy import log as ln
import math
import random

# Visualization
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

# Saving objects
import pickle

# Optimization
from functools import partial
from itertools import chain

# Utility Functions

In [3]:
class HDCModels():
    """Pickle-based persistence helpers for model objects."""

    @classmethod
    def save_model(cls, model, filename):
        """Write `model` to `filename` using the highest available pickle protocol."""
        with open(filename, 'wb') as sink:
            pickle.dump(model, sink, protocol=pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load_model(cls, filename):
        """Return the object unpickled from `filename`."""
        # NOTE: unpickling can execute arbitrary code; only load files you trust.
        with open(filename, 'rb') as source:
            restored = pickle.load(source)
        return restored

class ItemMemories():
    """Pickle-based persistence helpers for item-memory objects."""

    @classmethod
    def save_IM(cls, im, filename):
        """Write `im` to `filename` using the highest available pickle protocol."""
        with open(filename, 'wb') as sink:
            pickle.dump(im, sink, protocol=pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load_IM(cls, filename):
        """Return the object unpickled from `filename`."""
        # NOTE: unpickling can execute arbitrary code; only load files you trust.
        with open(filename, 'rb') as source:
            restored = pickle.load(source)
        return restored

class SparseHDC():
    """Helper operations on sparse binary hypervectors (arrays of 0/1 values)."""

    @classmethod
    def cyclic_shift(cls, arr, shift_count=1):
        """Return `arr` rotated by `shift_count` positions along its first axis.

        Positive counts move elements towards higher indices and wrap the tail
        to the front. Counts larger than the length wrap around (modulo).
        """
        return np.roll(arr, shift_count, axis=0)

    @classmethod
    def dot(cls, hv1, hv2):
        """Return the number of positions where both `hv1` and `hv2` are non-zero."""
        return np.sum(np.logical_and(hv1, hv2))

    @classmethod
    def disp(cls, hv):
        """Return `hv` reshaped into a square 2-D array.

        If ``len(hv)`` is not a perfect square, the string ``"Must be square"``
        is returned instead (kept for backward compatibility with callers).
        """
        length = len(hv)
        side = int(round(math.sqrt(length)))
        # Integer check avoids relying on the fractional part of a float sqrt.
        if side * side != length:
            return "Must be square"

        return np.array(hv).reshape(side, side)

    @classmethod
    def generate_random_sparse_HV(cls, dim = 10000, sparsity=0.3):
        """Generate a random 0/1 vector whose bits are 1 with probability `sparsity`.

        Each bit is drawn independently, so the number of ones only matches
        ``sparsity * dim`` on average. Drawing uniform floats in [0, 1) keeps
        the probability exact: ``sparsity=1.0`` yields all ones and
        ``sparsity=0.0`` all zeros, with no rounding of the sparsity to whole
        percent.
        """
        return (np.random.random(dim) < sparsity).astype(int)

    @classmethod
    def generate_random_sparse_HVs(cls, count=10, dim = 10000, sparsity=0.3):
        """Return a list of `count` vectors from `generate_random_sparse_HV`."""
        return [cls.generate_random_sparse_HV(dim, sparsity) for _ in range(count)]

    @classmethod
    def generate_sparse_HV(cls, dim=10000, sparsity=0.3):
        """Generate a 0/1 vector of length `dim` with exactly ``int(sparsity*dim)`` ones.

        Active positions are sampled without replacement from every index
        ``0 .. dim-1`` (index 0 included), using the `random` module.
        """
        hv = np.zeros(dim, dtype=int)
        active_positions = random.sample(range(dim), int(sparsity * dim))
        hv[active_positions] = 1
        return hv

    @classmethod
    def generate_sparse_HVs(cls, count=10, dim=10000, sparsity=0.3):
        """Return a list of `count` vectors from `generate_sparse_HV`."""
        return [cls.generate_sparse_HV(dim, sparsity) for _ in range(count)]

    # PRIVATE METHODS

    @classmethod
    def _generation_threshold(cls, num, percent_sparsity = 30):
        """Return 1 if ``num < percent_sparsity`` else 0.

        No longer used by the generators in this class; kept so existing
        callers keep working.
        """
        return 1 if num < percent_sparsity else 0

# Datasets

In [4]:
class ISOLET():
    """Train/test tables loaded from two header-less CSV files.

    For each file, columns 0-616 are exposed as the feature frame (`*_X`)
    and column 617 as the label series (`*_y`).
    """

    def __init__ (self, train_filepath = 'isolet1+2+3+4.csv', test_filepath = 'isolet5.csv'):
        feature_columns = list(range(617))
        label_column = 617

        self.train = pd.read_csv(train_filepath, header=None)
        self.train_X = self.train[feature_columns]
        self.train_y = self.train[label_column]

        self.test = pd.read_csv(test_filepath, header=None)
        self.test_X = self.test[feature_columns]
        self.test_y = self.test[label_column]
        
class ItemMemory():
    """Simple container that stores `cim` and `base_hvs` unchanged as attributes."""

    def __init__(self, cim, base_hvs):
        self.cim, self.base_hvs = cim, base_hvs

# Continuous Item Memory Generators

In [5]:
class LinearCIM():
    """Generates one hypervector per key by gradually mutating a seed vector.

    Starting from the seed, each successive key's vector turns off `bit_step`
    bits that were on in the seed and turns on `bit_step` bits that were off
    in the seed. A bit is never touched twice, so vectors for distant keys
    share fewer and fewer active bits with the seed.
    """

    def __init__(self, sparsity=0.3, dim=10000, seed=None):
        """Store the generation settings.

        sparsity, dim: passed to `SparseHDC.generate_sparse_HV` when no seed is given.
        seed: optional 0/1 vector to start from instead of a fresh random one.
        """
        self.sparsity = sparsity
        self.dim = dim
        self.seed = seed

    def modify_specs(self, sparsity=None, dim=None):
        """Update `sparsity` and/or `dim`; arguments left as None are kept as-is."""
        # Explicit None checks so that a legitimate falsy value (e.g. 0) is not ignored.
        if sparsity is not None:
            self.sparsity = sparsity
        if dim is not None:
            self.dim = dim

    def generate(self, keys):
        """Return a dict mapping each key (in order) to its hypervector.

        The first key maps to the seed itself. With zero or one key no
        mutation step exists, so the result is `{}` or `{key: seed}`.
        Uses the `random` module to pick which bits flip.
        """
        keys = list(keys)

        if self.seed is None:
            seed = SparseHDC.generate_sparse_HV(sparsity=self.sparsity, dim=self.dim)
        else:
            seed = self.seed

        # A single key needs no interpolation; this also avoids dividing by len(keys)-1 == 0.
        if len(keys) <= 1:
            return dict(zip(keys, [seed]))

        # tracker: 1 = seed bit still on, 0 = seed bit still off, -1 = already flipped (frozen)
        tracker = np.array(seed, dtype=int)
        bit_step = int(np.sum(seed)) // (len(keys) - 1)
        hvs = [seed]

        for _ in range(1, len(keys)):
            next_hv = np.copy(hvs[-1])

            # Turn off `bit_step` of the untouched seed-on bits
            turnoff_index = random.sample(np.flatnonzero(tracker == 1).tolist(), bit_step)
            tracker[turnoff_index] = -1
            next_hv[turnoff_index] = 0

            # Turn on `bit_step` of the untouched seed-off bits
            turnon_index = random.sample(np.flatnonzero(tracker == 0).tolist(), bit_step)
            tracker[turnon_index] = -1
            next_hv[turnon_index] = 1

            hvs.append(next_hv)

        return dict(zip(keys, hvs))

# Sparsifiers

In [6]:
# Sparsifying Method

class ThresholdingSparsifier():
    """Binarizes a vector by comparing every element against a fixed threshold.

    The threshold is ``int(percent_max_val * max_val)``.
    """

    def __init__(self, percent_max_val=0.3, max_val=617):
        """Store the fraction (`percent_max_val`) and the reference maximum (`max_val`)."""
        self.percent_max_val = percent_max_val
        self.max_val = max_val

    def sparsify(self, hv):
        """Return an integer 0/1 array: 1 where ``hv > threshold()``, else 0."""
        # `np.int` was removed from NumPy; the builtin `int` is the same dtype.
        return (np.asarray(hv) > self.threshold()).astype(int)

    def threshold(self):
        """Return the cut-off value, truncated to an integer."""
        return int(self.percent_max_val*self.max_val)

In [7]:
# END-TO-END

class HDC_Classifier():
    """End-to-end classifier: encode rows, bundle them per class, query by overlap.

    `encoder` must provide ``encode(row)``; the returned vectors are assumed to
    be equal-length numeric arrays (they are summed element-wise per class).
    `training_data` must expose ``train_X``, ``train_y``, ``test_X`` and
    ``test_y`` (as the `ISOLET` class does).
    """

    def __init__(self, encoder, ACC_THR = 125, training_data=None):
        """Store the encoder, accumulation threshold and dataset.

        When `training_data` is None an `ISOLET()` instance is created here.
        It is built lazily rather than as a default argument so the CSV files
        are only read when a classifier is actually constructed without data,
        not when this class is defined.
        """
        self.encoder = encoder
        self.data = ISOLET() if training_data is None else training_data
        self.class_hvs = {}
        self.training_encoded = {}
        self.test_encoded = None
        self.ACC_THR = ACC_THR

    def train(self, save_encodes=True):
        """Build one binary class hypervector per label found in the training labels.

        For each class the encoded rows are summed element-wise and every
        position whose count exceeds ``int(ACC_THR)`` becomes 1. The result is
        stored in `self.class_hvs`, keyed by the actual class label. When
        `save_encodes` is true the per-class encoded rows are kept in
        `self.training_encoded`. Returns the string "Done".
        """
        features = self.train_X()
        labels = self.train_y()
        classes = labels.unique()
        # Compare against ACC_THR directly instead of round-tripping it through
        # a fraction of a fixed maximum, which could lose a unit to float rounding.
        threshold = int(self.ACC_THR)

        encoded = {}
        class_hvs = {}
        for position, class_ in enumerate(classes, start=1):
            # Progress is based on how many classes were handled, so it works for any label values.
            print("Encoding... {}% ".format(round(100*position/len(classes),2)))
            rows = features.loc[labels == class_].to_numpy()
            encoded_rows = [self.encoder.encode(row) for row in rows]
            encoded[class_] = pd.Series(encoded_rows)

            accumulated = np.sum(encoded_rows, axis=0)
            # Key by the real label: unique() returns labels in order of first
            # appearance, which need not be 1..26.
            class_hvs[class_] = (np.asarray(accumulated) > threshold).astype(int)

        if save_encodes:
            self.training_encoded = encoded

        self.class_hvs = class_hvs

        return "Done"

    def test(self):
        """Return the fraction of test rows whose predicted label equals the true label."""
        encoded_test = pd.Series([self.encoder.encode(row) for row in np.array(self.test_X())])
        predictions = pd.Series([self.query(hv) for hv in encoded_test])
        # Compare by position so a non-default index on test_y cannot break the comparison.
        truth = np.asarray(self.test_y())
        return np.sum(predictions.to_numpy() == truth)/len(truth)

    # HELPER FUNCTIONS
    def query(self, query_hv):
        """Return the class label whose class hypervector overlaps `query_hv` the most."""
        scores = {
            class_: SparseHDC.dot(class_hv, query_hv)
            for class_, class_hv in self.class_hvs.items()
        }
        return max(scores, key=scores.get)

    def train_X(self):
        """Training features of the attached dataset."""
        return self.data.train_X

    def train_y(self):
        """Training labels of the attached dataset."""
        return self.data.train_y

    def test_X(self):
        """Test features of the attached dataset."""
        return self.data.test_X

    def test_y(self):
        """Test labels of the attached dataset."""
        return self.data.test_y

# MODEL DEVELOPMENT

### CONVERT NEXT THREE CELLS TO CODE AND RUN AS NEEDED

In [8]:
# RUN THIS ONCE

isolet = ISOLET()

# Class labels 1-26 correspond to the letters A-Z
classes = range(1,27)

# class_indexes[class_no] -> index labels of the training rows in that class
class_indexes = {
    label: list(isolet.train_y[isolet.train_y==label].index)
    for label in classes
}

# class_rows[class_no][sample_no] -> feature row of one training sample
class_rows = {
    label: np.array(list(isolet.train_X.loc[class_indexes[label]].itertuples(index=False, name=None)))
    for label in classes
}

# First 10 rows of each class, used as a small working set in the cells below
test_class_rows = {label: samples[0:10] for label, samples in class_rows.items()}

# TESTING

In [9]:
class HoloGNEncoder():
    """Maps scalar values onto one of `M` evenly spaced quantization levels."""

    def __init__(self, M):
        """Build `M` quantization levels and a random permutation of ``0..M-1``."""
        self.M = M
        self.qlevels = self.quantization_levels(M)
        self.shifts = random.sample(range(0,M),M) #np.arange(0,M)

    def quantization_levels(self, M, min_val=-1, max_val=1, precision=5):
        """Return `M` levels evenly spaced over [min_val, max_val], rounded to `precision` decimals.

        `np.linspace` is used so the number of levels is exact and ``M == 1``
        works (a single level at `min_val`) instead of dividing by zero.
        """
        return list(np.linspace(min_val, max_val, M).round(precision))

    def get_shift(self, value, index=False):
        """Return the quantization level closest to `value`.

        With ``index=True`` the position of that level in `self.qlevels` is
        returned instead of the level itself. Ties resolve to the lower index.
        """
        levels = self.qlevels
        nearest = min(range(len(levels)), key=lambda pos: abs(levels[pos] - value))
        return nearest if index else levels[nearest]

In [274]:
# Encoder with 10 quantization levels
hologn = HoloGNEncoder(10)

# Per-sample threshold: int(0.4 * 100) = 40
enc_thresh = ThresholdingSparsifier(percent_max_val=0.4, max_val=100)
# Class-accumulation threshold: int(0.02 * 100) = 2
acc_thresh = ThresholdingSparsifier(percent_max_val=0.02, max_val=100)

In [275]:
# 617 base hypervectors; dim is left at its default of 10000, so sparsity 0.05 gives 500 active bits each
base_hvs = SparseHDC.generate_sparse_HVs(count=617, sparsity=0.05)
# Pairwise dot product: roughly 25 of the 500 active bits are expected to overlap by chance (500 * 0.05) — TODO confirm

In [276]:
def accumulate_shifted_hvs(rows, encoder, hvs):
    """Return one accumulated (element-wise summed) vector per row.

    For every row, feature i selects a quantization-level index through
    `encoder.get_shift(value, True)`; `hvs[i]` is cyclically shifted by that
    index and all shifted vectors of the row are summed.
    """
    level_index = np.vectorize(encoder.get_shift)
    accumulated = []
    for row in rows:
        shifts = level_index(row, True)
        shifted_hvs = [SparseHDC.cyclic_shift(hv, shift) for hv, shift in zip(hvs, shifts)]
        accumulated.append(np.sum(shifted_hvs, axis=0))
    return accumulated


# Class 1 samples
A_acc = accumulate_shifted_hvs(test_class_rows[1], hologn, base_hvs)
A = pd.Series(map(enc_thresh.sparsify, A_acc))
A_c = acc_thresh.sparsify(np.sum(A))

# Class 2 samples — sparsified with the same enc_thresh as class 1
# (the original referenced `thresh`, which is not defined anywhere in this notebook)
B_acc = accumulate_shifted_hvs(test_class_rows[2], hologn, base_hvs)
B = pd.Series(map(enc_thresh.sparsify, B_acc))
B_c = acc_thresh.sparsify(np.sum(B))

In [277]:
# Sum of the entries of A_c (number of active bits, since A_c is a 0/1 vector)
np.sum(A_c)

411

In [278]:
# Sum of the entries of B_c (number of active bits, since B_c is a 0/1 vector)
np.sum(B_c)

389

In [279]:
# Overlap of each sample vector in A with the other class's accumulated vector B_c
for sample_hv in A:
    print(SparseHDC.dot(B_c, sample_hv))

87
63
66
79
66
71
71
70
74
68


In [280]:
# Overlap of each sample vector in A with its own class's accumulated vector A_c
for sample_hv in A:
    print(SparseHDC.dot(A_c, sample_hv))

118
142
146
142
158
164
146
154
175
194


In [281]:
# Intraclass statistics for A: overlap of every unordered pair of sample vectors
sample_count = len(A)
dots = [
    SparseHDC.dot(A[first], A[second])
    for first in range(sample_count)
    for second in range(first + 1, sample_count)
]
ones_per_hv = [np.sum(hv) for hv in A]

print("Mean no of ones: {}".format(np.average(ones_per_hv)))
print("Std no of ones: {}".format(np.std(ones_per_hv)))
print("Mean dots: {}".format(np.average(dots)))
print("Std dots: {}".format(np.std(dots)))

Mean no of ones: 427.3
Std no of ones: 15.053571004914415
Mean dots: 65.46666666666667
Std dots: 15.041572021862905


In [282]:
# Intraclass statistics for B: overlap of every unordered pair of sample vectors
sample_count = len(B)
dots = [
    SparseHDC.dot(B[first], B[second])
    for first in range(sample_count)
    for second in range(first + 1, sample_count)
]
ones_per_hv = [np.sum(hv) for hv in B]

print("Mean no of ones: {}".format(np.average(ones_per_hv)))
print("Std no of ones: {}".format(np.std(ones_per_hv)))
print("Mean dots: {}".format(np.average(dots)))
print("Std dots: {}".format(np.std(dots)))

Mean no of ones: 426.2
Std no of ones: 20.721003836687064
Mean dots: 60.977777777777774
Std dots: 13.512938472916964


In [283]:
# Interclass statistics: overlap of every sample vector in A with every sample vector in B
dots = [SparseHDC.dot(hv_a, hv_b) for hv_a in A for hv_b in B]

print("Mean dots: {}".format(np.average(dots)))
print("Std dots: {}".format(np.std(dots)))

Mean dots: 52.97
Std dots: 8.644599470189466


In [None]:
# Rejection-sample base hypervectors: a candidate is kept only while its summed
# overlap with all vectors accepted so far stays below OVERLAP_BUDGET.
TARGET_COUNT = 100
OVERLAP_BUDGET = 4000
MAX_ATTEMPTS = 10000

base_hvs = []
attempts = 0

while len(base_hvs)!=TARGET_COUNT:
    # With dim=5000 and 500 active bits, two random vectors overlap in roughly
    # 50 bits, so the summed overlap reaches the budget at around 80 accepted
    # vectors and further candidates are almost never accepted. Fail loudly
    # instead of spinning forever.
    if attempts >= MAX_ATTEMPTS:
        raise RuntimeError(
            "Only {} of {} hypervectors accepted after {} attempts; "
            "relax the overlap budget or lower the target count".format(
                len(base_hvs), TARGET_COUNT, MAX_ATTEMPTS))
    attempts += 1

    hv = SparseHDC.generate_sparse_HV(dim=5000, sparsity=0.10)
    if np.sum([SparseHDC.dot(hv,x) for x in base_hvs])<OVERLAP_BUDGET:
        base_hvs.append(hv)

In [None]:
# Overlap of every unordered pair of base hypervectors
dots = [
    SparseHDC.dot(base_hvs[i], base_hvs[j])
    for i in range(len(base_hvs))
    for j in range(i + 1, len(base_hvs))
]

print("Mean {}".format(np.average(dots)))
# Pass the values by keyword: a positional first argument is deprecated in
# seaborn and is interpreted differently depending on the version.
sns.boxplot(x=dots)

In [None]:
# NOTE(review): unfinished cell — SparseHDC as defined in this notebook has no `random`
# attribute, so this raises AttributeError when run, and it would also overwrite `A`.
# Complete the intended call or delete the cell.
A = SparseHDC.random