In [1]:
import sys
import logging
import h5py
import cupy as cp
import numpy as np
import math
import random
import datetime
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

class dataprep:
    """Prepare descriptor data for MOC/ECOC multi-class training.

    Loads a descriptor matrix from an HDF5 file (column 0 holds the artist id,
    the remaining columns hold the features), groups the rows by artist,
    splits them into train / k-fold / test partitions and assigns one binary
    code word per artist so that each bit can be learned by a binary classifier.
    """

    def __init__(self, hdf5_path, max_artists=-1, random_state=2, lbp=False):
        '''
        Store the configuration; no I/O happens here.

        :param hdf5_path: path of the source HDF5 file
        :param max_artists: keep at most this many artists (-1 keeps all)
        :param random_state: seed used for every NumPy shuffle / random draw
        :param lbp: when True, features are divided by 255 in
            get_dictionary_artists
        '''
        self.hdf5_path = hdf5_path
        self.random_state = random_state
        self.max_artists = max_artists
        self.lbp = lbp

    def load_hdf5(self, min_photos_by_artist=7):
        '''
        Read the 'descriptor' dataset and select the artists to work with.

        :param min_photos_by_artist: artists with fewer rows are discarded
        :return: numpy array with the selected (shuffled) unique artist ids
        '''
        # Read the dataset once and close the file deterministically; the
        # device copy is built from the in-memory host array.
        with h5py.File(self.hdf5_path, 'r') as f:
            self.np_data = np.array(f['descriptor'])
        self.cp_data = cp.array(self.np_data)

        unique_artists, photos_by_artist = np.unique(self.np_data[:, 0], return_counts=True)
        unique_artists = unique_artists[photos_by_artist >= min_photos_by_artist]

        np.random.seed(self.random_state)
        np.random.shuffle(unique_artists)
        if self.max_artists > -1:
            unique_artists = unique_artists[:self.max_artists]

        self.unique_artists = unique_artists
        return self.unique_artists

    def get_dictionary_artists(self, list_unique_artists=None):
        '''
        Build self.dict_artists: artist id -> (features, labels).

        :param list_unique_artists: artists to include; None or an empty
            sequence falls back to self.unique_artists
        :return: the dictionary that was built
        '''
        self.dict_artists = {}
        # len() works for lists and arrays alike, unlike a truthiness test,
        # which is ambiguous for multi-element numpy arrays.
        if list_unique_artists is None or len(list_unique_artists) == 0:
            list_unique_artists = self.unique_artists

        for artist in list_unique_artists:
            # select the rows (photos) that belong to this artist
            dataset = self.cp_data[cp.where(self.cp_data[:, 0] == artist)]
            features = dataset[:, 1:]
            if self.lbp:
                features = features / 255
            # one (x, y) tuple per artist; y keeps the artist id column
            self.dict_artists[artist] = (features, dataset[:, :1])
        return self.dict_artists

    def get_dictionary_kfold_test(self, train_percent=0.8, k=5):
        '''
        Split every artist's rows into train / k folds / test.

        The first floor(n * train_percent) rows of an artist are the training
        part (also split into k folds); the remaining rows are the test part.
        Artists without enough rows are dropped with a warning, and
        self.unique_artists is updated accordingly.

        :param train_percent: fraction of each artist's rows used for training
        :param k: number of folds
        :return: (dict_artists_train, dict_artists_kfold, dict_artists_test)
        '''
        self.kfolds = k
        self.dict_artists_kfold = {}
        self.dict_artists_test = {}
        self.dict_artists_train = {}
        folds_fotos = 0
        test_fotos = 0
        lost_artist = False
        for artist in self.dict_artists.keys():
            artist_x, artist_y = self.dict_artists[artist]
            artist_photos = artist_x.shape[0]
            split_train = math.floor(artist_photos*train_percent)

            if split_train < 1 or artist_photos < 2:
                lost_artist = True
                logging.warning("artista {} nao possui registros suficientes para treino e teste.".format(artist))
                continue
            if split_train < k:
                lost_artist = True
                logging.warning("artista {} nao possui registros suficientes kfold.".format(artist))
                continue

            train_x = artist_x[:split_train]
            train_y = artist_y[:split_train]

            self.dict_artists_train[artist] = (train_x, train_y)
            self.dict_artists_kfold[artist] = (cp.array_split(train_x, k),
                                               cp.array_split(train_y, k))
            self.dict_artists_test[artist] = (artist_x[split_train:], artist_y[split_train:])

            folds_fotos += split_train
            test_fotos += (artist_photos - split_train)

        # dropping artists changes the list of unique artists
        if lost_artist:
            self.unique_artists = list(self.dict_artists_kfold.keys())

        logging.info("add {} artistas / {} fotos ao dict_artists_kfolds.".format(len(self.dict_artists_kfold.keys()), folds_fotos))
        logging.info("add {} artistas / {} fotos ao dict_artists_tests.".format(len(self.dict_artists_test.keys()), test_fotos))

        return self.dict_artists_train, self.dict_artists_kfold, self.dict_artists_test

    def get_dictionary_moc_ecoc_artists(self, ecoc=1, lbp=False):
        '''
        Assign one binary code word to every artist in self.unique_artists.

        The code length is ceil(ceil(log2(n_classes)) * ecoc). With
        ecoc <= 1 (MOC) every value of that bit width is enumerated; with
        ecoc > 1 (ECOC) distinct random values are drawn.

        :param ecoc: multiplier applied to the minimum number of bits
        :param lbp: only selects which descriptor name is logged
        :return: (dict artist -> bit string,
                  list with one (classes_with_bit_0, classes_with_bit_1)
                  tuple per bit position)
        '''
        if ecoc <= 1:
            logging.info("ini treino MOC")
        else:
            logging.info("ini treino ECOC com %fx o numero de bits.", ecoc)

        if lbp:
            logging.info("Descritor: LBP")
        else:
            logging.info("Descritor: HOG")

        qty_classes = len(self.unique_artists)
        logging.info("gerando matriz de binarios para : %i artistas", qty_classes)

        # number of bits (i.e. binary classifiers) per code word
        self.moc_ecoc_qty = math.ceil(math.ceil(math.log2(qty_classes)) * ecoc)

        binary_format = "{0:0" + str(self.moc_ecoc_qty) + "b}"
        max_random_val = 2 ** self.moc_ecoc_qty

        np.random.seed(self.random_state)
        if ecoc > 1:
            randomlist = []
            # Draw until there are enough distinct values. The statement order
            # is kept exactly as it always was so that a given seed keeps
            # producing the same code words.
            while len(set(randomlist)) < qty_classes:
                randomlist = list(set(randomlist))
                n = np.random.randint(0, max_random_val)
                randomlist.append(n)
        else:
            randomlist = list(range(max_random_val))

        matriz_binarios = [binary_format.format(i) for i in randomlist]
        np.random.shuffle(matriz_binarios)

        self.dict_moc_ecoc_artists = {}
        for i in range(qty_classes):
            self.dict_moc_ecoc_artists[self.unique_artists[i]] = matriz_binarios[i]

        logging.info("Matriz de bits para ECOC/MOC {}".format(self.dict_moc_ecoc_artists))

        # one binary problem per bit: classes whose bit is 0 versus bit is 1
        self.list_datasets_moc_ecoc = []
        for i in range(self.moc_ecoc_qty):
            classes_dataset_0 = []
            classes_dataset_1 = []
            for classe, codigo in self.dict_moc_ecoc_artists.items():
                if codigo[i] == "1":
                    classes_dataset_1.append(classe)
                else:
                    classes_dataset_0.append(classe)
            self.list_datasets_moc_ecoc.append((classes_dataset_0, classes_dataset_1))

        return self.dict_moc_ecoc_artists, self.list_datasets_moc_ecoc

    def _stack_classes(self, dict_data, classes, folds=None):
        # Concatenate the (x, y) arrays of `classes` in one call; with `folds`
        # the order is fold-major, i.e. all classes of fold 0, then fold 1, ...
        if folds is None:
            xs = [dict_data[classe][0] for classe in classes]
            ys = [dict_data[classe][1] for classe in classes]
        else:
            xs = [dict_data[classe][0][fold] for fold in folds for classe in classes]
            ys = [dict_data[classe][1][fold] for fold in folds for classe in classes]
        if not xs:
            raise ValueError("no data selected: the class list or the fold list is empty")
        return cp.concatenate(xs), cp.concatenate(ys)

    def _build_binary_problem(self, x_0, y_reais_0, x_1, y_reais_1):
        # Rows of the bit-0 group get label -1, rows of the bit-1 group get +1.
        x_train = cp.concatenate([x_0, x_1])
        y_train = cp.concatenate([cp.full(x_0.shape[0], -1), cp.full(x_1.shape[0], 1)])
        y_classes_reais = cp.concatenate([y_reais_0, y_reais_1])
        return x_train, y_train.reshape(-1, 1), y_classes_reais.reshape(-1, 1)

    def get_moc_ecoc_x_y(self, dict_data, lista_classe_0, lista_classe_1, lbp=False):
        '''
        Build the binary dataset of one code bit.

        :param dict_data: dict class -> (x, y) as produced by this class
        :param lista_classe_0: classes whose bit is 0 (labelled -1)
        :param lista_classe_1: classes whose bit is 1 (labelled +1)
        :param lbp: unused, kept for interface compatibility
        :return: (x, y in {-1, +1} as a column, real class ids as a column)
        :raises ValueError: if either class list is empty
        '''
        x_0, y_reais_0 = self._stack_classes(dict_data, lista_classe_0)
        x_1, y_reais_1 = self._stack_classes(dict_data, lista_classe_1)
        return self._build_binary_problem(x_0, y_reais_0, x_1, y_reais_1)

    def get_moc_ecoc_x_y_fold(self, dict_data, lista_classe_0, lista_classe_1, folds=(), lbp=False):
        '''
        Same as get_moc_ecoc_x_y, but dict_data maps each class to
        (list_of_x_folds, list_of_y_folds) and only the fold indexes in
        `folds` are used.

        :param folds: iterable of fold indexes to include
        :param lbp: unused, kept for interface compatibility
        :return: (x, y in {-1, +1} as a column, real class ids as a column)
        :raises ValueError: if a class list or `folds` is empty
        '''
        folds = list(folds)
        x_0, y_reais_0 = self._stack_classes(dict_data, lista_classe_0, folds)
        x_1, y_reais_1 = self._stack_classes(dict_data, lista_classe_1, folds)
        return self._build_binary_problem(x_0, y_reais_0, x_1, y_reais_1)

    def hamming_distance(self, bit_1, bit_2):
        '''
        Sum of absolute digit differences over the first len(bit_1) positions
        (the Hamming distance when both arguments are '0'/'1' strings).
        '''
        return sum(abs(int(bit_2[x]) - int(bit_1[x])) for x in range(len(bit_1)))

    def min_hamming_distance(self, val, list_vals, max_hamming=-1):
        '''
        Return the element of list_vals closest to val.

        A candidate at distance exactly 1 is returned immediately; otherwise
        the first candidate with the smallest distance wins.

        :param max_hamming: kept for backward compatibility. It used to seed
            the running minimum, which left the result undefined (and raised
            UnboundLocalError) when no candidate was strictly closer than it;
            the search no longer depends on it.
        :raises ValueError: if list_vals is empty
        '''
        min_val = None
        min_hamming = None
        for j in list_vals:
            hd = self.hamming_distance(val, j)
            if hd == 1:
                return j
            if min_hamming is None or hd < min_hamming:
                min_hamming = hd
                min_val = j
        if min_hamming is None:
            raise ValueError("list_vals is empty: there is no code word to compare against")
        return min_val

    def decoder_resultados_oc(self, dict_moc, resultados, y_classes_reais, tipo=""):
        '''
        Decode per-bit predictions into classes and log the accuracy.

        :param dict_moc: dict class -> bit string
        :param resultados: one array per bit; element [i][0] is the '0'/'1'
            string predicted for sample i
        :param y_classes_reais: real class of every sample
        :param tipo: label included in the log message
        :return: the accuracy as a float (it used to be logged only)
        '''
        # inverse dictionary: bit string -> class
        dict_classes = {codigo: classe for classe, codigo in dict_moc.items()}
        codigos = list(dict_classes)

        res_moc = []
        for res in range(len(resultados[0])):
            res_moc.append("".join([res_y[res][0] for res_y in resultados]))

        # Unknown code words tend to repeat; cache their nearest valid code.
        mais_proximo = {}
        res_moc_classes = []
        for x in res_moc:
            if x not in dict_classes:
                if x not in mais_proximo:
                    mais_proximo[x] = self.min_hamming_distance(x, codigos)
                x = mais_proximo[x]
            res_moc_classes.append(dict_classes[x])

        res_moc_classes = cp.array(res_moc_classes).reshape(-1, 1)
        y_classes_reais = y_classes_reais.reshape(-1, 1)

        acuracia_oc = float((res_moc_classes == y_classes_reais).astype(int).mean())
        logging.info("ecoc: acuracia %s %f", tipo, acuracia_oc)
        return acuracia_oc


Mounted at /content/drive


In [1]:
import sys
import logging
import h5py
import cupy as cp
import numpy as np
import math
import random
import datetime
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import logging
import numpy as np
import cupy as cp
import math
from cvxopt import matrix, solvers

class svm_gpu:
    """Binary RBF-kernel SVM trained by working-set decomposition.

    Array work goes through the `cp` module; each working-set quadratic
    program is solved on the host with cvxopt. Labels are expected to be
    -1 / +1.
    """

    def __init__(self, X, Y, config=None):
        '''
        Store the training data and hyper-parameters.

        :param X: training features
        :param Y: desired outputs (-1 / +1)
        :param config: optional dict of hyper-parameters; any key that is
            missing falls back to the default listed below
        '''
        logging.info("instanciado um svm")
        self.X = X
        self.Y = Y
        self.config = config

        defaults = {
            'kkttol': 5e-2,
            'chunksize': 4000,
            'bias': [],
            'sv': [],
            'svcoeff': [],
            'normalw': [],
            'C': 2,
            'h': 0.01,
            'debug': True,
            'alphatol': 1e-2,
            'SVThresh': 0.,
            'qpsize': 512,
            'logs': [],
            'configs': {},
            'kernelpar': 1,  # raising it to between 1 and 2 improved accuracy in tests
            'randomWS': True,
        }
        params = dict(defaults)
        if config:
            params.update(config)
        # Only the known hyper-parameters become attributes.
        for name in defaults:
            setattr(self, name, params[name])

    def prep(self):
        '''
        Derive the state needed by fit() from X / Y: sizes, class masks,
        per-sample C vector and the initial (zero) alphas.
        :return:
        '''
        self.N, self.ne = self.X.shape
        self.class0 = self.Y == -1
        self.class1 = self.Y == 1

        self.Y = self.Y.reshape(self.N, 1)
        self.qpsize = min(self.N, self.qpsize)
        self.C = cp.full((self.N, 1), self.C)
        self.alpha = cp.zeros((self.N, 1))

        if self.debug:
            logging.debug("Working set possui {} exemplos de classe positiva e {} exemplos de classe negativa.".format(self.Y[self.class1].shape[0], self.Y[self.class0].shape[0]))

    def _set_empty_model(self):
        # A single-class problem has no support vectors; keep empty arrays so
        # that predict() and return_instance_for_predict() still work and the
        # decision value is just the bias.
        self.SV = self.X[:0]
        self.svcoeff = cp.zeros((0, 1))

    def fit(self):
        '''
        Train the SVM.
        Code obtained in the SIN5016 course, originally in Matlab, converted
        to Python and to an object-oriented form.

        On return self.SV holds the support vectors, self.svcoeff their
        coefficients (alpha * y) and self.bias the offset.
        :return:
        '''
        logging.info("svm.fit")

        X = self.X
        Y = self.Y
        logging.info('X.shape %s', str(X.shape))
        logging.info('Y.shape %s', str(Y.shape))
        logging.info('config: {}'.format(self.config))

        saida_svm = cp.zeros((self.N,1))
        alphaOld = cp.copy(self.alpha)
        if self.Y[self.class1].shape[0] == self.N:
            self.bias = 1
            self._set_empty_model()
            return

        if self.Y[self.class0].shape[0] == self.N:
            self.bias = -1
            self._set_empty_model()
            return

        iteracao = 0
        workset = cp.full((self.N,1), False)
        sameWS = 0
        self.bias = 0
        while True:
            # Step 1: determine the support vectors
            self.findSV()

            # Step 2: update the SVM output (only for alphas that changed)
            if iteracao == 0:
                changedSV = self.svind # todo: cp.copy(self.svind)
                changedAlpha = self.alpha[changedSV] # todo: cp.copy(self.alpha[changedSV])
                saida_svm = cp.zeros((self.N, 1))
            else:
                changedSV = cp.flatnonzero(alphaOld != self.alpha)
                changedAlpha = self.alpha[changedSV] - alphaOld[changedSV]

            # RBF kernel, evaluated in chunks to bound memory use
            chunks1 = math.ceil(self.N / self.chunksize)
            chunks2 = math.ceil(len(changedSV) / self.chunksize)

            for ch1 in range(chunks1):
                ini_ind1 = (ch1)*self.chunksize
                fim_ind1 = min(self.N, (ch1+1)*self.chunksize)
                for ch2 in range(chunks2):
                    ini_ind2 = (ch2)*self.chunksize
                    fim_ind2 = min(len(changedSV), (ch2+1)*self.chunksize)
                    K12 = self.calc_rbf(X[ini_ind1:fim_ind1, :], X[changedSV][ini_ind2:fim_ind2])
                    coeff = changedAlpha[ini_ind2:fim_ind2]*Y[changedSV[ini_ind2:fim_ind2]]
                    saida_svm[ini_ind1:fim_ind1] = saida_svm[ini_ind1:fim_ind1] + cp.dot(K12, coeff)

            # Step 3: compute the bias of the decision function
            workSV = cp.flatnonzero(cp.logical_and(self.SVnonBound, workset))
            if not workSV.size == 0:
                self.bias = cp.mean(Y[workSV] - saida_svm[workSV])

            # Step 4: evaluate the KKT conditions
            KKT = (saida_svm+self.bias)*self.Y-1
            KKTViolations1 = cp.logical_and(self.SVnonBound, (abs(KKT)>self.kkttol))
            KKTViolations2 = cp.logical_and(self.SVBound, KKT > self.kkttol)
            KKTViolations3 = cp.logical_and(cp.logical_not(self.SV), (KKT < (self.kkttol*-1)))
            KKTViolations = cp.logical_or(KKTViolations1, KKTViolations2)
            KKTViolations = cp.logical_or(KKTViolations, KKTViolations3)

            count_kkt = len(cp.flatnonzero(KKTViolations))

            if iteracao % 100 == 0:
                logging.info('iteracao: {} KKT violacoes: {}'.format(iteracao, count_kkt))

            if count_kkt == 0:
                # no violations left: done
                logging.info('fim do treino por fim das violacoes de KKT, total iteracoes: {}'.format(iteracao))
                break

            # Step 5: candidate sets for the new working set
            searchDir = saida_svm - self.Y
            set1 = cp.logical_and(cp.logical_or(self.SV, self.class0), cp.logical_or(cp.logical_not(self.SVBound), self.class1))
            set2 = cp.logical_and(cp.logical_or(self.SV, self.class1), cp.logical_or(cp.logical_not(self.SVBound), self.class0))

            if self.randomWS:
                # first pass only: random search direction, split by class
                cp.random.seed(0)
                searchDir = cp.random.rand(self.N, 1)
                set1 = self.class1
                set2 = self.class0
                self.randomWS = False

            # Step 6: select the working set
            #         (qpsize/2 examples from set1, qpsize/2 from set2)
            worksetOld = cp.copy(workset)
            workset = cp.full((self.N, 1), False)

            if cp.flatnonzero(cp.logical_or(set1, set2)).size <= self.qpsize:
                workset[cp.logical_or(set1, set2)] = True
            elif cp.flatnonzero(set1).size <= math.floor(self.qpsize/2):
                workset[set1] = True
                set2 = cp.flatnonzero(cp.logical_and(set2, cp.logical_not(workset)))
                ind = searchDir[set2].argsort(0)
                from2 = min(set2.size, self.qpsize - cp.flatnonzero(workset).size)
                workset[set2[ind[:from2]]] = True
            elif cp.flatnonzero(set2).size <= math.floor(self.qpsize/2):
                workset[set2] = True
                set1 = cp.flatnonzero(cp.logical_and(set1, cp.logical_not(workset)))
                # Sort the negated values (descending order). Negating the
                # argsort result instead would turn the positions into
                # negative indexes and select the wrong rows.
                ind = (-searchDir[set1]).argsort(0)
                from1 = min(set1.size, self.qpsize - cp.flatnonzero(workset).size)
                workset[set1[ind[:from1]]] = True
            else:
                set1 = cp.flatnonzero(set1)
                ind = (-searchDir[set1]).argsort(0)
                from1 = min(set1.size, math.floor(self.qpsize /2))
                workset[set1[ind[:from1]]] = True

                set2 = cp.flatnonzero(cp.logical_and(set2, cp.logical_not(workset)))
                ind = searchDir[set2].argsort(0)
                from2 = min(set2.size, self.qpsize - cp.flatnonzero(workset).size)
                workset[set2[ind[:from2]]] = True

            worksetind = cp.flatnonzero(workset)

            if cp.all(workset==worksetOld):
                sameWS +=1
                if sameWS == 3:
                    logging.info('fim do treino por por permanecer no mesmo workingset por: %i iteracoes', sameWS)
                    break
            else:
                sameWS = 0

            worksize = worksetind.size
            nonworkset = cp.logical_not(workset)

            # Step 7: linear term of the QP (contribution of the support
            # vectors that are outside the working set)
            nonworkSV = cp.flatnonzero(cp.logical_and(nonworkset, self.SV))
            qBN = 0
            if nonworkSV.size > 0:
                chunks = math.ceil(nonworkSV.size/self.chunksize)
                for ch in range(chunks):
                    ind_ini = self.chunksize*ch
                    ind_fim = min(nonworkSV.size, self.chunksize*(ch+1))
                    Ki = self.calc_rbf(X[worksetind, :], X[nonworkSV[ind_ini:ind_fim], :])
                    qBN += Ki.dot(self.alpha[nonworkSV[ind_ini:ind_fim]] * Y[nonworkSV[ind_ini:ind_fim]])
                qBN = qBN * Y[workset].reshape(-1,1)

            f = qBN - cp.ones((worksize,1))

            # Step 8: solve the quadratic program
            #   matlab: quadprog(H, f, [], [], A, eqconstr, VLB, VUB, startVal)
            #   cvxopt: qp(P=H, q=f, G, h, A, b) with G x <= h encoding the box
            A = Y[workset].T.astype('float').reshape(1,-1)

            if nonworkSV.size > 0:
                eqconstr = -self.alpha[nonworkSV].T.dot(Y[nonworkSV])
            else:
                eqconstr = cp.zeros(1)

            # Box constraint 0 <= alpha <= C. The upper bound used to be a
            # hard-coded 10 regardless of the configured C, which disagreed
            # with the alpha >= C - alphatol test in findSV.
            VUB = self.C[workset].astype('float')

            start_val = self.alpha[workset]
            start_val = start_val.reshape(worksize, 1)

            _f = matrix(np.array(f.get())) #q

            tmp1 = cp.diag(cp.ones(worksize) * -1)
            tmp2 = cp.identity(worksize)
            G = matrix(np.array(cp.vstack((tmp1, tmp2)).get()))

            tmp1 = cp.zeros(worksize)
            h = matrix(np.array(cp.hstack((tmp1, VUB)).get()))

            _A = matrix(np.array(A.get())) #A
            _eqconstr = matrix(np.array(eqconstr.get())) #b
            _start_val = matrix(np.array(start_val.get()))

            # Kernel matrix of the working set, with a small ridge on the
            # diagonal, scaled by the label outer product.
            H = self.calc_rbf(X[worksetind], X[worksetind])
            eps_2_3 = np.spacing(1) ** (2 / 3)
            H = H + cp.identity(worksize) * eps_2_3
            H = H * cp.dot(Y[workset].reshape(-1,1), (Y[workset].T.reshape(1,-1)))
            _H = matrix(np.array(H.get()).astype(float))
            solvers.options['maxiters'] = 1000
            solvers.options['show_progress'] = self.debug
            sol = solvers.qp(_H, _f, G, h, A=_A, b=_eqconstr, initvals=_start_val)
            workAlpha = cp.array(sol['x'])

            alphaOld = cp.copy(self.alpha)
            self.alpha[workset] = workAlpha.squeeze()
            iteracao += 1

        self.svcoeff = self.alpha[self.svind] * Y[self.svind]
        # from here on self.SV holds the support vectors, not the boolean mask
        self.SV = X[self.svind, :]

    def findSV(self):
        '''
        Flag the support vectors according to the alphatol / threshold rules.
        Sets self.SV (mask), self.SVBound, self.SVnonBound and self.svind.
        :return:
        '''
        maxalpha = self.alpha.max()
        if maxalpha > self.alphatol:
            self.SVThresh = self.alphatol
        else:
            eps_1 = np.spacing(1)
            self.SVThresh = math.exp((math.log(max(eps_1, maxalpha))+ math.log(eps_1))/2)

        self.SV = self.alpha>=self.SVThresh

        self.SVBound = self.alpha>=(self.C-self.alphatol)

        self.SVnonBound = cp.logical_and(self.SV, cp.logical_not(self.SVBound))
        self.svind = cp.flatnonzero(self.SV)

    def calc_rbf(self, X1, X2):
        '''
        RBF kernel matrix exp(-||x1 - x2||^2 / (2 * kernelpar^2)).

        :param X1: array of shape (N1, d)
        :param X2: array of shape (N2, d)
        :return: array of shape (N1, N2)
        '''
        N1, d = X1.shape
        N2, d = X2.shape

        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
        dist2 = cp.tile(cp.sum((X1 ** 2).T, 0), (N2, 1)).T
        dist2 += cp.tile(cp.sum((X2 ** 2).T, 0), (N1, 1))
        dist2 -= 2 * X1.dot(X2.T)
        return cp.exp(-dist2/(2*self.kernelpar**2))

    def return_instance_for_predict(self):
        '''
        Drop the training-only state and return this same instance, keeping
        only what predict() needs. Support vectors and coefficients are cast
        to float16 (presumably to shrink the stored model).
        :return: self
        '''
        self.X = None
        self.Y = None
        self.alphatol = None
        self.kkttol = None
        self.alpha = None
        self.svind = None
        self.class1 = None
        self.class0 = None
        self.C = None
        self.normalw = None
        self.ne = None
        self.SVBound = None
        self.SVnonBound = None
        self.SV = self.SV.astype(cp.float16)
        self.svcoeff = self.svcoeff.astype(cp.float16)
        return self

    def predict(self, X):
        '''
        Classify the rows of X.

        :param X: array of shape (N, d)
        :return: (Y, Y1) where Y1 is the decision value per row (shape
            (N, 1)) and Y its sign, with exact zeros mapped to +1
        '''
        N, d = X.shape
        nbSV = self.SV.shape[0]
        chsize = self.chunksize
        Y1 = cp.zeros((N, 1))
        chunks1 = math.ceil(N / chsize)
        chunks2 = math.ceil(nbSV / chsize)

        for ch1 in range(chunks1):
            ini_ind1 = ch1 * chsize
            # Clip with the size of THIS batch. Clipping with the training
            # size (self.N) left every row beyond it unscored.
            fim_ind1 = min(N, (ch1 + 1) * chsize)
            for ch2 in range(chunks2):
                ini_ind2 = ch2 * chsize
                fim_ind2 = min(nbSV, (ch2 + 1) * chsize)
                K12 = self.calc_rbf(X[ini_ind1:fim_ind1, :], self.SV[ini_ind2:fim_ind2, :])
                Y1[ini_ind1:fim_ind1] += K12.dot(self.svcoeff[ini_ind2:fim_ind2])

        Y1 += self.bias
        Y = cp.sign(Y1)
        Y[Y==0] = 1
        return Y, Y1


def acuracia(y, y_predito):
    '''
    Compute the accuracy.

    :param y: desired y; must hold as many elements as y_predito
    :param y_predito: predicted y
    :return: accuracy between 0 and 1
    '''
    # reshape() returns a new array; its result was previously discarded, so
    # a (N,) target against a (N, 1) prediction was broadcast to an N x N
    # comparison and produced a wrong score.
    y_alinhado = y.reshape(y_predito.shape)
    return (y_alinhado == y_predito).astype(float).mean()

In [3]:
import sys
import logging
import numpy as np
import cupy as cp
import datetime
import pickle


def treino_svm(train, test, config, valida_teste=True):
    '''
    Train one binary SVM and measure its accuracy.

    :param train: tuple (x, y for the svm, real classes)
    :param test: tuple with the same layout; only read when valida_teste
    :param config: hyper-parameter dict handed to svm_gpu
    :param valida_teste: when True the test accuracy is computed as well
    :return: (slimmed model, train accuracy[, test accuracy])
    '''
    started_at = datetime.datetime.now()
    features_train, labels_train, _ = train

    model = svm_gpu(features_train, labels_train, config=config)
    model.debug = False
    model.prep()
    model.fit()

    predicted_train = model.predict(features_train)[0]
    scores = [acuracia(labels_train, predicted_train)]
    logging.info("Treino Acuracia:{}".format(scores[0]))

    if valida_teste:
        features_test, labels_test, _ = test
        predicted_test = model.predict(features_test)[0]
        scores.append(acuracia(labels_test, predicted_test))
        logging.info("Teste Acuracia:{}".format(scores[1]))

    elapsed = datetime.datetime.now() - started_at
    logging.info("tempo treinamento {}".format(elapsed))

    # The training-only state is dropped just before the model is returned.
    return (model.return_instance_for_predict(), *scores)

def _salvar_pickle(obj, caminho):
    # Pickle `obj` to `caminho`, closing the file even if dumping fails.
    with open(caminho, "wb") as arquivo:
        pickle.dump(obj, arquivo)


def _prever_bits(svms, x):
    # One array of '0'/'1' strings per trained SVM: negative side -> '0',
    # positive side -> '1'.
    bits = []
    for svm_treinado in svms:
        y_predito, _ = svm_treinado.predict(x)
        bits.append(np.array((y_predito > 0).get()).astype(int).astype(str))
    return bits


def treino_svm_total(dt_obj, config):
    '''
    Train one SVM per MOC/ECOC bit on the full train split, decode the
    combined predictions for the train and test splits (accuracies are
    logged by the decoder) and pickle the models and the code dictionary.

    :param dt_obj: prepared dataprep instance (train/test dictionaries and
        bit datasets already built)
    :param config: hyper-parameter dict handed to every SVM
    :raises ValueError: if dt_obj has no bit datasets
    '''
    logging.info("ini treino_svm_total")
    ini_full = datetime.datetime.now()

    if len(dt_obj.list_datasets_moc_ecoc) == 0:
        raise ValueError("dt_obj.list_datasets_moc_ecoc is empty: nothing to train")

    moc_svms_full = []
    for svm_dataset in dt_obj.list_datasets_moc_ecoc:
        x_train, y_train_svm, y_train_class = dt_obj.get_moc_ecoc_x_y(dt_obj.dict_artists_train, svm_dataset[0], svm_dataset[1])
        x_test, y_test_svm, y_test_class = dt_obj.get_moc_ecoc_x_y(dt_obj.dict_artists_test, svm_dataset[0], svm_dataset[1])

        svm_pesos, _, _ = treino_svm((x_train, y_train_svm, y_train_class), (x_test, y_test_svm, y_test_class), config=config)
        moc_svms_full.append(svm_pesos)

    qtd_artistas = len(dt_obj.unique_artists)
    _salvar_pickle(moc_svms_full, "5016_svm_hog_moc_gpu_{}.dat".format(qtd_artistas))

    # Every SVM is evaluated on the samples of the LAST bit dataset: all bit
    # datasets contain the same samples (only the row order differs), and the
    # real-class vectors below are aligned with that last ordering.
    resultados = _prever_bits(moc_svms_full, x_train)
    dt_obj.decoder_resultados_oc(dt_obj.dict_moc_ecoc_artists, resultados, y_train_class, "treino")

    resultados_teste = _prever_bits(moc_svms_full, x_test)
    dt_obj.decoder_resultados_oc(dt_obj.dict_moc_ecoc_artists, resultados_teste, y_test_class, "teste")

    fim_full = datetime.datetime.now()
    _salvar_pickle(moc_svms_full, "/content/drive/MyDrive/5016_svm_hog_moc_gpu_ecoc_4_{}.dat".format(qtd_artistas))
    _salvar_pickle(dt_obj.dict_moc_ecoc_artists, "/content/drive/MyDrive/dict_moc_ecoc_artists_ecoc_4_{}.dat".format(qtd_artistas))

    logging.info("tempo treino_svm_total: {}".format(fim_full-ini_full))
    logging.info("fim treino_svm_total")

def kfold_cross_validation(dt_obj, config):
    '''
    Run k-fold cross validation for every MOC/ECOC bit.

    For each bit dataset, each fold is used once as the validation set while
    the remaining folds are used for training; mean accuracies are logged
    per bit and the list of mean validation accuracies is logged at the end.

    :param dt_obj: prepared dataprep instance (k-fold dictionary built)
    :param config: hyper-parameter dict handed to every SVM
    '''
    logging.info("ini kfold_cross_validation")
    started_at = datetime.datetime.now()

    n_folds = dt_obj.kfolds
    mean_test_by_bit = []

    # each iteration handles one bit of the moc/ecoc code
    for classes_0, classes_1 in dt_obj.list_datasets_moc_ecoc:
        scores_train = []
        scores_test = []

        for held_out in range(n_folds):
            other_folds = [f for f in range(n_folds) if f != held_out]

            train_set = dt_obj.get_moc_ecoc_x_y_fold(
                dt_obj.dict_artists_kfold, classes_0, classes_1, other_folds)
            test_set = dt_obj.get_moc_ecoc_x_y_fold(
                dt_obj.dict_artists_kfold, classes_0, classes_1, [held_out])

            _, score_train, score_test = treino_svm(train_set, test_set, config=config)
            scores_train.append(score_train)
            scores_test.append(score_test)

        mean_test_by_bit.append(float(cp.array(scores_test).mean()))
        logging.info("acuracia media kfold treino: {} ".format(cp.array(scores_train).mean()))
        logging.info("acuracia media kfold teste: {} ".format(cp.array(scores_test).mean()))

    finished_at = datetime.datetime.now()
    logging.info("tempo kfold_cross_validation: {}".format(finished_at-started_at))
    logging.info("fim kfold_cross_validation")
    logging.info("kfolds lista acuracia: {}".format(mean_test_by_bit))



# kfold_cross_validation(dt_hog, config)  # needs dt_hog and config to be defined first


print()






In [4]:
import logging

# Log to a file on Drive and mirror every record to the notebook output.
logging.basicConfig(
    filename="/content/drive/MyDrive/svm_workflow_hog_ecoc4.py.log",
    format='%(asctime)s %(message)s',
    level=logging.INFO,
)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

# Data preparation: load descriptors, group by artist, split, assign codes
# using 4x the minimum number of bits.
dt_hog = dataprep(
    hdf5_path="/content/drive/Shareddrives/grupo_estudos/hdf5/hog_11_15_20_56",
    max_artists=2544,
)
dt_hog.load_hdf5()
dt_hog.get_dictionary_artists()
dt_hog.get_dictionary_kfold_test()
dt_hog.get_dictionary_moc_ecoc_artists(ecoc=4)

# SVM hyper-parameters shared by every per-bit classifier.
config = {
    'kkttol': 5e-2,
    'chunksize': 8000,
    'bias': [],
    'sv': [],
    'svcoeff': [],
    'normalw': [],
    'C': 2,
    'h': 0,
    'debug': True,
    'alphatol': 1e-2,
    'SVThresh': 0,
    'qpsize': 256,
    'logs': [],
    'configs': {},
    'kernelpar': 1.6,
    'randomWS': True,
}

treino_svm_total(dt_hog, config)
# kfold_cross_validation(dt_hog, config)

add 2544 artistas / 42634 fotos ao dict_artists_kfolds.
add 2544 artistas / 11660 fotos ao dict_artists_tests.
ini treino ECOC com 4.000000x o numero de bits.
Descritor: HOG
gerando matriz de binarios para : 2544 artistas
Matriz de bits para ECOC/MOC {2890.0: '001010001011110110011111011000100100111101100111', 2364.0: '101101100000001101001110101011111110000011111011', 7463.0: '010100101101001111111101111100101010111000000100', 8717.0: '001001010110110001010010101111111000010010010110', 5323.0: '010111101001110000101111100000101100111100010111', 2759.0: '101111100000111010100000101101011110110001010010', 2221.0: '010010001110001110010101110011011000111001100000', 5865.0: '001101111001110111110000101011100111101000011011', 256.0: '100111101001110011010101101110110110001000101011', 3491.0: '101110011000000001111100111101100110111100100100', 5193.0: '000101100111111111101100010011011101011011001100', 7232.0: '010001110000101011111111110100110011101110111111', 1789.0: '10100101001000001010

In [5]:
# import logging
# logging.basicConfig(filename="svm_workflow_lbp.py.log", 
#                     format='%(asctime)s %(message)s', 
#                     level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
# dt_lbp = dataprep(hdf5_path="/content/drive/Shareddrives/grupo_estudos/hdf5/lbp_grid_total",
#                              max_artists=32, lbp=True)

# dt_lbp.load_hdf5()
# dt_lbp.get_dictionary_artists()
# dt_lbp.get_dictionary_kfold_test()
# dt_lbp.get_dictionary_moc_ecoc_artists(ecoc=1)

# config = {}
# config['kkttol'] = 5e-2
# config['chunksize'] = 8000
# config['bias'] = []
# config['sv'] = []
# config['svcoeff'] = []
# config['normalw'] = []
# config['C'] = 10
# config['h'] = 0
# config['debug'] = True
# config['alphatol'] = 1e-2
# config['SVThresh'] = 0
# config['qpsize'] = 256
# config['logs'] = []
# config['configs'] = {}
# config['kernelpar'] = 1
# config['randomWS'] = True

# treino_svm_total(dt_lbp, config)
# kfold_cross_validation(dt_lbp, config)