In [1]:
import numpy as np
from time import sleep, time
import matplotlib.pyplot as plt
import pickle as pickle

In [2]:

class SOM:
    """Self-organizing map (SOM) over a 2-D grid of weight vectors.

    Every node of the grid holds one weight vector with as many components as
    an input sample has columns. The whole class lives in a single definition
    so that `train` and the helper methods are real methods of `SOM`.
    """

    wNodes = None  # weight tensor, shape (grid[0], grid[1], dim)

    alpha0 = None  # initial learning rate
    sigma0 = None  # initial neighbourhood radius
    dataIn = None  # training samples, one sample per row
    grid = None    # [rows, cols] of the node grid

    def __init__(self, dataIn, grid=[10,10], alpha=0.1, sigma=None):
        """Create a map with uniformly random weights in [-1, 1).

        Parameters:
            dataIn: 2-D array-like of samples (rows) by features (columns).
            grid:   two-element sequence [rows, cols] of the node grid.
            alpha:  initial learning rate.
            sigma:  initial neighbourhood radius; defaults to max(grid) / 2.
        """
        # Convert first, so that nested lists are accepted as well as arrays.
        dataIn = np.asarray(dataIn)
        dim = dataIn.shape[1]  # number of features per sample

        # One weight vector of length `dim` per grid node.
        self.wNodes = np.random.uniform(-1, 1, [grid[0], grid[1], dim])

        self.alpha0 = alpha
        if sigma is None:
            self.sigma0 = max(grid) / 2.0
        else:
            self.sigma0 = sigma

        self.dataIn = dataIn
        # Store a copy so the shared default list is never aliased by instances.
        self.grid = list(grid)

    def train(self, maxIt=100, verbose=True, analysis=False, timeSleep=0.5):
        """Fit the weights to `self.dataIn`, updating `self.wNodes` in place.

        Parameters:
            maxIt:     number of epochs (full passes over the samples).
            verbose:   print the epoch number and a remaining-time estimate
                       (duration of the previous epoch times epochs left).
            analysis:  print detailed diagnostics for every node update and
                       pause `timeSleep` seconds after each one.
            timeSleep: pause length in seconds, used only when `analysis`.
        """
        nSamples = self.dataIn.shape[0]
        m = self.wNodes.shape[0]
        n = self.wNodes.shape[1]

        # The time constant is loop-invariant, so compute it once.
        # NOTE(review): the decay only shrinks alpha/sigma when sigma0 > 1,
        # because log(sigma0) must be positive; confirm callers respect this.
        timeCte = (maxIt/np.log(self.sigma0))
        if analysis:
            print ('timeCte = ', timeCte)

        timeInit = 0
        timeEnd = 0
        for epc in range(maxIt):
            # Learning rate and radius decay exponentially with the epoch.
            alpha = self.alpha0*np.exp(-epc/timeCte)
            sigma = self.sigma0 * np.exp(-epc/timeCte)

            if verbose:
                print ('Epoch: ', epc, ' - Expected time: ', (timeEnd-timeInit)*(maxIt-epc), ' sec')

            timeInit = time()

            for k in range(nSamples):
                sample = self.dataIn[k,:]

                # Best-matching node for this sample.
                matDist = self.distance(sample, self.wNodes)
                posWin = self.getWinNodePos(matDist)

                for i in range(m):
                    for j in range(n):
                        # Grid distance between this node and the winner.
                        dNode = self.getDistanceNodes([i,j], posWin)

                        # Gaussian neighbourhood weight.
                        h = np.exp ((-dNode**2)/(2*sigma**2))

                        if analysis:
                            # Snapshot taken before the update so the
                            # diagnostics report the true "before" value.
                            wBefore = self.wNodes[i,j,:].copy()

                        # Move the node towards the sample.
                        deltaW = (alpha*h*(sample - self.wNodes[i,j,:]))
                        self.wNodes[i,j,:] += deltaW

                        if analysis:
                            print ('Epoch = ', epc)
                            print ('Sample = ', k)
                            print ('-------------------------------')
                            print ('alpha = ', alpha)
                            print ('sigma = ', sigma)
                            print ('h = ',  h)
                            print ('-------------------------------')
                            print ('Winner Node = [', posWin[0],', ',posWin[1],']')
                            print ('Current Node = [',i,', ',j,']')
                            print ('dist. Nodes = ', dNode)
                            print ('deltaW = ', deltaW   )
                            print ('wNode before = ', wBefore)
                            print ('wNode after = ', self.wNodes[i,j,:])
                            print ('\n')
                            sleep(timeSleep)

            timeEnd = time()

    # This code uses the Euclidean distance. You may change this distance, if you want to.
    def distance(self, a, b):
        """Euclidean distance between `a` and every vector along axis 2 of `b`.

        `a` is broadcast against `b`; the reduced axis is kept, so for the
        (rows, cols, dim) weight tensor the result has shape (rows, cols, 1).
        """
        return np.sqrt(np.sum((a-b)**2, 2, keepdims=True))

    def getDistanceNodes(self, n1, n2):
        """Euclidean distance between two grid positions given as sequences."""
        n1 = np.asarray(n1)
        n2 = np.asarray(n2)
        return np.sqrt(np.sum((n1-n2)**2))

    def getWinNodePos(self, dists):
        """Return (row, col) of the smallest entry of `dists`.

        The flat argmin is decoded with the real shape of `dists`, so the
        result is correct for non-square grids too.
        """
        arg = dists.argmin()
        pos = np.unravel_index(arg, dists.shape)
        return pos[0], pos[1]

    def getCentroid(self, data):
        """Return, for each row of `data`, the (row, col) of its winner node."""
        data = np.asarray(data)
        N = data.shape[0]
        centroids = list()

        for k in range(N):
            matDist = self.distance(data[k,:], self.wNodes)
            centroids.append(self.getWinNodePos(matDist))

        return centroids

    # Methods to save and load trained nodes
    def saveTrainedSOM(self, fileName='trainedSOM.csv'):
        """Write the weights as text, one node per line in row-major order.

        np.savetxt only accepts 1-D or 2-D arrays, so the (rows, cols, dim)
        tensor is flattened to (rows*cols, dim) before writing.
        """
        dim = self.wNodes.shape[-1]
        np.savetxt(fileName, self.wNodes.reshape(-1, dim))

    def setTrainedSOM(self, fileName):
        """Load weights written by `saveTrainedSOM` into `self.wNodes`.

        The flat table is reshaped with `self.grid`, so this instance must
        have been created with the same grid as the one that was saved.
        """
        flat = np.loadtxt(fileName)
        self.wNodes = flat.reshape(self.grid[0], self.grid[1], -1)

In [8]:
# Serialization helper for the trained structures
def serialise (obj,name):
    """Pickle `obj` into the file `<name>.pkl`.

    The file is opened in a `with` block so it is closed even when pickling
    raises.
    """
    with open(name+ '.pkl', 'wb') as f:
        p = pickle.Pickler(f)
        # Fast mode disables the pickler memo; it must not be used with
        # self-referential objects. Kept so the written bytes are unchanged.
        p.fast= True
        p.dump(obj)

In [None]:
# Run the training once per text representation and pickle each trained map
def _trainAndSave(matrixPath, pickleName):
    """Load a matrix from text, train a 20x30 SOM on it and pickle the result.

    Returns the loaded matrix and the trained SOM, in that order.
    """
    matrix = np.loadtxt(matrixPath)
    som = SOM(matrix, [20,30], alpha=0.3)
    som.train(maxIt=5)
    serialise(som, pickleName)  # saved in .pkl format
    return matrix, som


matriz_tf, s1 = _trainAndSave('./tf_matrix_abstracts.txt', "tf_SOM_2k_abstracts")

matriz_tfidf, s2 = _trainAndSave('./tfidf_matrix_abstracts.txt', "tfidf_SOM_abstracts")

matriz_bn, s3 = _trainAndSave('./bn_matrix_abstracts.txt', "bn_SOM_abstracts")

In [None]:

# Visualization of the trained neuron grids
def _loadTrainedSOM(path):
    """Unpickle a trained SOM from `path`, closing the file afterwards."""
    # pickle.load can execute arbitrary code: only load files written by
    # this notebook, never files from an untrusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _plotSOM(som, title):
    """Scatter-plot two slices of the SOM weights under the given title."""
    # NOTE(review): SOM.__init__ builds wNodes as a 3-D (rows, cols, dim)
    # array, so [:, 0] and [:, 2] select the weight vectors of grid columns
    # 0 and 2 rather than two weight components - confirm this is intended.
    plt.scatter(som.wNodes[:, 0], som.wNodes[:,2], s = 1)
    plt.title(title)
    plt.show()


mTf  = _loadTrainedSOM('tf_SOM_2k_abstracts.pkl')
mTfIdf  = _loadTrainedSOM('tfidf_SOM_abstracts.pkl')
mBn  = _loadTrainedSOM('bn_SOM_abstracts.pkl')

_plotSOM(mTf, 'Representação TF SOM')

_plotSOM(mTfIdf, 'Representação Tf-Idf SOM')

_plotSOM(mBn, 'Representação Binária SOM')