In [1]:
import os
import json
import datetime

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import pandas as pd

import pyperclip
from IPython.display import clear_output, display
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

# Importação do banco de dados de Prestação de Contas

In [2]:
data_file = "receitas_candidatos_prestacao_contas_final_2016_brasil.txt"
if os.name != 'posix':    
    dataset = data_file
else:        
    dataset = f"Dados/Pretação de Contas/prestacao_contas_final_2016/{data_file}"

* Importa ignorando os valores nulos

In [3]:
dados_BR = pd.read_csv(dataset, encoding="ISO-8859-1", delimiter=";", na_values="#NULO",
                       usecols=["Descricao da receita"]).dropna()
dados_BR.head()
num_linhas = dados_BR.shape[0]

print(f"Importado o arquivo '{dataset}' contendo {num_linhas} linhas.")

Importado o arquivo 'Dados/Pretação de Contas/prestacao_contas_final_2016/receitas_candidatos_prestacao_contas_final_2016_brasil.txt' contendo 1944398 linhas.


# Tratamento dos Dados

* Remove espaços em branco e aspas antes de depois das descrições e converte para letras minúsculas

In [4]:
def limpa_descricao(descricao):
    return descricao.strip().strip("'").lower()    

In [5]:
dados_BR['Descricao da receita'] = dados_BR['Descricao da receita'].map(limpa_descricao)
dados_BR.head()

Unnamed: 0,Descricao da receita
0,foto para campanha
2,doacao de santinhos
6,serviços de contabilidade
7,serviços advocatícios
8,constituição de advogado para prestação de con...


* Remove as entradas duplicadas

In [6]:
dados_BR.drop_duplicates(inplace=True)
print(f"Gerado dataset com {dados_BR.shape[0]} linhas únicas.")

Gerado dataset com 584428 linhas únicas.


# Extrai Amostra Aleatória para Classificação e Treinamento do Modelo

In [7]:
TAMANHO_DA_AMOSTRA = 999

In [8]:
model_data = dados_BR.sample(frac=1, random_state=123).reset_index(drop=True).loc[:TAMANHO_DA_AMOSTRA]
model_data['Classificacao'] = None

* Interface para captura das entradas de classificação

In [9]:
global index_ocorrencia
index_ocorrencia = None
classificadores_db = "classificadores.txt"

def exporta_dados_classificados():
    # Salva as ocorrências já classificadas em csv para recuperação
    model_data.query("~Classificacao.isnull()").to_csv("model_data.csv", index=False, sep=";")

# busca os classificadores no arquivo
try:
    with open(classificadores_db, "r", encoding="utf-8") as f:
        classificadores = sorted(c.strip() for c in f.readlines())
        dropdown_options = [""] + classificadores
except FileNotFoundError:
    dropdown_options = [""]

drop_down = widgets.Dropdown(
    options=dropdown_options,
    value="",
    description="Classificadores:",
)

txt = widgets.Text(description="Novo Classificador")
button = widgets.Button(description="Classificar")

# Botão adicionar classificador
botao_adicionar_classificador = widgets.Button(description="Novo Classificador")
def inserir_novo_classificador(b):
    TXT_IS_NOT_EMPTY = txt.value != ""
    if TXT_IS_NOT_EMPTY:
        novo_classificador = txt.value.lower().strip()
        drop_down.options = sorted(drop_down.options + [novo_classificador])
        txt.value = ""
        save_to_file(novo_classificador)
        print('Novo classificador cadastrado.')

# Botão carrega a próxima ocorrência
botao_proximo = widgets.Button(description="Próximo")
def proxima_ocorrencia(b):
    clear_output()
    global index_ocorrencia
    index_ocorrencia = model_data.query("Classificacao.isnull()").head(1).index[0]    
    ocorrencia = model_data.loc[index_ocorrencia]['Descricao da receita']
    print(f"{index_ocorrencia} - {ocorrencia}")


def save_to_file(ocorrencia):
    with open(classificadores_db, "a") as f:
        f.write(ocorrencia + "\n")
    
    
def classifica_ocorrencia(b):
    global index_ocorrencia
    TXT_IS_EMPTY = ""
    drop_down.value == ""
    clear_output()
    try:
        classificador = drop_down.value
        if TXT_IS_EMPTY:
            print("Erro: Classificador não selecionado.")
        else:            
            if index_ocorrencia is not None:
                model_data.loc[index_ocorrencia]['Classificacao'] = classificador
                print(f"Ocorrência classificada com sucesso como {classificador}.")            
                index_ocorrencia = None
                drop_down.value = ""
                drop_down.selected_label = ""
                exporta_dados_classificados()
            else:
                print("Ocorrência não selecionada.")
    except (UnboundLocalError, NameError):
        print("Erro. Primeiro selecione a próxima ocorrência.")
    

def on_change(change, names="value"):    
    pass
   
drop_down.observe(on_change)
button.on_click(classifica_ocorrencia)
botao_proximo.on_click(proxima_ocorrencia)
botao_adicionar_classificador.on_click(inserir_novo_classificador)
display(drop_down, txt, button, botao_proximo, botao_adicionar_classificador)

40 - perfurite adesivo 43x90


In [27]:
dados_classificados = model_data.query("~Classificacao.isnull()")

## Cria as classes para treinamento do modelo

In [28]:
training_data = []
for i in range(0, len(dados_classificados)):
    sentence, sentence_class = dados_classificados.loc[i]
    if sentence_class != "Não Classificado":
        training_data.append({"class": sentence_class.strip(), "sentence": sentence})    

print ("%s sentences in training data" % len(training_data))

40 sentences in training data


In [29]:
words = []
classes = []
documents = []
ignore_words = ['?']
# loop through each sentence in our training data
for pattern in training_data:
    # tokenize each word in the sentence
    w = nltk.word_tokenize(pattern['sentence'])
    # add to our words list
    words.extend(w)
    # add to documents in our corpus
    documents.append((w, pattern['class']))
    # add to our classes list
    if pattern['class'] not in classes:
        classes.append(pattern['class'])

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = list(set(words))

# remove duplicates
classes = list(set(classes))

print (len(documents), "documents")
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)

40 documents
10 classes ['7319-0/04 - consultoria em publicidade', '6911-7/01 - serviços advocatícios', '1813-0/01 - impressão de material para uso publicitário', '7319-0/03 - marketing direto', '7319-0/02 - promoção de vendas', '7711-0/00 - locação de automóveis sem condutor', '5911-1/02 - produção de filmes para publicidade', '6920-6/01 - atividades de contabilidade', '9700-5/00 - serviços domésticos', '4923-0/02 - serviço de transporte de passageiros - locação de automóveis com motorista']
228 unique stemmed words ['campanh', '2013/2013', 'marc', '3930·', 'cento', '2010/11-cor', 'da', 'impresso', 'período', 'à', 'refr', 'cessão', 'serviço', 'medindo', 'cessao', 'ren', 'assessor', '10x7-4x4', 'dobrad', 'gm', 'gol', '75g', 'sua', '18:00h', 'verde-placa', '0.80x0.40', 'gm/s10', '01/10/20166', '09:00', 'az', '1.6', 'ano:2008/2009', 'das', '6957', 'técnicos', 'santinho', 'flex', 'ajb-0582', '2011', 'tutuc', '02/10/2016', 'njo3352', 'dobradinh', 'parachoqu', 'plac', '1994·', 'de', 'perfur

In [30]:
# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # create our bag of words array
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    training.append(bag)
    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)

print ("# words", len(words))
print ("# classes", len(classes))

# words 228
# classes 10


In [31]:
# sample training/output
i = 0
w = documents[i][0]
print ([stemmer.stem(word.lower()) for word in w])
print (training[i])
print (output[i])

['aquisiçao', 'de', 'adesivo', 'flexivel', '(', 'l:0·50', 'x', 'a:1·00', ')', 'propagand', 'eleit']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


In [32]:
import numpy as np
import time

# compute sigmoid nonlinearity
def sigmoid(x):
    output = 1/(1+np.exp(-x))
    return output

# convert output of sigmoid function to its derivative
def sigmoid_output_to_derivative(output):
    return output*(1-output)
 
def clean_up_sentence(sentence):
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=False):
    # tokenize the pattern
    sentence_words = clean_up_sentence(sentence)
    # bag of words
    bag = [0]*len(words)  
    for s in sentence_words:
        for i,w in enumerate(words):
            if w == s: 
                bag[i] = 1
                if show_details:
                    print ("found in bag: %s" % w)

    return(np.array(bag))

def think(sentence, show_details=False):
    x = bow(sentence.lower(), words, show_details)
    if show_details:
        print ("sentence:", sentence, "\n bow:", x)
    # input layer is our bag of words
    l0 = x
    # matrix multiplication of input and hidden layer
    l1 = sigmoid(np.dot(l0, synapse_0))
    # output layer
    l2 = sigmoid(np.dot(l1, synapse_1))
    return l2

In [33]:
# ANN and Gradient Descent code from https://iamtrask.github.io//2015/07/27/python-network-part2/
def train(X, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=False, dropout_percent=0.5):

    print ("Training with %s neurons, alpha:%s, dropout:%s %s" % (hidden_neurons, str(alpha), dropout, dropout_percent if dropout else '') )
    print ("Input matrix: %sx%s    Output matrix: %sx%s" % (len(X),len(X[0]),1, len(classes)) )
    np.random.seed(1)

    last_mean_error = 1
    # randomly initialize our weights with mean 0
    synapse_0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
    synapse_1 = 2*np.random.random((hidden_neurons, len(classes))) - 1

    prev_synapse_0_weight_update = np.zeros_like(synapse_0)
    prev_synapse_1_weight_update = np.zeros_like(synapse_1)

    synapse_0_direction_count = np.zeros_like(synapse_0)
    synapse_1_direction_count = np.zeros_like(synapse_1)
        
    for j in iter(range(epochs+1)):

        # Feed forward through layers 0, 1, and 2
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))
                
        if(dropout):
            layer_1 *= np.random.binomial([np.ones((len(X),hidden_neurons))],1-dropout_percent)[0] * (1.0/(1-dropout_percent))

        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # how much did we miss the target value?
        layer_2_error = y - layer_2

        if (j% 10000) == 0 and j > 5000:
            # if this 10k iteration's error is greater than the last iteration, break out
            if np.mean(np.abs(layer_2_error)) < last_mean_error:
                print ("delta after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))) )
                last_mean_error = np.mean(np.abs(layer_2_error))
            else:
                print ("break:", np.mean(np.abs(layer_2_error)), ">", last_mean_error )
                break
                
        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2)

        # how much did each l1 value contribute to the l2 error (according to the weights)?
        layer_1_error = layer_2_delta.dot(synapse_1.T)

        # in what direction is the target l1?
        # were we really sure? if so, don't change too much.
        layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)
        
        synapse_1_weight_update = (layer_1.T.dot(layer_2_delta))
        synapse_0_weight_update = (layer_0.T.dot(layer_1_delta))
        
        if(j > 0):
            synapse_0_direction_count += np.abs(((synapse_0_weight_update > 0)+0) - ((prev_synapse_0_weight_update > 0) + 0))
            synapse_1_direction_count += np.abs(((synapse_1_weight_update > 0)+0) - ((prev_synapse_1_weight_update > 0) + 0))        
        
        synapse_1 += alpha * synapse_1_weight_update
        synapse_0 += alpha * synapse_0_weight_update
        
        prev_synapse_0_weight_update = synapse_0_weight_update
        prev_synapse_1_weight_update = synapse_1_weight_update

    now = datetime.datetime.now()

    # persist synapses
    synapse = {'synapse0': synapse_0.tolist(), 'synapse1': synapse_1.tolist(),
               'datetime': now.strftime("%Y-%m-%d %H:%M"),
               'words': words,
               'classes': classes
              }
    synapse_file = "synapses.json"

    with open(synapse_file, 'w') as outfile:
        json.dump(synapse, outfile, indent=4, sort_keys=True)
    print ("saved synapses to:", synapse_file)

Melhor delta:
* 10 - 0.00180662153035
* 40 - 0.00086592825509

In [34]:
X = np.array(training)
y = np.array(output)

start_time = time.time()

train(X, y, hidden_neurons=20, alpha=0.1, epochs=100000, dropout=False, dropout_percent=0.2)

elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")

Training with 20 neurons, alpha:0.1, dropout:False 
Input matrix: 40x228    Output matrix: 1x10
delta after 10000 iterations:0.0029671128153
delta after 20000 iterations:0.00203083238842
delta after 30000 iterations:0.00163396223691
delta after 40000 iterations:0.0014022304734
delta after 50000 iterations:0.00124612830023
delta after 60000 iterations:0.00113195644206
delta after 70000 iterations:0.00104384503753
delta after 80000 iterations:0.000973219884434
delta after 90000 iterations:0.000914992394237
delta after 100000 iterations:0.000865928255099
saved synapses to: synapses.json
processing time: 31.412716150283813 seconds


In [35]:
# probability threshold
ERROR_THRESHOLD = 0.2
# load our calculated synapse values
synapse_file = 'synapses.json' 
with open(synapse_file) as data_file: 
    synapse = json.load(data_file) 
    synapse_0 = np.asarray(synapse['synapse0']) 
    synapse_1 = np.asarray(synapse['synapse1'])

def classify(sentence, show_details=False):
    results = think(sentence, show_details)

    results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD ] 
    results.sort(key=lambda x: x[1], reverse=True) 
    return_results =[[classes[r[0]],r[1]] for r in results]
    print ("%s \n classification: %s" % (sentence, return_results))
    return return_results


test_data = dados_BR.sample(frac=1).reset_index(drop=True).loc[:10]
for i in range(0, 10):
    classify(test_data.reset_index().ix[i, "Descricao da receita"])

doação de 01 (uma) paródia 
 classification: [['1813-0/01 - impressão de material para uso publicitário', 0.56227048734157359], ['7711-0/00 - locação de automóveis sem condutor', 0.29628238762585718]]
serviço de motorista no veiculo vw saveiro 1.6 ce cross· 2011/2012· cor branca 
 classification: [['9700-5/00 - serviços domésticos', 0.24892744791932553]]
impresso adesivo vinil 10x30 para choque 
 classification: [['1813-0/01 - impressão de material para uso publicitário', 0.89342855533661281]]
honda/nxr125 bros es· placa ily5698· renavam 00833521179· chassi 9c2jd20245020829· ano/modelo 2004 
 classification: [['7711-0/00 - locação de automóveis sem condutor', 0.96165490766400152]]
50 por cento da arte de santinhos 10x7 - 4/4 cores c/ neilton c/ jair suspensao 
 classification: [['1813-0/01 - impressão de material para uso publicitário', 0.43269343817793199]]
adesivo 30x9cm rose modesto e niele barbieri 
 classification: [['1813-0/01 - impressão de material para uso publicitário', 0.928