In [1]:
'''from google.colab import drive
drive.mount('/content/drive')'''

"from google.colab import drive\ndrive.mount('/content/drive')"

In [2]:
import numpy as np
import pandas as pd
import string
import spacy
import re
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Model
import random
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
import os
import time

Using TensorFlow backend.


In [3]:
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

# ETAPA DE CARREGAMENTO DO DATASET

In [4]:
'''data = pd.read_json('/content/drive/My Drive/Colab Notebooks/tcc1.json', encoding='utf-8')'''
data = pd.read_json('tcc1.json', encoding='utf-8')

# ETAPA DE PRÉ-PROCESSAMENTO

In [5]:
titulo_input = ['<start> ' + m + ' <end>' for m in data.título.tolist()]
noticia_input = ['<start> ' + m + ' <end>' for m in data.texto.tolist()]

# ETAPA DE TOKENIZAÇÃO

In [6]:
def token(texto, tam_max):
    tokens = tf.keras.preprocessing.text.Tokenizer(lower=True, filters='', num_words=2**16)
    tokens.fit_on_texts(texto)
    tensor = tokens.texts_to_sequences(texto)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', maxlen=tam_max)
    return tensor, tokens

In [7]:
data_input_tokens, data_input = token(noticia_input, tam_max=600)
data_target_tokens, target_input = token(titulo_input, tam_max=20)

In [8]:
num_encoder_tokens = data_input_tokens.shape[1]
num_decoder_tokens = data_target_tokens.shape[1]

# Divisão dos dados

In [9]:
input_data_train, input_data_test, input_decoder_train, input_decoder_test = train_test_split(data_input_tokens, data_target_tokens, test_size=0.2)

# Variáveis de configuração da rede

In [10]:
buffer = len(input_data_train)
batch = 16
steps_por_epoca = len(input_data_train)//batch
embedding_dim = 200
units = 1024
vocab_size_input = len(data_input.word_index)+1
vocab_size_target = len(target_input.word_index)+1

In [11]:
dataset = tf.data.Dataset.from_tensor_slices((input_data_train, input_decoder_train)).shuffle(buffer)
dataset = dataset.batch(batch, drop_remainder=True)

In [12]:
example_input_batch, example_target_batch = next(iter(dataset))

# Arquitetura da Rede

In [13]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, emb_dim, units, batch):
        super(Encoder, self).__init__()
        self.batch = batch
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, emb_dim)
        self.lstm = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        print('x entrada encoder', x)
        print('hidden entrada:', hidden)
        x = self.embedding(x)
        print('x encoder:', x)
        output, state = self.lstm(x, initial_state = hidden)
        print('output encoder:', output)
        print('state encoder:', state)
        return output, state
    
    def intializer_hidden_state(self):
        return tf.zeros((self.batch, self.units))

In [14]:
encoder = Encoder(vocab_size_input, embedding_dim, units, batch)

In [15]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [16]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, emb_dim, units, batch):
        super(Decoder, self).__init__()
        self.batch = batch
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, emb_dim)
        self.lstm = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.units)

    def call(self, x, hidden, output):
        print('x decoder entrada:', x)
        print('hidden decoder entrada:', hidden)
        print('output decoder entrada:', output)
        context_vector, attention_weights = self.attention(hidden, output)
        x = self.embedding(x)
        print('x decoder:', x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.lstm(x)
        print('output decoder:', output)
        output = tf.reshape(output, (-1, output.shape[2]))
        return output, state, attention_weights

In [17]:
decoder = Decoder(vocab_size_target, embedding_dim, units, batch)

In [18]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

optimizer = tf.keras.optimizers.Adam()

def erro(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    #print(mask.shape[0])
    loss_ = loss_object(real, pred)
    #print(loss_)
    mask = tf.cast(mask.shape[0], dtype=loss_.dtype)
    #print(type(mask))
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [19]:
checkpoint_dir = './treinamento_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=tf.keras.optimizers.Adam(), encoder=encoder, decoder=decoder)

In [20]:
@tf.function
def treino(input_data, target, hidden):
    loss = 0
    batch = 16
    with tf.GradientTape() as tape:
        encoder_output, encoder_hidden = encoder(input_data, hidden)
        decoder_input = tf.expand_dims([target_input.word_index['<start>']] * batch, 1)

        for t in range(1, target.shape[1]):
            predictions, decoder_hidden, _ = decoder(decoder_input, encoder_hidden, encoder_output)
            #print(type(predictions))
            #print(target[:, t])
            loss += erro(target[:, t], predictions)
            #print(type(loss))
            # using teacher forcing
            decoder_input = tf.expand_dims(target[:, t], 1)
    
    batch_loss = (loss / int(target.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [21]:
epocas = 20
erro_acumulado = []
for e in range(epocas):
    inicio = time.time()
    hidden = encoder.intializer_hidden_state()
    erro_total = 0
    for (batch, (input_data, target)) in enumerate(dataset.take(steps_por_epoca)):
        batch_loss = treino(input_data, target, hidden)
        erro_total += batch_loss
    
    print('Epoca {} Erro {:.04f}'.format(e+1, erro_total/steps_por_epoca))
    erro_acumulado.append(erro_total/steps_por_epoca)
    print('Para uma época levou {:.04f}s'.format(time.time() - inicio))
checkpoint.save(file_prefix = checkpoint_prefix)

x entrada encoder Tensor("input_data:0", shape=(16, 600), dtype=int32)
hidden entrada: Tensor("hidden:0", shape=(16, 1024), dtype=float32)
x encoder: Tensor("encoder/embedding/embedding_lookup/Identity:0", shape=(16, 600, 200), dtype=float32)
output encoder: Tensor("encoder/gru/transpose_1:0", shape=(16, 600, 1024), dtype=float32)
state encoder: Tensor("encoder/gru/while:4", shape=(16, 1024), dtype=float32)
x decoder entrada: Tensor("ExpandDims:0", shape=(16, 1), dtype=int32)
hidden decoder entrada: Tensor("encoder/gru/while:4", shape=(16, 1024), dtype=float32)
output decoder entrada: Tensor("encoder/gru/transpose_1:0", shape=(16, 600, 1024), dtype=float32)
x decoder: Tensor("decoder/embedding_1/embedding_lookup/Identity:0", shape=(16, 1, 200), dtype=float32)
output decoder: Tensor("decoder/gru_1/transpose_1:0", shape=(16, 1, 1024), dtype=float32)
x decoder entrada: Tensor("ExpandDims_1:0", shape=(16, 1), dtype=int32)
hidden decoder entrada: Tensor("encoder/gru/while:4", shape=(16, 102

x decoder entrada: Tensor("ExpandDims_17:0", shape=(16, 1), dtype=int32)
hidden decoder entrada: Tensor("encoder/gru/while:4", shape=(16, 1024), dtype=float32)
output decoder entrada: Tensor("encoder/gru/transpose_1:0", shape=(16, 600, 1024), dtype=float32)
x decoder: Tensor("decoder_17/embedding_1/embedding_lookup/Identity:0", shape=(16, 1, 200), dtype=float32)
output decoder: Tensor("decoder_17/gru_1/transpose_1:0", shape=(16, 1, 1024), dtype=float32)
x decoder entrada: Tensor("ExpandDims_18:0", shape=(16, 1), dtype=int32)
hidden decoder entrada: Tensor("encoder/gru/while:4", shape=(16, 1024), dtype=float32)
output decoder entrada: Tensor("encoder/gru/transpose_1:0", shape=(16, 600, 1024), dtype=float32)
x decoder: Tensor("decoder_18/embedding_1/embedding_lookup/Identity:0", shape=(16, 1, 200), dtype=float32)
output decoder: Tensor("decoder_18/gru_1/transpose_1:0", shape=(16, 1, 1024), dtype=float32)
x entrada encoder Tensor("input_data:0", shape=(16, 600), dtype=int32)
hidden entrad

x decoder entrada: Tensor("ExpandDims_15:0", shape=(16, 1), dtype=int32)
hidden decoder entrada: Tensor("encoder/gru/while:4", shape=(16, 1024), dtype=float32)
output decoder entrada: Tensor("encoder/gru/transpose_1:0", shape=(16, 600, 1024), dtype=float32)
x decoder: Tensor("decoder_15/embedding_1/embedding_lookup/Identity:0", shape=(16, 1, 200), dtype=float32)
output decoder: Tensor("decoder_15/gru_1/transpose_1:0", shape=(16, 1, 1024), dtype=float32)
x decoder entrada: Tensor("ExpandDims_16:0", shape=(16, 1), dtype=int32)
hidden decoder entrada: Tensor("encoder/gru/while:4", shape=(16, 1024), dtype=float32)
output decoder entrada: Tensor("encoder/gru/transpose_1:0", shape=(16, 600, 1024), dtype=float32)
x decoder: Tensor("decoder_16/embedding_1/embedding_lookup/Identity:0", shape=(16, 1, 200), dtype=float32)
output decoder: Tensor("decoder_16/gru_1/transpose_1:0", shape=(16, 1, 1024), dtype=float32)
x decoder entrada: Tensor("ExpandDims_17:0", shape=(16, 1), dtype=int32)
hidden deco

'./treinamento_checkpoints\\ckpt-1'

In [22]:
#model = Model([encoder_inputs, decoder_inputs], decoder_inputs)

In [23]:
#model.summary()

# Compilação do modelo

In [24]:
#model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Treinamento do modelo

In [25]:
#model.fit([encoder_input_data, decoder_input_data], decoder_input_data, batch_size=batch, epochs=100, validation_split=0.2)

In [26]:
#model.save('sumAbstrat.h5')
def evaluate(sentence):
    attention_plot = np.zeros((num_encoder_tokens, num_decoder_tokens))
    sentence = sentence.lower()
    inputs = data_input.texts_to_sequences([sentence])
    #inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=num_decoder_tokens, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_input.word_index['<start>']], 0)

    for t in range(num_decoder_tokens):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        print(predictions)
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        print(predicted_id)
        result += target_input.index_word[predicted_id] + ' '
        if target_input.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot

In [27]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [28]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [29]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

mensagem = '<start> ' + data.texto.tolist()[15] + ' <end>'
print(mensagem)

translate(mensagem)

In C:\Users\renat\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\renat\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\renat\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In C:\Users\renat\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\renat\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mpl

<start>  Escutas telefônicas da operação que prendeu membros da Secretaria da Saúde mostram um homem que segundo o MP é integrante do esquema criminoso citando o governador do Rio. Quem menciona Wilson Witzel é o operador financeiro de Mário Peixoto, Luiz Roberto Martins. O governador nega que cometeu qualquer ilegalidade. Lava Jato prende ex-deputado Paulo Melo e empresário Mário Peixoto O operador diz a um interlocutor que o empresário negociou a liberação de uma organização social (OS) com o governador. A organização social Instituto Unir Saúde fechou vários contratos com a secretaria de saúde entre os anos de 2018 e 2019 até ser desqualificada como OS pelo estado em outubro de 2019.  Segundo as investigações da Operação Favorito, a OS movimentou mais de R$ 180 milhões em contratos de gestões de Upas na Baixada. Ainda de acordo com a Polícia Federal e o Ministério Público Federal, o empresário Mário Peixoto e seu operador financeiro Luiz Roberto Martins, ambos presos na semana passa

  [nan nan nan ... nan nan nan]]], shape=(1, 20, 1024), dtype=float32)
x decoder: tf.Tensor(
[[[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
   nan nan nan nan nan nan nan nan nan nan nan nan nan]]], shape=(1, 1, 200), dtype=float32)
output decoder: tf.Tensor([[[nan 

KeyError: 0