In [1]:
'''from google.colab import drive
drive.mount('/content/drive')'''

"from google.colab import drive\ndrive.mount('/content/drive')"

In [2]:
import numpy as np
import pandas as pd
import string
import spacy
import re
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Model
import random
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
import os
import time

Using TensorFlow backend.


In [3]:
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

# ETAPA DE CARREGAMENTO DO DATASET

In [4]:
'''data = pd.read_json('/content/drive/My Drive/Colab Notebooks/tcc1.json', encoding='utf-8')'''
data = pd.read_json('tcc1.json', encoding='utf-8')

# ETAPA DE PRÉ-PROCESSAMENTO

In [5]:
titulo_input = ['<start> ' + m + ' <end>' for m in data.título.tolist()]
noticia_input = ['<start> ' + m + ' <end>' for m in data.texto.tolist()]

# ETAPA DE TOKENIZAÇÃO

In [6]:
def token(texto, tam_max):
    tokens = tf.keras.preprocessing.text.Tokenizer(lower=True, filters='', num_words=2**16)
    tokens.fit_on_texts(texto)
    tensor = tokens.texts_to_sequences(texto)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', maxlen=tam_max)
    return tensor, tokens

In [7]:
data_input_tokens, data_input = token(noticia_input, tam_max=600)
data_target_tokens, target_input = token(titulo_input, tam_max=20)

In [8]:
num_encoder_tokens = data_input_tokens.shape[1]
num_decoder_tokens = data_target_tokens.shape[1]

# Divisão dos dados

In [9]:
input_data_train, input_data_test, input_decoder_train, input_decoder_test = train_test_split(data_input_tokens, data_target_tokens, test_size=0.2)

print(len(input_data_train), len(input_data_test), len(input_decoder_train), len(input_decoder_test))

229 58 229 58


# Variáveis de configuração da rede

In [10]:
buffer = len(input_data_train)
batch = 16
steps_por_epoca = len(input_data_train)//batch
embedding_dim = 200
units = 1024
vocab_size_input = len(data_input.word_index)+1
vocab_size_target = len(target_input.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_data_train, input_decoder_train)).shuffle(buffer)
dataset = dataset.batch(batch, drop_remainder=True)

In [11]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([16, 600]), TensorShape([16, 20]))

# Criação do dataset

In [12]:
'''encoder_input_data = np.zeros(
    (len(noticia_input), num_encoder_tokens, vocab_size_input), dtype='float32')
decoder_input_data = np.zeros(
    (len(noticia_input), num_decoder_tokens, vocab_size_target), dtype='float32')'''

"encoder_input_data = np.zeros(\n    (len(noticia_input), num_encoder_tokens, vocab_size_input), dtype='float32')\ndecoder_input_data = np.zeros(\n    (len(noticia_input), num_decoder_tokens, vocab_size_target), dtype='float32')"

print(encoder_input_data.shape)
print(decoder_input_data.shape)

In [13]:
'''for i, input_text in enumerate(input_data_train):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, char] = 1.'''

'for i, input_text in enumerate(input_data_train):\n    for t, char in enumerate(input_text):\n        encoder_input_data[i, t, char] = 1.'

In [14]:
'''for i, decoder_text in enumerate(input_decoder_train):
    for t, char in enumerate(decoder_text):
        decoder_input_data[i, t, char] = 1.'''

'for i, decoder_text in enumerate(input_decoder_train):\n    for t, char in enumerate(decoder_text):\n        decoder_input_data[i, t, char] = 1.'

# Arquitetura da Rede

In [15]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, emb_dim, units, batch):
        super(Encoder, self).__init__()
        self.batch = batch
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, emb_dim)
        self.lstm = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        x = self.embedding(x)
        #print("x encoder: ", x)
        output, state = self.lstm(x, initial_state = hidden)
        print("output encoder: ", output)
        print("output state: ", state)
        return output, state
    
    def intializer_hidden_state(self):
        return tf.zeros((self.batch, self.units))

In [16]:
encoder = Encoder(vocab_size_input, embedding_dim, units, batch)

sample_hidden = encoder.intializer_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

output encoder:  tf.Tensor(
[[[-3.0842243e-04 -6.5526515e-03  3.9992554e-04 ... -3.7998706e-03
    7.3350510e-03 -5.1927976e-03]
  [-2.9863871e-03 -3.4470323e-03  4.0791119e-03 ... -1.6927108e-03
    8.5208064e-04 -5.6904852e-03]
  [-1.0755990e-02  6.7602685e-03  2.5231366e-03 ...  8.5079940e-03
   -3.3086690e-05  6.5111695e-03]
  ...
  [ 2.6263280e-03  1.5370127e-02 -3.4013853e-04 ... -6.9554704e-03
    2.3309011e-03  1.0773129e-02]
  [ 4.5271707e-03  1.4697173e-02 -4.8892382e-03 ... -1.3122638e-02
   -5.5989535e-03  1.3821453e-02]
  [ 8.9046033e-03  2.0129342e-02 -7.3118485e-03 ... -5.8914698e-03
    3.4166005e-04  8.8573210e-03]]

 [[-5.4864842e-03  3.2015759e-03  6.2299026e-03 ...  8.0112839e-04
    5.3224829e-03  7.2475045e-04]
  [-1.4812694e-02 -4.0351022e-03 -1.2637429e-03 ...  4.3218979e-03
   -1.0250750e-03 -2.0387364e-03]
  [-9.0720020e-03 -8.4501067e-03 -3.5780377e-03 ... -3.5325003e-05
    1.6717615e-03 -1.1555740e-03]
  ...
  [ 4.9049803e-03  4.4716438e-03  5.1473258e-03 .

In [17]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [18]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, emb_dim, units, batch):
        super(Decoder, self).__init__()
        self.batch = batch
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, emb_dim)
        self.lstm = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.units)

    def call(self, x, hidden, output):
        context_vector, attention_weights = self.attention(hidden, output)
        x = self.embedding(x)
        #print(x)
        #print([tf.expand_dims(context_vector, 1), x])
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.lstm(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        return output, state, attention_weights


In [19]:
decoder = Decoder(vocab_size_target, embedding_dim, units, batch)

sample_decoder_output, _, _ = decoder(tf.random.uniform((batch, 1)), sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (16, 1024)


In [20]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

optimizer = tf.keras.optimizers.Adam()

def erro(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [21]:
checkpoint_dir = './treinamento_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=tf.keras.optimizers.Adam(), encoder=encoder, decoder=decoder)

In [22]:
@tf.function
def treino(input_data, target, hidden):
    loss = 0
    batch = 16
    #print(input_data)
    with tf.GradientTape() as tape:
        encoder_output, encoder_hidden = encoder(input_data, hidden)
        decoder_input = tf.expand_dims([target_input.word_index['<start>']] * batch, 1)

        for t in range(1, target.shape[1]):
            predictions, decoder_hidden, _ = decoder(decoder_input, encoder_hidden, encoder_output)
            loss += erro(target[:, t], predictions)
            # using teacher forcing
            decoder_input = tf.expand_dims(target[:, t], 1)

    batch_loss = (loss / int(target.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
epocas = 10
erro_acumulado = []
for e in range(epocas):
    inicio = time.time()
    hidden = encoder.intializer_hidden_state()
    erro_total = 0
    for (batch, (input_data, target)) in enumerate(dataset.take(steps_por_epoca)):
        batch_loss = treino(input_data, target, hidden)
        erro_total += batch_loss
        
    print('Epoca {} Erro {:.04f}'.format(epocas+1, erro_total/steps_por_epoca))
    erro_acumulado.append(erro_total/steps_por_epoca)
    print('Para uma época levou {}s'.format(time.time() - inicio))
checkpoint.save(file_prefix = checkpoint_prefix)

output encoder:  Tensor("encoder/gru/StatefulPartitionedCall:1", shape=(16, 600, 1024), dtype=float32)
output state:  Tensor("encoder/gru/StatefulPartitionedCall:2", shape=(16, 1024), dtype=float32)
output encoder:  Tensor("encoder/gru/StatefulPartitionedCall:1", shape=(16, 600, 1024), dtype=float32)
output state:  Tensor("encoder/gru/StatefulPartitionedCall:2", shape=(16, 1024), dtype=float32)
Epoca 11 Erro nan
Para uma época levou 38.115665435791016s
Epoca 11 Erro nan
Para uma época levou 8.964566707611084s
Epoca 11 Erro nan
Para uma época levou 9.022577285766602s
Epoca 11 Erro nan
Para uma época levou 8.9580659866333s


In [None]:
model = Model([encoder_inputs, decoder_inputs], decoder_inputs)

In [None]:
model.summary()

# Compilação do modelo

In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Treinamento do modelo

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_input_data, batch_size=batch, epochs=100, validation_split=0.2)

In [None]:
model.save('sumAbstrat.h5')