In [None]:
import numpy as np
import pandas as pd
import string
import spacy
import re
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, LSTM, Dense, Embedding, GRU
from keras.models import Model
import random
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
import os
import time

In [None]:
# Look up the visible GPUs once; the settings below only apply when at least
# one is present, and they only touch the first device in the list.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    primary_gpu = physical_devices[0]
    # Request on-demand memory growth for the first GPU.
    tf.config.experimental.set_memory_growth(primary_gpu, enable=True)
    # Expose the first GPU as a single virtual device with memory_limit=4000
    # (NOTE(review): unit is presumably megabytes -- confirm in the TF docs).
    capped_device = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)
    tf.config.experimental.set_virtual_device_configuration(primary_gpu, [capped_device])

# ETAPA DE CARREGAMENTO DO DATASET

In [None]:
# Read the corpus from a JSON file (path relative to the working directory)
# into a DataFrame; later cells read its `título` and `texto` columns.
DATASET_PATH = 'tcc1.json'
data = pd.read_json(DATASET_PATH, encoding='utf-8')

# ETAPA DE PRÉ-PROCESSAMENTO

In [None]:
def _wrap_with_markers(sentences):
    """Return a list with every sentence enclosed in '<start> ' and ' <end>'."""
    wrapped = []
    for sentence in sentences:
        wrapped.append('<start> ' + sentence + ' <end>')
    return wrapped


# Titles and article bodies both get explicit sequence boundary markers.
titulo_input = _wrap_with_markers(data.título.tolist())
noticia_input = _wrap_with_markers(data.texto.tolist())

# ETAPA DE TOKENIZAÇÃO

In [None]:
def token(texto, tam_max):
    """Fit a Keras tokenizer on ``texto`` and encode it as a padded id matrix.

    Args:
        texto: collection of strings; used both to fit the tokenizer and as
            the texts that get encoded.
        tam_max: value passed as ``maxlen`` to ``pad_sequences``.

    Returns:
        A ``(tensor, tokens)`` pair: the output of ``pad_sequences`` and the
        fitted ``Tokenizer`` instance, in that order.
    """
    text_api = tf.keras.preprocessing.text
    sequence_api = tf.keras.preprocessing.sequence

    # filters='' disables the tokenizer's character filtering; lower=True
    # lower-cases the text; num_words caps the ids emitted at 2**16.
    tokens = text_api.Tokenizer(num_words=2**16, filters='', lower=True)
    tokens.fit_on_texts(texto)

    id_sequences = tokens.texts_to_sequences(texto)
    # Shorter sequences are padded at the end ('post').
    # NOTE(review): `truncating` is not set, so the library default decides
    # which end of an over-long sequence is cut -- confirm this is intended.
    tensor = sequence_api.pad_sequences(id_sequences, maxlen=tam_max, padding='post')
    return tensor, tokens

In [None]:
# Fixed sequence lengths used when padding articles and titles.
MAX_LEN_NOTICIA = 600
MAX_LEN_TITULO = 20

# token() returns (padded id matrix, fitted tokenizer). Despite the names,
# the *_tokens variables hold the matrices, while data_input / target_input
# hold the tokenizer objects.
data_input_tokens, data_input = token(noticia_input, MAX_LEN_NOTICIA)
data_target_tokens, target_input = token(titulo_input, MAX_LEN_TITULO)

In [None]:
# Axis 1 of each padded matrix is the padded sequence length (timesteps).
# These are NOT vocabulary sizes, despite the "num_*_tokens" names.
encoder_matrix_shape = data_input_tokens.shape
decoder_matrix_shape = data_target_tokens.shape
num_encoder_tokens = encoder_matrix_shape[1]
num_decoder_tokens = decoder_matrix_shape[1]

# Divisão dos dados

In [None]:
# Hold out 20% of the (article, title) pairs. A fixed random_state makes the
# partition identical on every Restart & Run All, so results are comparable
# between runs.
input_data_train, input_data_test, input_decoder_train, input_decoder_test = train_test_split(
    data_input_tokens,
    data_target_tokens,
    test_size=0.2,
    random_state=42,
)

# Variáveis de configuração da rede

In [None]:
# Size of the training split; both the buffer and the steps per epoch derive
# from it.
n_train = len(input_data_train)

batch = 16
buffer = n_train
steps_por_epoca = n_train // batch

# Layer sizes.
embedding_dim = 200
units = 1024

# Vocabulary sizes: one slot per word known to each tokenizer, plus one.
vocab_size_input = 1 + len(data_input.word_index)
vocab_size_target = 1 + len(target_input.word_index)

# Criação do dataset

In [None]:
# One-hot buffers of shape (samples, timesteps, vocabulary).
#
# They are sized by the TRAINING split because only training rows are written
# into them. Sizing by the whole corpus left the trailing rows all-zero, and
# those are exactly the rows Keras takes for `validation_split`.
#
# NOTE(review): each buffer costs samples * timesteps * vocabulary * 4 bytes
# (float32); with a large vocabulary this can exceed available memory.
num_train_samples = len(input_data_train)

encoder_input_data = np.zeros(
    (num_train_samples, num_encoder_tokens, vocab_size_input),
    dtype='float32')
decoder_input_data = np.zeros(
    (num_train_samples, num_decoder_tokens, vocab_size_target),
    dtype='float32')

print(encoder_input_data.shape)
print(decoder_input_data.shape)

In [None]:
# One-hot encode the training articles: for every (sample, timestep) pair,
# set the slot addressed by the token id at that position to 1.
# The row and column index grids broadcast against the id matrix, so a single
# assignment covers every position.
sample_idx = np.arange(input_data_train.shape[0])[:, None]
step_idx = np.arange(input_data_train.shape[1])[None, :]
encoder_input_data[sample_idx, step_idx, input_data_train] = 1.

In [None]:
# One-hot encode the training titles: for every (sample, timestep) pair, set
# the slot addressed by the token id at that position to 1.
title_idx = np.arange(input_decoder_train.shape[0])[:, None]
title_step_idx = np.arange(input_decoder_train.shape[1])[None, :]
decoder_input_data[title_idx, title_step_idx, input_decoder_train] = 1.

# Arquitetura da Rede

In [None]:
# Encoder: reads the one-hot article sequence and keeps only its final LSTM
# states, which seed the decoder.
# The feature axis of the input is the input vocabulary size, because
# encoder_input_data has shape (samples, timesteps, vocab_size_input).
# num_encoder_tokens is the sequence length and must not be used here.
encoder_inputs = Input(shape=(None, vocab_size_input))
encoder = LSTM(embedding_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# The per-step outputs are discarded; only the hidden and cell states are used.
encoder_states = [state_h, state_c]

In [None]:
# Decoder: consumes the one-hot title sequence, starting from the encoder's
# final states, and emits a distribution over the target vocabulary at every
# timestep.
# Both the input feature axis and the softmax width are the target vocabulary
# size, because decoder_input_data has shape
# (samples, timesteps, vocab_size_target). num_decoder_tokens is the title
# sequence length and must not be used for either.
decoder_inputs = Input(shape=(None, vocab_size_target))
decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True)
# The decoder's own final states are not needed during training.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_target, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: takes the article and title inputs and produces the
# decoder's per-timestep output.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

# Compilação do modelo

In [None]:
# The training targets are one-hot arrays of shape
# (samples, timesteps, vocabulary), so the matching loss is
# 'categorical_crossentropy'. The 'sparse_' variant expects integer class ids
# and does not fit one-hot targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Treinamento do modelo

In [None]:
# Teacher forcing: at timestep t the decoder sees token t and must predict
# token t + 1, so the target is the decoder input shifted one step to the left.
# Using the decoder input itself as the target only teaches the network to
# copy its input.
# The last timestep has no successor and stays all-zero.
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1, :] = decoder_input_data[:, 1:, :]

model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=batch, epochs=100, verbose=1,
          validation_split=0.2)

In [None]:
# Persist the trained model to a file in the working directory.
MODEL_PATH = 'sumAbstrat.h5'
model.save(MODEL_PATH)