In [1]:
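# Optional one-time environment setup: uncomment the lines you need and run this cell once.
# NOTE(review): nltk, transformers and torchtext are not imported by the code cell below;
# presumably they (and the NLTK stopwords corpus) are required by lib.utils - confirm.
# !pip install nltk
# import nltk
# nltk.download('stopwords')
# !pip install transformers
# !pip install torchtext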

In [2]:
import os
import numpy as np
import pandas as pd

from keras.layers import Input
from keras.layers import Dense
from keras.models import Model
from keras.layers import Embedding
from keras.layers import LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from lib.utils import clean_str
from lib.utils import load_data
from lib.utils import prepare_data
from lib.utils import model 

# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
seed = 7
np.random.seed(seed)

# Single configuration dict handed to load_data, prepare_data and model.
# To read a local copy of the dataset instead of the shared link, point
# 'dataset_file' at './dataset/data_imdb_en_pt.csv'.
parameters = {
    'model_filename': 'model/model_pt-bi-lstm.h5',  # the model is exported to this file
    'pre_trained_wv': False,
    'bilstm': True,  # bidirectional LSTM: True or False
    'dataset_file': 'https://1drv.ms/u/s!AtQLEBYHemNkgdt334si6Qepxomgow?e=2w7eEP',
    'lang': 'pt',  # pt or en
    'load_from': 'ftr',  # csv or ftr
    'epochs': 5,
    'word_embedding_dim': 50,  # dimensionality of the pre-trained word embedding
    'batch_size': 32,  # number of samples used in each gradient update
    'max_features': 5000,  # maximum number of words kept in the vocabulary
    'embed_dim': 128,  # output dimension of the Embedding layer
    'max_sequence_length': 300,  # cap applied to the length of every sentence
}


# Read the dataset described by `parameters`.
data = load_data(parameters)

# prepare_data hands back the train/test splits together with word_index and
# the tokenizer (the tokenizer is reused further down to encode typed sentences).
(X_train, X_test,
 Y_train, Y_test,
 word_index, tokenizer) = prepare_data(data, parameters)

# Show the (inputs, labels) shapes of each split, training split first.
for split_x, split_y in ((X_train, Y_train), (X_test, Y_test)):
    print(split_x.shape, split_y.shape)


# Build the network from the settings in `parameters`.
# NOTE: this rebinds the name `model`, imported from lib.utils as the builder
# function, to the built network; the builder is only reachable again after
# the import statement is re-executed.
model = model(parameters)

# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

def _plot_history(history, train_key, val_key, ylabel, legend_loc):
    """Plot one training curve together with its validation counterpart.

    Args:
        history: mapping from metric name to the list of per-epoch values
            (the `history` attribute of the object returned by `model.fit`).
        train_key: key of the training curve in `history`.
        val_key: key of the validation curve in `history`.
        ylabel: label of the y axis.
        legend_loc: matplotlib legend location string.
    """
    plt.figure()
    plt.plot(history[train_key], lw=2.0, color='b', label='train')
    plt.plot(history[val_key], lw=2.0, color='r', label='val')
    plt.title('Classificador de sentimentos')
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend(loc=legend_loc)
    plt.show()


# One path is used for the existence check, the save and the load, so the three
# can never disagree (prefixing './' would also break an absolute path).
weights_path = parameters['model_filename']

if not os.path.exists(weights_path):
    # No saved weights yet: train from scratch.
    # Create the destination folder before training so a missing directory
    # cannot make save_weights fail after all epochs have already run.
    weights_dir = os.path.dirname(weights_path)
    if weights_dir:
        os.makedirs(weights_dir, exist_ok=True)

    # The test split doubles as validation data during training.
    hist = model.fit(
        X_train,
        Y_train,
        validation_data=(X_test, Y_test),
        epochs=parameters['epochs'],
        batch_size=parameters['batch_size'],
        shuffle=True,
        verbose=1)

    model.save_weights(weights_path)

    # Learning curves: loss first, then accuracy.
    _plot_history(hist.history, 'loss', 'val_loss', 'Cross-Entropy', 'upper right')
    _plot_history(hist.history, 'accuracy', 'val_accuracy', 'Acurácia', 'upper left')

else:
    # Weights from a previous run exist: reuse them and skip training
    # (`hist` is not defined on this path).
    model.load_weights(weights_path)

# Tell the user that the model is about to be used.
print("Preparando utilização do modelo.\n")

# evaluate() returns the loss followed by the metrics given to compile();
# with a single 'accuracy' metric, position 1 holds the accuracy.
scores = model.evaluate(
    X_test,
    Y_test,
    batch_size=parameters['batch_size'],
    verbose=0,
)
accuracy_percent = scores[1] * 100
print("Acc: %.2f%%" % accuracy_percent)

# Interactive prompt: classify sentences typed by the user until "exit" is entered.

# Text printed for each output index of the classifier: 0 -> negative, 1 -> positive.
sentiment_labels = ("negativo => ", "positivo => ")

while True:
    try:
        sentence = input("input> ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the prompt was interrupted: leave the loop cleanly
        # instead of ending the cell with a traceback.
        break

    # Surrounding whitespace must not stop the exit command from being recognised.
    if sentence.strip() == "exit":
        break

    # NOTE(review): the sentence is encoded exactly as typed; if prepare_data
    # normalizes the training text (e.g. with clean_str), the same step
    # probably belongs here - confirm.
    encoded = tokenizer.texts_to_sequences([sentence])

    if not encoded[0]:
        # Nothing in the sentence produced a token, so the model would be
        # scoring pure padding; ask for another sentence instead.
        print("Nenhuma palavra reconhecida; tente outra frase.")
        continue

    # Pad to the fixed input length configured in `parameters`.
    padded = pad_sequences(encoded, maxlen=parameters['max_sequence_length'], dtype='int32', value=0)

    # verbose=0 keeps Keras' progress line out of the conversation.
    sentiment = model.predict(padded, batch_size=1, verbose=0)[0]

    # Report the winning class with its score as a percentage.
    predicted = int(np.argmax(sentiment))
    pred_proba = "%.2f%%" % (sentiment[predicted]*100)
    print(sentiment_labels[predicted], pred_proba)

Carregando dataset.
Dataset carregado.

Preparando dados de treinamento.
Dados separados!

(39567, 300) (39567, 2)
(9892, 300) (9892, 2)

Construindo modelo...
Feito!
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 300)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 128)          640000    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
softmax (Dense)              (None, 2)                 514       
Total params: 903,682
Trainable params: 903,682
Non-trainable params: 0
_________________________________________________________________
None
Preparando utilização do modelo.

Acc: 87.53%
input> ótimo
1/1 - 0s
negativo =>  