In [15]:
# !pip install nltk
# import nltk
# nltk.download('stopwords')
# !pip install transformers
# !pip install torchtext

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting torchtext
  Downloading torchtext-0.10.0-cp38-cp38-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 5.7 MB/s eta 0:00:01
Installing collected packages: torchtext
Successfully installed torchtext-0.10.0


# Transformers

In [10]:
import time
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim

from transformers import AutoModel
from transformers import AutoTokenizer

from torchtext.legacy import datasets
from torchtext.legacy import data

from lib.utils import clean_str
from lib.utils import load_data
from lib.utils import prepare_data
from lib.utils import model 

from lib.utils import train
from lib.utils import evaluate
from lib.utils import epoch_time
from lib.utils import binary_accuracy
from lib.utils import count_parameters
from lib.utils import predict_sentiment
from lib.utils import BERTGRUSentimentS
from lib.utils import BERTGRUSentimentB

from sklearn.model_selection import train_test_split

SEED = 7

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Fonte: https://gist.github.com/nissan/ccb0553edb6abafd20c3dec34ee8099d

class DataFrameDataset(data.Dataset):

    def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for i, row in df.iterrows():
            label = row.sentiment if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)

        if train_df is not None:
            train_data = cls(train_df.copy(), text_field, label_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), text_field, label_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), text_field, label_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [5]:
bert = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')

tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=True)

print(len(tokenizer.vocab))
print(tokenizer.convert_tokens_to_ids)

init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

max_input_length = 300

parameters = {}
parameters['model_filename'] = 'model/model_pt-bi-lstm.h5' # O model será exportado para este arquivo
parameters['pre_trained_wv'] = False
parameters['bilstm'] = True # LSTM Bidirectional True or False

parameters['dataset_file'] = './dataset/data_imdb_en_pt.csv'
parameters['lang'] = 'pt' # pt or en
parameters['load_from'] = 'ftr' # csv or ftr

parameters['epochs'] = 5

parameters['word_embedding_dim'] = 50 # dimensionalidade do word embedding pré-treinado
parameters['batch_size'] = 32 # número de amostras a serem utilizadas em cada atualização do gradiente
parameters['max_features'] = 5000 # Reflete a quantidade máxima de palavras que iremos manter no vocabulário
parameters['embed_dim'] = 128 # dimensão de saída da camada Embedding
parameters['max_sequence_length'] = 300 # limitamos o tamanho máximo de todas as sentenças


def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

df = load_data(parameters)
df.sentiment.replace({"neg": 0, "pos": 1}, inplace = True)

X, X_test, Y, Y_test = train_test_split(df.text,df.sentiment, test_size = 0.20, train_size=0.8, random_state = 42)
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.25, train_size = 0.75, random_state = 42)

TEXT_FIELD = data.Field(batch_first = True,
                        use_vocab = False,
                        sequential=True,
                        tokenize=tokenizer, 
#                   tokenize = tokenize_and_cut,
                        preprocessing = tokenizer.convert_tokens_to_ids,
                        init_token = init_token_idx,
                        eos_token = eos_token_idx,
                        pad_token = pad_token_idx,
                        unk_token = unk_token_idx)

LABEL_FIELD = data.LabelField(dtype = torch.float)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


29794
<bound method PreTrainedTokenizerFast.convert_tokens_to_ids of PreTrainedTokenizerFast(name_or_path='neuralmind/bert-base-portuguese-cased', vocab_size=29794, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})>
[CLS] [SEP] [PAD] [UNK]
101 102 0 100
Carregando dataset.
Dataset carregado.



In [6]:
train_data, val_data, test_data = DataFrameDataset.splits(
    text_field=TEXT_FIELD, label_field=LABEL_FIELD, 
    train_df = pd.DataFrame({"text": X_train, "sentiment": Y_train}),
    val_df = pd.DataFrame({"text": X_val, "sentiment": Y_val}), 
    test_df = pd.DataFrame({"text": X_test, "sentiment": Y_test}))

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(val_data)}")
print(f"Number of testing examples: {len(test_data)}")

print(vars(train_data.examples[6]))

Number of training examples: 29675
Number of validation examples: 9892
Number of testing examples: 9892
{'text': [100, 100, 100], 'label': 0}


In [7]:
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['text'])

print(tokens)

LABEL_FIELD.build_vocab(train_data)

print(LABEL_FIELD.vocab.stoi)

BATCH_SIZE = 256

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

['[UNK]', '[UNK]', '[UNK]']
defaultdict(None, {1: 0, 0: 1})


In [8]:
OUTPUT_DIM = 2

model = BERTGRUSentimentB(bert, OUTPUT_DIM)

In [11]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentimentS(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [12]:
print(f'The model has {count_parameters(model):,} trainable parameters')

for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False
        

print(f'The model has {count_parameters(model):,} trainable parameters')

for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)
        


optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

The model has 111,682,305 trainable parameters
The model has 2,759,169 trainable parameters
rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [13]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
#         torch.save(model.state_dict(), './model/model_pt-bert.pt')
        torch.save(model, './model/model_pt-bert.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 4m 23s
	Train Loss: 0.700 | Train Acc: 49.92%
	 Val. Loss: 0.695 |  Val. Acc: 50.00%
Epoch: 02 | Epoch Time: 4m 27s
	Train Loss: 0.695 | Train Acc: 49.75%
	 Val. Loss: 0.694 |  Val. Acc: 50.00%
Epoch: 03 | Epoch Time: 4m 27s
	Train Loss: 0.694 | Train Acc: 49.50%
	 Val. Loss: 0.693 |  Val. Acc: 50.00%
Epoch: 04 | Epoch Time: 4m 27s
	Train Loss: 0.693 | Train Acc: 50.30%
	 Val. Loss: 0.694 |  Val. Acc: 50.00%
Epoch: 05 | Epoch Time: 4m 27s
	Train Loss: 0.693 | Train Acc: 49.99%
	 Val. Loss: 0.693 |  Val. Acc: 50.00%


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=True)

init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

parameters = {}
parameters['init_token_idx'] = init_token_idx
parameters['eos_token_idx'] = eos_token_idx
parameters['pad_token_idx'] = pad_token_idx
parameters['unk_token_idx'] = unk_token_idx
parameters['device'] = device


In [7]:
# model.load_state_dict(torch.load('./model/model_pt-bert.pt'))

# optimizer = optim.Adam(model.parameters())
# criterion = nn.BCEWithLogitsLoss()
# model = model.to(device)
# criterion = criterion.to(device)

# test_loss, test_acc = evaluate(model, test_iterator, criterion)

# print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

max_input_length = 300

while True:
    sentence = input("input> ")

    if sentence == "exit":
        break

    sentiment = predict_sentiment(model, tokenizer, sentence, parameters)

    if(np.argmax(sentiment) == 0):
        pred_proba = "%.2f%%" % (sentiment)
        print("negativo => ", pred_proba)
    elif (np.argmax(sentiment) == 1):
        pred_proba = "%.2f%%" % (sentiment)
        print("positivo => ", pred_proba)
        

input> ruim
negativo =>  0.50%
input> bom
negativo =>  0.50%
input> ótimo
negativo =>  0.49%
input> maravilhoso
negativo =>  0.50%
input> exit


In [4]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel
from transformers import AutoTokenizer
from lib.utils import BERTGRUSentimentB
from lib.utils import predict_sentiment

model = torch.load('./model/model_pt-bert.pt')
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

max_input_length = 300

while True:
    sentence = input("input> ")

    if sentence == "exit":
        break
        
    sentiment = predict_sentiment(model, tokenizer, sentence, parameters)

    if(np.argmax(sentiment) == 0):
        pred_proba = "%.2f%%" % (sentiment)
        print("negativo => ", pred_proba)
    elif (np.argmax(sentiment) == 1):
        pred_proba = "%.2f%%" % (sentiment)
        print("positivo => ", pred_proba)

input> ruim
negativo =>  0.50%
input> exit
