<a href="https://colab.research.google.com/github/renatojmf/Aspects-Extraction-in-Portuguese/blob/main/Bert_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
! pip install transformers
! pip install unidecode

In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

# Avaliações com o dataset de restaurantes

In [None]:
df1 = pd.read_csv('/content/restaurantes.csv',sep=';')

In [None]:
from unidecode import unidecode
import re

words = []

for word in df1['text']:
  old_string = unidecode(word)
  new_string = re.sub(r"[^a-zA-Z0-9]"," ",old_string)
  words.append(new_string.lower())

In [None]:
df1['text'] = words
df1['labels'] =-1
df1['category'] = 'None'
df1.head()

Unnamed: 0,text,aspects,aspectsClass,labels,category
0,hamburgueria artesanal com tempero diferenciad...,tempero,Food,-1,
1,excelente peixe para quem esta localizado no ...,peixe,Food,-1,
2,serve lanches e cerveja gelada tem mesinhas f...,cerveja,Drinks,-1,
3,muito bom o atendimento,atendimento,Service,-1,
4,cardapio variado,Cardápio,Food,-1,


In [None]:
df1.loc[df1["aspectsClass"] == "Service", "category"] = 'servicos'
df1.loc[df1["aspectsClass"] == "Food", "category"] = 'comida'
df1.loc[df1["aspectsClass"] == "Price", "category"] = 'preco'
df1.loc[df1["aspectsClass"] == "Ambience", "category"] = 'ambiente'
df1.loc[df1["aspectsClass"] == "Location", "category"] = 'local'

In [None]:
df1.head()

Unnamed: 0,text,aspects,aspectsClass,labels,category
0,hamburgueria artesanal com tempero diferenciad...,tempero,Food,-1,comida
1,excelente peixe para quem esta localizado no ...,peixe,Food,-1,comida
2,serve lanches e cerveja gelada tem mesinhas f...,cerveja,Drinks,-1,
3,muito bom o atendimento,atendimento,Service,-1,servicos
4,cardapio variado,Cardápio,Food,-1,comida


In [None]:
df = df1[df1['category'] != 'None']
df.head()

Unnamed: 0,text,aspects,aspectsClass,labels,category
0,hamburgueria artesanal com tempero diferenciad...,tempero,Food,-1,comida
1,excelente peixe para quem esta localizado no ...,peixe,Food,-1,comida
3,muito bom o atendimento,atendimento,Service,-1,servicos
4,cardapio variado,Cardápio,Food,-1,comida
5,ambiente agradavel,Ambiente,Ambience,-1,ambiente


In [None]:
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
labels = {'servicos':0,
          'comida':1,
          'preco':2,
          'ambiente':3,
          'local':4
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [None]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  

In [None]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [None]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

160 20 21


In [None]:
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 80/80 [14:59<00:00, 11.24s/it]


Epochs: 1 | Train Loss:  0.800 | Train Accuracy:  0.263 | Val Loss:  0.791 | Val Accuracy:  0.300


100%|██████████| 80/80 [15:18<00:00, 11.48s/it]


Epochs: 2 | Train Loss:  0.784 | Train Accuracy:  0.388 | Val Loss:  0.771 | Val Accuracy:  0.400


100%|██████████| 80/80 [15:17<00:00, 11.47s/it]


Epochs: 3 | Train Loss:  0.767 | Train Accuracy:  0.369 | Val Loss:  0.748 | Val Accuracy:  0.450


100%|██████████| 80/80 [15:23<00:00, 11.54s/it]


Epochs: 4 | Train Loss:  0.747 | Train Accuracy:  0.438 | Val Loss:  0.739 | Val Accuracy:  0.500


100%|██████████| 80/80 [15:25<00:00, 11.57s/it]


Epochs: 5 | Train Loss:  0.745 | Train Accuracy:  0.431 | Val Loss:  0.733 | Val Accuracy:  0.450


In [None]:
evaluate(model, df_test)

Test Accuracy:  0.333


# Avaliações com o dataset de TVs

In [28]:
import json
tvs = json.load(open("/content/tv.json"))

In [29]:
texts = []
aspectsClass = []
for x in tvs['reviews']:
  texts.append(x['review'])
  y = x['implicit aspects']
  if(len(y) > 0):
    aspectsClass.append(y[0])
  else:
    aspectsClass.append(None)

d = {'text': texts, 'aspectsClass':aspectsClass}

In [30]:
df_tvs = pd.DataFrame(data=d)
df_tvs.head()

Unnamed: 0,text,aspectsClass
0,Excelente smart tv. E foi entregue vem antes d...,entrega
1,"Imagem ótima recursos de web bons , recomendo ...",produto
2,"produto ótimo, entrega antes do prazo previsto...",
3,"Muito bom o produto, esta tv está entre as das...",funcionalidades
4,a entrega foi super rápido... muito antes do p...,


In [31]:
values = [None, 'None']
df_tst = df_tvs[df_tvs.aspectsClass.isin(values) == False]
df_tst.head()

Unnamed: 0,text,aspectsClass
0,Excelente smart tv. E foi entregue vem antes d...,entrega
1,"Imagem ótima recursos de web bons , recomendo ...",produto
3,"Muito bom o produto, esta tv está entre as das...",funcionalidades
11,Todos os aparelhos samsung são de uma qualidad...,produto
12,"Atendeu todas as expectativas, recomendo Samsu...",produto


In [32]:
from unidecode import unidecode
import re

words = []

for word in df['text']:
  old_string = unidecode(word)
  new_string = re.sub(r"[^a-zA-Z0-9]"," ",old_string)
  words.append(new_string.lower())

In [33]:
df_tst['text'] = words
df_tst.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,text,aspectsClass
0,excelente smart tv e foi entregue vem antes d...,entrega
1,imagem otima recursos de web bons recomendo ...,produto
3,muito bom o produto esta tv esta entre as das...,funcionalidades
11,todos os aparelhos samsung sao de uma qualidad...,produto
12,atendeu todas as expectativas recomendo samsu...,produto


In [34]:
df_tst['labels'] =-1
df_tst.loc[df_tst["aspectsClass"] == "produto", "labels"] = 0
df_tst.loc[df_tst["aspectsClass"] == "entrega", "labels"] = 1
df_tst.loc[df_tst["aspectsClass"] == "imagem", "labels"] = 2
df_tst.loc[df_tst["aspectsClass"] == "desempenho", "labels"] = 3
df_tst.loc[df_tst["aspectsClass"] == "som", "labels"] = 4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [35]:
df = df_tst[df_tst['labels'] != -1]
df.head(10)

Unnamed: 0,text,aspectsClass,labels
0,excelente smart tv e foi entregue vem antes d...,entrega,1
1,imagem otima recursos de web bons recomendo ...,produto,0
11,todos os aparelhos samsung sao de uma qualidad...,produto,0
12,atendeu todas as expectativas recomendo samsu...,produto,0
13,rapidez boa qualidade e preocupacao em manter...,desempenho,3
14,otima imagem facil acesso a internet com wifi...,produto,0
18,produto nao deixa nada a desejar exelente go...,produto,0
28,produto excelente recomendo produto alem de ...,entrega,1
29,recomendo tem muitas funcoes praticas o wi f...,produto,0
30,maravilhosa chegou dentro do prazo estipulado ...,produto,0


In [42]:
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
labels = {'produto':0,
          'entrega':1,
          'imagem':2,
          'desempenho':3,
          'som':4
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['aspectsClass']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [37]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [38]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  

In [39]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [40]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

388 48 49


In [43]:
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 194/194 [39:26<00:00, 12.20s/it]


Epochs: 1 | Train Loss:  0.761 | Train Accuracy:  0.446 | Val Loss:  0.669 | Val Accuracy:  0.667


100%|██████████| 194/194 [40:31<00:00, 12.53s/it]


Epochs: 2 | Train Loss:  0.569 | Train Accuracy:  0.758 | Val Loss:  0.558 | Val Accuracy:  0.667


100%|██████████| 194/194 [40:08<00:00, 12.41s/it]


Epochs: 3 | Train Loss:  0.462 | Train Accuracy:  0.758 | Val Loss:  0.522 | Val Accuracy:  0.667


100%|██████████| 194/194 [38:41<00:00, 11.96s/it]


Epochs: 4 | Train Loss:  0.405 | Train Accuracy:  0.758 | Val Loss:  0.461 | Val Accuracy:  0.667


100%|██████████| 194/194 [38:45<00:00, 11.98s/it]


Epochs: 5 | Train Loss:  0.365 | Train Accuracy:  0.760 | Val Loss:  0.436 | Val Accuracy:  0.667


In [44]:
evaluate(model, df_test)

Test Accuracy:  0.694
