<a href="https://colab.research.google.com/github/renatojmf/Aspects-Extraction-in-Portuguese/blob/main/Bert_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
! pip install transformers
! pip install unidecode

In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

In [3]:
df1 = pd.read_csv('/content/restaurantes.csv',sep=';')

In [4]:
from unidecode import unidecode
import re

words = []

for word in df1['text']:
  old_string = unidecode(word)
  new_string = re.sub(r"[^a-zA-Z0-9]"," ",old_string)
  words.append(new_string.lower())

In [5]:
df1['text'] = words
df1['labels'] =-1
df1['category'] = 'None'
df1.head()

Unnamed: 0,text,aspects,aspectsClass,labels,category
0,hamburgueria artesanal com tempero diferenciad...,tempero,Food,-1,
1,excelente peixe para quem esta localizado no ...,peixe,Food,-1,
2,serve lanches e cerveja gelada tem mesinhas f...,cerveja,Drinks,-1,
3,muito bom o atendimento,atendimento,Service,-1,
4,cardapio variado,Cardápio,Food,-1,


In [6]:
df1.loc[df1["aspectsClass"] == "Service", "category"] = 'servicos'
df1.loc[df1["aspectsClass"] == "Food", "category"] = 'comida'
df1.loc[df1["aspectsClass"] == "Price", "category"] = 'preco'
df1.loc[df1["aspectsClass"] == "Ambience", "category"] = 'ambiente'
df1.loc[df1["aspectsClass"] == "Location", "category"] = 'local'

In [7]:
df1.head()

Unnamed: 0,text,aspects,aspectsClass,labels,category
0,hamburgueria artesanal com tempero diferenciad...,tempero,Food,-1,comida
1,excelente peixe para quem esta localizado no ...,peixe,Food,-1,comida
2,serve lanches e cerveja gelada tem mesinhas f...,cerveja,Drinks,-1,
3,muito bom o atendimento,atendimento,Service,-1,servicos
4,cardapio variado,Cardápio,Food,-1,comida


In [8]:
df = df1[df1['category'] != 'None']
df.head()

Unnamed: 0,text,aspects,aspectsClass,labels,category
0,hamburgueria artesanal com tempero diferenciad...,tempero,Food,-1,comida
1,excelente peixe para quem esta localizado no ...,peixe,Food,-1,comida
3,muito bom o atendimento,atendimento,Service,-1,servicos
4,cardapio variado,Cardápio,Food,-1,comida
5,ambiente agradavel,Ambiente,Ambience,-1,ambiente


In [9]:
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
labels = {'servicos':0,
          'comida':1,
          'preco':2,
          'ambiente':3,
          'local':4
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [10]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [11]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  

In [12]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [13]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

160 20 21


In [14]:
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 80/80 [14:59<00:00, 11.24s/it]


Epochs: 1 | Train Loss:  0.800 | Train Accuracy:  0.263 | Val Loss:  0.791 | Val Accuracy:  0.300


100%|██████████| 80/80 [15:18<00:00, 11.48s/it]


Epochs: 2 | Train Loss:  0.784 | Train Accuracy:  0.388 | Val Loss:  0.771 | Val Accuracy:  0.400


100%|██████████| 80/80 [15:17<00:00, 11.47s/it]


Epochs: 3 | Train Loss:  0.767 | Train Accuracy:  0.369 | Val Loss:  0.748 | Val Accuracy:  0.450


100%|██████████| 80/80 [15:23<00:00, 11.54s/it]


Epochs: 4 | Train Loss:  0.747 | Train Accuracy:  0.438 | Val Loss:  0.739 | Val Accuracy:  0.500


100%|██████████| 80/80 [15:25<00:00, 11.57s/it]


Epochs: 5 | Train Loss:  0.745 | Train Accuracy:  0.431 | Val Loss:  0.733 | Val Accuracy:  0.450


In [15]:
evaluate(model, df_test)

Test Accuracy:  0.333
