# Sentiment Analysis - BERT

## Import libraries

In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from torch.utils.data import Dataset
import torch




## Load product reviews preprocessing

In [4]:
df_reviews = pd.read_csv('./data/product_reviews_preprocessed.csv')
df_reviews.head()

Unnamed: 0,review_score,review_comment_message,review_creation_date,label,processed_review_comment
0,5,"Só achei ela pequena pra seis xícaras ,mais é ...",2017-08-08 00:00:00,1,"achar pequeno pra seis xícara , bom produto"
1,5,Entrega antes da data marcada. Excelente,2018-06-20 00:00:00,1,entregar antes data marcar . excelente
2,5,estou satisfeito,2018-08-15 00:00:00,1,satisfeito
3,5,Mais uma ve satisfeito,2018-05-09 00:00:00,1,ve satisfeito
4,5,"Muito boa a compra, dentro do prazo.",2017-12-08 00:00:00,1,"bom compra , dentro prazo ."


# BERT pré-treinado na língua portuguesa

* Bertimbau Base

In [8]:
# Preprocess the data
# Initialize the tokenizer for the Portuguese BERT model
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Define a PyTorch Dataset class for handling the tokenization and encoding of the text
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        
        # Tokenize and encode the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'review_text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Define hyperparameters
MAX_LEN = 128  # Maximum length of the tokens list
BATCH_SIZE = 16  # Batch size for training and evaluation

# Split the data into training and test sets
df_train, df_test = train_test_split(df_reviews, test_size=0.1, random_state=42)
train_dataset = SentimentDataset(
    texts=df_train.review_comment_message.to_numpy(),
    labels=df_train.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
test_dataset = SentimentDataset(
    texts=df_test.review_comment_message.to_numpy(),
    labels=df_test.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # Number of training epochs
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir='./logs',  # Directory for storing logs
)

# Define functions for evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

# Evaluate the model
evaluation_results = trainer.evaluate()

print(evaluation_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 145/4791 [1:45:39<56:25:14, 43.72s/it]
                                                      
 31%|███▏      | 500/1597 [1:32:47<3:22:08, 11.06s/it]

{'loss': 0.2195, 'learning_rate': 5e-05, 'epoch': 0.31}


                                                       
 63%|██████▎   | 1000/1597 [3:06:38<1:52:38, 11.32s/it]

{'loss': 0.1296, 'learning_rate': 2.72105742935278e-05, 'epoch': 0.63}


                                                       
 94%|█████████▍| 1500/1597 [4:40:51<18:13, 11.28s/it]

{'loss': 0.1135, 'learning_rate': 4.421148587055606e-06, 'epoch': 0.94}


                                                     
100%|██████████| 1597/1597 [4:58:56<00:00, 11.23s/it]


{'train_runtime': 17936.2109, 'train_samples_per_second': 1.424, 'train_steps_per_second': 0.089, 'train_loss': 0.15054338129147485, 'epoch': 1.0}


100%|██████████| 178/178 [11:25<00:00,  3.85s/it]

{'eval_loss': 0.10809097439050674, 'eval_accuracy': 0.970754052149401, 'eval_f1': 0.9792136238417231, 'eval_precision': 0.9824120603015075, 'eval_recall': 0.9760359460808787, 'eval_runtime': 689.2267, 'eval_samples_per_second': 4.118, 'eval_steps_per_second': 0.258, 'epoch': 1.0}



