In [None]:
pip install -r requirements.txt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import accelerate
import torch
import pandas as pd
%load_ext autoreload
%autoreload 2

## Carregue a base de dados e faça a divisão entre treino, validação e teste.

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
  return tokenizer(examples['text'], padding='max_length', truncation=True)

def create_datsets(file):
  data = pd.read_csv(file)
  encoder = LabelEncoder()
  data['labels'] = encoder.fit_transform(data['class'])

  texts = data['text'].tolist()
  labels = data['labels'].tolist()

  train_texts, temp_texts, train_labels, temp_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)
  val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

  # Criando datasets do Hugging Face
  dataset_train = Dataset.from_dict({"text": train_texts, "label": train_labels})
  dataset_val = Dataset.from_dict({"text": val_texts, "label": val_labels})
  dataset_test = Dataset.from_dict({"text": test_texts, "label": test_labels})

  # Aplicando a tokenização
  dataset_train = dataset_train.map(tokenize_function, batched=True)
  dataset_val = dataset_val.map(tokenize_function, batched=True)
  dataset_test = dataset_test.map(tokenize_function, batched=True)

  # Removendo a coluna de texto, mantendo apenas tokens
  dataset_train = dataset_train.remove_columns(["text"])
  dataset_val = dataset_val.remove_columns(["text"])
  dataset_test = dataset_test.remove_columns(["text"])

  display(dataset_test.to_pandas())

  return dataset_train, dataset_val, dataset_test, labels, test_labels


## Treine o BERT (antes, faça a tokenização e veja como estão os tokens de um documento!)


In [5]:
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = torch.argmax(torch.tensor(logits), dim=-1)
  acc = accuracy_score(labels, predictions)
  f1_micro = f1_score(labels, predictions, average='micro')
  f1_macro = f1_score(labels, predictions, average='macro')
  return {
      'accuracy': acc,
      'f1_micro': f1_micro,
      'f1_macro': f1_macro
  }

def train(dataset_train, dataset_val, labels):
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(labels)))

  training_args = TrainingArguments(
      output_dir='./results',
      eval_strategy="epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=2,
      per_device_eval_batch_size=16,
      num_train_epochs=3,
      weight_decay=0.01,
      save_strategy="epoch",
      logging_dir='./logs',
      logging_steps=10,
      load_best_model_at_end=True
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=dataset_train,
      eval_dataset=dataset_val,
      compute_metrics=compute_metrics
  )

  trainer.train()

  return trainer

In [6]:
def results(trainer, dataset_test, test_labels):
  predictions = trainer.predict(dataset_test)
  logits = predictions.predictions
  predicted_labels = torch.argmax(torch.tensor(logits), dim=-1).tolist()

  acc = accuracy_score(test_labels, predicted_labels)
  f1_micro = f1_score(test_labels, predicted_labels, average='micro')
  f1_macro = f1_score(test_labels, predicted_labels, average='macro')
  conf_matrix = confusion_matrix(test_labels, predicted_labels)

  print("Accuracy:", acc)
  print("F1 Micro:", f1_micro)
  print("F1 Macro:", f1_macro)
  print("Confusion Matrix:\n", conf_matrix)
  print("Classification Report:\n", classification_report(test_labels, predicted_labels))

## Dmoz-Health.csv

In [7]:
dataset_train, dataset_val, dataset_test, labels, test_labels = create_datsets("Dmoz-Health.csv")

Map: 100%|██████████| 1399/1399 [00:00<00:00, 2220.65 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 2383.20 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 2179.76 examples/s]


Unnamed: 0,label,input_ids,token_type_ids,attention_mask
0,2,"[101, 4698, 5634, 1041, 12519, 2063, 2415, 202...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,3,"[101, 2273, 9681, 8715, 1999, 5995, 2298, 2164...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,1,"[101, 9820, 2740, 15955, 4522, 2740, 8474, 880...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
3,2,"[101, 5056, 26735, 2491, 4031, 2592, 1010, 690...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,2,"[101, 22195, 11204, 9004, 2902, 2440, 2326, 22...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...
295,3,"[101, 1996, 8926, 6320, 3044, 3192, 2551, 2007...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
296,1,"[101, 2248, 4657, 1997, 23958, 12399, 9331, 51...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
297,3,"[101, 13268, 9113, 3471, 1998, 4330, 1005, 105...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
298,2,"[101, 2415, 15651, 9349, 3529, 8870, 1010, 607...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [8]:
trainer = train(dataset_train, dataset_val, labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,0.0022,0.29687,0.946667,0.946667,0.947315
2,0.6346,0.304901,0.95,0.95,0.950558
3,0.0006,0.32756,0.946667,0.946667,0.947028


In [9]:
results(trainer, dataset_test, test_labels)

Accuracy: 0.9433333333333334
F1 Micro: 0.9433333333333334
F1 Macro: 0.9407402156547247
Confusion Matrix:
 [[88  1  0  0]
 [ 0 60  0  5]
 [ 1  1 65  5]
 [ 0  3  1 70]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99        89
           1       0.92      0.92      0.92        65
           2       0.98      0.90      0.94        72
           3       0.88      0.95      0.91        74

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.94       300
weighted avg       0.95      0.94      0.94       300



## Industry-Sector.csv

In [7]:
dataset_train, dataset_val, dataset_test, labels, test_labels = create_datsets("Industry-Sector.csv")

Map: 100%|██████████| 1399/1399 [00:06<00:00, 223.39 examples/s]
Map: 100%|██████████| 300/300 [00:01<00:00, 174.69 examples/s]
Map: 100%|██████████| 300/300 [00:01<00:00, 217.75 examples/s]


Unnamed: 0,label,input_ids,token_type_ids,attention_mask
0,1,"[101, 13926, 3385, 1024, 3688, 9898, 18898, 30...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,3,"[101, 2188, 3931, 5908, 1997, 1054, 1004, 1038...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,0,"[101, 5527, 5527, 2012, 2115, 2609, 5527, 2012...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1,"[101, 3235, 6088, 6337, 2194, 2592, 5971, 2951...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1,"[101, 7044, 3552, 4264, 3552, 7044, 5014, 1010...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...
295,1,"[101, 3361, 2592, 1004, 1050, 5910, 2361, 1025...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
296,0,"[101, 4108, 1011, 3534, 2311, 3688, 4087, 5375...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
297,3,"[101, 2745, 4938, 13713, 2545, 1010, 4297, 101...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
298,1,"[101, 23564, 4892, 5014, 23564, 4892, 1005, 10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [8]:
trainer = train(dataset_train, dataset_val, labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,0.9234,0.8815,0.75,0.75,0.558292
2,0.1067,0.866072,0.793333,0.793333,0.668299
3,0.2668,0.922778,0.823333,0.823333,0.73276


In [9]:
results(trainer, dataset_test, test_labels)

Accuracy: 0.77
F1 Micro: 0.77
F1 Macro: 0.5627752306560253
Confusion Matrix:
 [[128  11   0   8]
 [ 19  74   0   3]
 [  3   4   0   6]
 [  5  10   0  29]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       147
           1       0.75      0.77      0.76        96
           2       0.00      0.00      0.00        13
           3       0.63      0.66      0.64        44

    accuracy                           0.77       300
   macro avg       0.55      0.58      0.56       300
weighted avg       0.74      0.77      0.75       300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
