In [None]:
#pip install transformers datasets scikit-learn torch evaluate accelerate
#!pip install -U "transformers>=4.30.0" "datasets" "evaluate" "accelerate" "scikit-learn" "torch" "scikit-learn"
!pip install "transformers>=4.30.0"
!pip install datasets
!pip install torch
!pip install scikit-learn
!pip install evaluate
!pip install accelerate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
# Step 2: Imports
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix
import evaluate
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import softmax
from tqdm import tqdm
import torch.nn as nn


In [None]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42
# Seed Python's `random`, NumPy's legacy global RNG and PyTorch, in that order.
random.seed(SEED)
np.random.seed(SEED)
# Last expression of the cell: its return value (a torch Generator) is what the cell displays.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7a3351b003b0>

In [None]:
# Load the Financial PhraseBank dataset ("sentences_50agree" configuration).
# Its integer labels are:
#   0 -> Negative
#   1 -> Neutral
#   2 -> Positive
# This label order is the reference used throughout the notebook.

dataset = load_dataset("financial_phrasebank", "sentences_50agree")

# Peek at the first three rows, then at the column schema.
train_split = dataset["train"]
for example_idx in range(3):
    print(train_split[example_idx])
print(train_split.features)

{'sentence': 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .', 'label': 1}
{'sentence': 'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .', 'label': 1}
{'sentence': 'The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .', 'label': 0}
{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)}


In [None]:
# Load Models
# Each entry maps a short name to (checkpoint, extra kwargs for the model).
# bert-base-uncased is instantiated with a 3-label classification head;
# finbert-tone is loaded exactly as published.
_CHECKPOINTS = {
    "bert-base": ("bert-base-uncased", {"num_labels": 3}),
    "finbert": ("yiyanghkust/finbert-tone", {}),
}

# For every checkpoint load the model first, then its matching tokenizer.
models = {
    short_name: {
        "model": AutoModelForSequenceClassification.from_pretrained(checkpoint, **model_kwargs),
        "tokenizer": AutoTokenizer.from_pretrained(checkpoint),
    }
    for short_name, (checkpoint, model_kwargs) in _CHECKPOINTS.items()
}

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
# Inspect each model's id -> label mapping.
# bert-base only has a freshly initialised head with generic LABEL_n names;
# FinBERT uses its own label order, so its predictions must be remapped to the
# dataset's order before they can be compared with the dataset labels.
for short_name in ("bert-base", "finbert"):
    print(models[short_name]["model"].config.id2label)

{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}
{0: 'Neutral', 1: 'Positive', 2: 'Negative'}


In [None]:
# Split the dataset
# Hold out 20% for validation; shuffle, and stratify on the label so that both
# splits keep the class proportions of the full dataset.

# Materialise each column once as a plain list (the label column is needed both
# as the target and as the stratification key).
phrasebank_sentences = list(dataset['train']['sentence'])
phrasebank_labels = list(dataset['train']['label'])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    phrasebank_sentences,
    phrasebank_labels,
    test_size=0.2,
    shuffle=True,
    stratify=phrasebank_labels,
    random_state=SEED)  # reuse the notebook-wide seed instead of a second literal

# Cheap invariants: texts and labels stay aligned and no example is lost.
assert len(train_texts) == len(train_labels)
assert len(val_texts) == len(val_labels)
assert len(train_texts) + len(val_texts) == len(phrasebank_sentences)

print(f'Number of training examples: {len(train_texts)}')
print(f'Number of validation examples: {len(val_texts)}')

Number of training examples: 3876
Number of validation examples: 970


In [None]:
# Tokenize Data
# Use each model's tokenizer
# Deprecated: tokenization now happens inside the SentimentDataset wrapper

def tokenize_data(tokenizer, dataset):
    """Tokenize `dataset` with `tokenizer` using padding, truncation and max_length=128.

    Args:
        tokenizer: callable invoked as ``tokenizer(dataset, padding=..., truncation=..., max_length=...)``.
        dataset: the text input handed to the tokenizer unchanged.

    Returns:
        Whatever the tokenizer call returns.
    """
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 128}
    return tokenizer(dataset, **tokenizer_options)

#train_encodings_bert = tokenize_data(models["bert-base"]["tokenizer"], train_texts)
#val_encodings_bert = tokenize_data(models["bert-base"]["tokenizer"], val_texts)
#train_encodings_finbert = tokenize_data(models["finbert"]["tokenizer"], train_texts)
#val_encodings_finbert = tokenize_data(models["finbert"]["tokenizer"], val_texts)


# Build Remap for Finbert
def remap_finbert_to_phrasebank(labels):
    """Translate FinBERT class ids into Financial PhraseBank class ids.

    FinBERT id -> PhraseBank id:
        0 (Neutral)  -> 1
        1 (Positive) -> 2
        2 (Negative) -> 0

    Args:
        labels: iterable of FinBERT class ids (0, 1 or 2).

    Returns:
        A new list with the corresponding PhraseBank ids, in the same order.

    Raises:
        KeyError: if an id other than 0, 1 or 2 is encountered.
    """
    finbert_to_phrasebank = {0: 1, 1: 2, 2: 0}
    return list(map(finbert_to_phrasebank.__getitem__, labels))

In [None]:
# Sanity check of the remap. FinBERT ids below stand for: positive, negative, neutral.
test = [1, 2, 0]
remapped = remap_finbert_to_phrasebank(test)

# In PhraseBank ids the same three sentiments are 2 (positive), 0 (negative), 1 (neutral).
# Enforce the expectation instead of relying on a visual check of the output.
assert remapped == [2, 0, 1], f"unexpected remap result: {remapped}"
remapped

[2, 0, 1]

In [None]:
# Create wrapper for DataLoader
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes all texts up front and serves tensors.

    Every item is a dict holding one tensor per tokenizer output field plus a
    ``"labels"`` tensor with the example's class id.

    Args:
        texts: iterable of strings (one per example). A single bare string is rejected.
        labels: sequence of class ids, aligned with `texts`.
        tokenizer: callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=max_len)``;
            its result must expose ``.items()`` with per-example indexable values.
        max_len: maximum sequence length passed to the tokenizer.

    Raises:
        TypeError: if `texts` is a single string instead of a collection of strings.
        ValueError: if `texts` and `labels` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        if isinstance(texts, str):
            raise TypeError("texts must be a collection of strings, not a single string")
        # Materialise so that tuples, generators and column objects are all accepted.
        texts = list(texts)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        # padding=True pads to the longest sequence of the whole collection, so all
        # items share one length and can be batched by the default collate function.
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = labels

    def __getitem__(self, idx):
        # One tensor per tokenizer field for example `idx`, plus its label.
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)





In [None]:
# From Hugging Face Tutorial
#import torch
#from transformers import AutoTokenizer, AutoModelForSequenceClassification

#checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
#tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
#sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

#tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
#output = model(**tokens)

In [None]:
def evaluate_model(model, dataloader, model_name=None, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Run `model` over `dataloader` and report 3-class sentiment metrics.

    Predictions are the argmax over the model's logits. When `model_name` is
    exactly "finbert", predicted ids are translated from FinBERT's label order
    to the PhraseBank order (0=Negative, 1=Neutral, 2=Positive) before scoring.

    Side effects: moves the model to `device`, leaves it in eval mode, and
    prints the classification report and the confusion matrix.

    Args:
        model: sequence-classification model whose output exposes `.logits`.
        dataloader: iterable of batches with 'input_ids', 'attention_mask' and 'labels'.
        model_name: optional name shown in the progress bar; "finbert" enables the remap.
        device: device the forward passes run on.

    Returns:
        dict with "accuracy", macro-averaged "precision", "recall" and "f1",
        and "confusion_matrix" (computed over labels 0, 1, 2).

    Raises:
        ValueError: if the dataloader yields no examples.
    """
    class_ids = [0, 1, 2]
    label_names = ["Negative", "Neutral", "Positive"]

    model.eval()
    model.to(device)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Evaluating {model_name or 'model'}"):

            # Only the model inputs need to live on the compute device.
            input_ids      = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Run through model for predictions
            outputs     = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1).cpu().tolist()

            # Remap predictions from FinBERT head to PhraseBank if needed
            if model_name == "finbert":
                predictions = remap_finbert_to_phrasebank(predictions)

            # Plain Python lists are what scikit-learn consumes below.
            all_preds.extend(predictions)
            all_labels.extend(batch['labels'].cpu().tolist())

    if not all_labels:
        raise ValueError("evaluate_model received a dataloader that yields no examples")

    # Compute metrics
    accuracy = sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_labels)

    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, labels=class_ids, average="macro", zero_division=0
    )

    conf_matrix = confusion_matrix(all_labels, all_preds, labels=class_ids)

    # Passing `labels` keeps the report aligned with `target_names` even when a
    # class is missing from both the references and the predictions; without it
    # the report raises in that situation.
    class_report = classification_report(
        all_labels, all_preds, labels=class_ids, target_names=label_names, zero_division=0
    )

    # Print Results
    print("Classification Report:")
    print(class_report)

    print("Confusion Matrix:")
    print(conf_matrix)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "confusion_matrix": conf_matrix
    }

In [None]:
# Baseline: score the published FinBERT checkpoint as-is (no fine-tuning here).
# NOTE(review): this runs on the *training* split, while the fine-tuning runs
# further down are scored on the validation split — confirm that is intended
# before comparing the numbers.
finbert_train_dataset = SentimentDataset(train_texts, train_labels, models["finbert"]["tokenizer"])
finbert_train_loader = DataLoader(finbert_train_dataset, batch_size=16)
resFin = evaluate_model(models["finbert"]["model"], finbert_train_loader, model_name="finbert")

Evaluating finbert: 100%|██████████| 243/243 [00:29<00:00,  8.22it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.79      0.67      0.73       483
     Neutral       0.79      0.92      0.85      2303
    Positive       0.80      0.59      0.68      1090

    accuracy                           0.79      3876
   macro avg       0.80      0.73      0.75      3876
weighted avg       0.79      0.79      0.79      3876

Confusion Matrix:
[[ 326  144   13]
 [  47 2113  143]
 [  38  414  638]]


In [None]:
# Baseline: score bert-base-uncased with its freshly initialised (untrained) head.
# No model_name is passed, so no label remapping is applied.
# NOTE(review): like the FinBERT baseline, this uses the training split.
bert_train_dataset = SentimentDataset(train_texts, train_labels, models["bert-base"]["tokenizer"])
bert_train_loader = DataLoader(bert_train_dataset, batch_size=16)
resBert = evaluate_model(models["bert-base"]["model"], bert_train_loader)

Evaluating model: 100%|██████████| 243/243 [00:28<00:00,  8.47it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.07      0.15      0.10       483
     Neutral       0.57      0.71      0.63      2303
    Positive       0.00      0.00      0.00      1090

    accuracy                           0.44      3876
   macro avg       0.21      0.29      0.24      3876
weighted avg       0.35      0.44      0.39      3876

Confusion Matrix:
[[  72  411    0]
 [ 670 1633    0]
 [ 286  804    0]]


In [None]:
def train_bert(model, train_dataloader, val_dataloader, epochs=3, lr=2e-5, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Fine-tune `model` in place with AdamW and cross-entropy loss.

    After every epoch the mean training loss is printed and the model is scored
    on `val_dataloader` via `evaluate_model`, whose accuracy is printed too.

    Args:
        model: sequence-classification model whose output exposes `.logits`.
        train_dataloader: batches with "input_ids", "attention_mask" and "labels".
        val_dataloader: batches used for the per-epoch validation pass.
        epochs: number of passes over `train_dataloader`.
        lr: AdamW learning rate.
        device: device to train on; defaults to CUDA when available, else CPU.

    Returns:
        None. The model is updated in place.
    """
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")

        # evaluate_model() switches the model to eval mode at the end of every
        # epoch, so training mode (dropout etc.) must be re-enabled each time;
        # calling train() only once would leave later epochs in eval mode.
        model.train()

        # Training loop
        total_loss = 0
        for batch in tqdm(train_dataloader, desc="Training"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Train Loss: {avg_loss:.4f}")

        # Validation accuracy after each epoch (also prints the full report).
        val_acc = evaluate_model(model, val_dataloader, model_name="bert-base", device=device)["accuracy"]
        print(f"Validation Accuracy: {val_acc:.4f}")

In [None]:
def train_on_increasing_sizes_simple(
    model_template_path,
    train_texts,
    train_labels,
    val_dataloader,
    tokenizer,
    fractions=(0.1, 0.25, 0.5, 0.75, 1.0),
    batch_size=16,
    epochs=3,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """Fine-tune a fresh model on growing prefixes of the training data.

    For each fraction, the first ``int(fraction * N)`` (text, label) pairs are
    used to train a newly loaded 3-label model, which is then scored on
    `val_dataloader`.

    Args:
        model_template_path: checkpoint passed to `from_pretrained` for every run.
        train_texts: training sentences.
        train_labels: class ids aligned with `train_texts`.
        val_dataloader: validation batches shared by all runs.
        tokenizer: tokenizer used to build each training subset.
        fractions: shares of the training data to train on, one run per entry.
        batch_size: training batch size.
        epochs: epochs per run.
        device: device to train/evaluate on; defaults to CUDA when available, else CPU.

    Returns:
        dict mapping a percentage label such as "25%" to the metrics dict
        returned by `evaluate_model` for that run.

    Raises:
        ValueError: if a fraction selects zero training examples.
    """
    results = {}

    # Pair texts with labels so that slicing keeps them aligned.
    combined = list(zip(train_texts, train_labels))
    total = len(combined)

    for frac in fractions:
        num_examples = int(frac * total)
        # round() avoids float truncation artefacts (e.g. 0.29 * 100 -> 28.999...).
        pct_label = round(frac * 100)
        print(f"\nTraining on {num_examples} examples ({pct_label}%)")

        if num_examples == 0:
            raise ValueError(
                f"fraction {frac} selects no training examples out of {total}"
            )

        # Take a simple prefix slice of the data
        sub_texts, sub_labels = zip(*combined[:num_examples])

        # Tokenize and wrap into Dataset and DataLoader
        train_dataset = SentimentDataset(list(sub_texts), list(sub_labels), tokenizer)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Load a fresh model so that runs do not influence each other
        model = AutoModelForSequenceClassification.from_pretrained(model_template_path, num_labels=3)

        # Train, then score once more to collect the full metrics dict
        # (train_bert only prints its per-epoch validation results).
        train_bert(model, train_loader, val_dataloader, epochs=epochs, device=device)
        metrics = evaluate_model(model, val_dataloader, model_name="bert-base", device=device)

        results[f"{pct_label}%"] = metrics

    return results

In [None]:
# Prepare val_loader (only once); it is shared by every training run below.
val_dataset = SentimentDataset(val_texts, val_labels, models["bert-base"]["tokenizer"])
val_loader = DataLoader(val_dataset, batch_size=16)

# Choose the device explicitly so the sweep also runs on a CPU-only runtime
# instead of failing when the model is moved to an unavailable "cuda" device.
train_device = "cuda" if torch.cuda.is_available() else "cpu"

# Run training loop: one fresh bert-base-uncased model per training-set fraction.
results = train_on_increasing_sizes_simple(
    model_template_path="bert-base-uncased",
    train_texts=train_texts,
    train_labels=train_labels,
    val_dataloader=val_loader,
    tokenizer=models["bert-base"]["tokenizer"],
    fractions=[0.1, 0.25, 0.5, 0.75, 1.0],
    batch_size=16,
    epochs=3,
    device=train_device
)


🔁 Training on 387 examples (10%)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 25/25 [00:09<00:00,  2.73it/s]


Train Loss: 0.9470


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 12.98it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       121
     Neutral       0.63      1.00      0.77       576
    Positive       0.49      0.10      0.16       273

    accuracy                           0.62       970
   macro avg       0.37      0.37      0.31       970
weighted avg       0.51      0.62      0.51       970

Confusion Matrix:
[[  0  93  28]
 [  0 576   0]
 [  0 246  27]]
Validation Accuracy: 0.6216

Epoch 2/3


Training: 100%|██████████| 25/25 [00:08<00:00,  2.87it/s]


Train Loss: 0.7819


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 13.54it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       121
     Neutral       0.69      0.98      0.81       576
    Positive       0.54      0.29      0.38       273

    accuracy                           0.67       970
   macro avg       0.41      0.43      0.40       970
weighted avg       0.56      0.67      0.59       970

Confusion Matrix:
[[  0  63  58]
 [  0 567   9]
 [  0 193  80]]
Validation Accuracy: 0.6670

Epoch 3/3


Training: 100%|██████████| 25/25 [00:08<00:00,  2.99it/s]


Train Loss: 0.4927


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.17it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.83      0.54      0.65       121
     Neutral       0.77      0.94      0.84       576
    Positive       0.72      0.50      0.59       273

    accuracy                           0.76       970
   macro avg       0.78      0.66      0.70       970
weighted avg       0.76      0.76      0.75       970

Confusion Matrix:
[[ 65  31  25]
 [  9 540  27]
 [  4 132 137]]
Validation Accuracy: 0.7649


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.27it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.83      0.54      0.65       121
     Neutral       0.77      0.94      0.84       576
    Positive       0.72      0.50      0.59       273

    accuracy                           0.76       970
   macro avg       0.78      0.66      0.70       970
weighted avg       0.76      0.76      0.75       970

Confusion Matrix:
[[ 65  31  25]
 [  9 540  27]
 [  4 132 137]]

🔁 Training on 969 examples (25%)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 61/61 [00:20<00:00,  2.96it/s]


Train Loss: 0.8810


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.38it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.82      0.07      0.14       121
     Neutral       0.76      0.91      0.83       576
    Positive       0.50      0.49      0.49       273

    accuracy                           0.69       970
   macro avg       0.69      0.49      0.49       970
weighted avg       0.69      0.69      0.65       970

Confusion Matrix:
[[  9  28  84]
 [  2 523  51]
 [  0 139 134]]
Validation Accuracy: 0.6866

Epoch 2/3


Training: 100%|██████████| 61/61 [00:20<00:00,  2.96it/s]


Train Loss: 0.4928


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.01it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.77      0.28      0.41       121
     Neutral       0.83      0.90      0.86       576
    Positive       0.63      0.70      0.67       273

    accuracy                           0.77       970
   macro avg       0.75      0.63      0.65       970
weighted avg       0.77      0.77      0.75       970

Confusion Matrix:
[[ 34  24  63]
 [ 10 518  48]
 [  0  81 192]]
Validation Accuracy: 0.7670

Epoch 3/3


Training: 100%|██████████| 61/61 [00:20<00:00,  2.92it/s]


Train Loss: 0.1993


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 13.98it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.76      0.83      0.79       121
     Neutral       0.89      0.82      0.85       576
    Positive       0.72      0.81      0.76       273

    accuracy                           0.82       970
   macro avg       0.79      0.82      0.80       970
weighted avg       0.82      0.82      0.82       970

Confusion Matrix:
[[100  15   6]
 [ 23 473  80]
 [  8  45 220]]
Validation Accuracy: 0.8175


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.07it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.76      0.83      0.79       121
     Neutral       0.89      0.82      0.85       576
    Positive       0.72      0.81      0.76       273

    accuracy                           0.82       970
   macro avg       0.79      0.82      0.80       970
weighted avg       0.82      0.82      0.82       970

Confusion Matrix:
[[100  15   6]
 [ 23 473  80]
 [  8  45 220]]

🔁 Training on 1938 examples (50%)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 122/122 [00:41<00:00,  2.93it/s]


Train Loss: 0.7363


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.18it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.66      0.88      0.76       121
     Neutral       0.89      0.80      0.84       576
    Positive       0.71      0.75      0.73       273

    accuracy                           0.80       970
   macro avg       0.75      0.81      0.78       970
weighted avg       0.81      0.80      0.80       970

Confusion Matrix:
[[107  11   3]
 [ 34 462  80]
 [ 21  48 204]]
Validation Accuracy: 0.7969

Epoch 2/3


Training: 100%|██████████| 122/122 [00:41<00:00,  2.94it/s]


Train Loss: 0.3219


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.17it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.71      0.88      0.78       121
     Neutral       0.89      0.84      0.86       576
    Positive       0.76      0.77      0.77       273

    accuracy                           0.82       970
   macro avg       0.78      0.83      0.80       970
weighted avg       0.83      0.82      0.82       970

Confusion Matrix:
[[106  12   3]
 [ 31 481  64]
 [ 13  49 211]]
Validation Accuracy: 0.8227

Epoch 3/3


Training: 100%|██████████| 122/122 [00:41<00:00,  2.96it/s]


Train Loss: 0.1292


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.21it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.77      0.87      0.81       121
     Neutral       0.86      0.91      0.88       576
    Positive       0.86      0.71      0.78       273

    accuracy                           0.85       970
   macro avg       0.83      0.83      0.82       970
weighted avg       0.85      0.85      0.84       970

Confusion Matrix:
[[105  16   0]
 [ 23 522  31]
 [  9  70 194]]
Validation Accuracy: 0.8464


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.23it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.77      0.87      0.81       121
     Neutral       0.86      0.91      0.88       576
    Positive       0.86      0.71      0.78       273

    accuracy                           0.85       970
   macro avg       0.83      0.83      0.82       970
weighted avg       0.85      0.85      0.84       970

Confusion Matrix:
[[105  16   0]
 [ 23 522  31]
 [  9  70 194]]

🔁 Training on 2907 examples (75%)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 182/182 [01:02<00:00,  2.90it/s]


Train Loss: 0.6856


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.14it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.68      0.86      0.76       121
     Neutral       0.88      0.87      0.87       576
    Positive       0.80      0.73      0.76       273

    accuracy                           0.83       970
   macro avg       0.79      0.82      0.80       970
weighted avg       0.83      0.83      0.83       970

Confusion Matrix:
[[104  16   1]
 [ 26 500  50]
 [ 22  53 198]]
Validation Accuracy: 0.8268

Epoch 2/3


Training: 100%|██████████| 182/182 [01:02<00:00,  2.93it/s]


Train Loss: 0.3012


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.14it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.80      0.88      0.84       121
     Neutral       0.93      0.82      0.87       576
    Positive       0.73      0.90      0.81       273

    accuracy                           0.85       970
   macro avg       0.82      0.86      0.84       970
weighted avg       0.86      0.85      0.85       970

Confusion Matrix:
[[106  10   5]
 [ 21 470  85]
 [  5  23 245]]
Validation Accuracy: 0.8464

Epoch 3/3


Training: 100%|██████████| 182/182 [01:01<00:00,  2.94it/s]


Train Loss: 0.1217


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.16it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.76      0.88      0.81       121
     Neutral       0.91      0.84      0.87       576
    Positive       0.77      0.84      0.80       273

    accuracy                           0.84       970
   macro avg       0.81      0.85      0.83       970
weighted avg       0.85      0.84      0.84       970

Confusion Matrix:
[[106  13   2]
 [ 25 483  68]
 [  9  36 228]]
Validation Accuracy: 0.8423


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.14it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.76      0.88      0.81       121
     Neutral       0.91      0.84      0.87       576
    Positive       0.77      0.84      0.80       273

    accuracy                           0.84       970
   macro avg       0.81      0.85      0.83       970
weighted avg       0.85      0.84      0.84       970

Confusion Matrix:
[[106  13   2]
 [ 25 483  68]
 [  9  36 228]]

🔁 Training on 3876 examples (100%)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 243/243 [01:23<00:00,  2.90it/s]


Train Loss: 0.6003


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.10it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.70      0.89      0.78       121
     Neutral       0.89      0.85      0.87       576
    Positive       0.80      0.77      0.78       273

    accuracy                           0.83       970
   macro avg       0.79      0.84      0.81       970
weighted avg       0.84      0.83      0.83       970

Confusion Matrix:
[[108  12   1]
 [ 34 490  52]
 [ 13  51 209]]
Validation Accuracy: 0.8320

Epoch 2/3


Training: 100%|██████████| 243/243 [01:22<00:00,  2.94it/s]


Train Loss: 0.2613


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.18it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.78      0.83      0.80       121
     Neutral       0.91      0.81      0.86       576
    Positive       0.71      0.84      0.77       273

    accuracy                           0.82       970
   macro avg       0.80      0.83      0.81       970
weighted avg       0.83      0.82      0.83       970

Confusion Matrix:
[[100  13   8]
 [ 21 469  86]
 [  7  36 230]]
Validation Accuracy: 0.8237

Epoch 3/3


Training: 100%|██████████| 243/243 [01:22<00:00,  2.94it/s]


Train Loss: 0.1088


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.17it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.85      0.83      0.84       121
     Neutral       0.85      0.93      0.89       576
    Positive       0.87      0.71      0.78       273

    accuracy                           0.86       970
   macro avg       0.86      0.82      0.84       970
weighted avg       0.86      0.86      0.85       970

Confusion Matrix:
[[100  18   3]
 [ 14 537  25]
 [  4  76 193]]
Validation Accuracy: 0.8557


Evaluating bert-base: 100%|██████████| 61/61 [00:04<00:00, 14.23it/s]


Classification Report:
              precision    recall  f1-score   support

    Negative       0.85      0.83      0.84       121
     Neutral       0.85      0.93      0.89       576
    Positive       0.87      0.71      0.78       273

    accuracy                           0.86       970
   macro avg       0.86      0.82      0.84       970
weighted avg       0.86      0.86      0.85       970

Confusion Matrix:
[[100  18   3]
 [ 14 537  25]
 [  4  76 193]]


In [None]:
# Display the metrics collected for each training-set fraction (keys like "10%").
results

{'10%': {'accuracy': 0.7649484536082474,
  'precision': 0.7754458719371001,
  'recall': 0.65884052815871,
  'f1': 0.6969165399353873,
  'confusion_matrix': array([[ 65,  31,  25],
         [  9, 540,  27],
         [  4, 132, 137]])},
 '25%': {'accuracy': 0.8175257731958763,
  'precision': 0.7899142235063906,
  'recall': 0.8178292141360325,
  'f1': 0.802200816142331,
  'confusion_matrix': array([[100,  15,   6],
         [ 23, 473,  80],
         [  8,  45, 220]])},
 '50%': {'accuracy': 0.8463917525773196,
  'precision': 0.8290660704884676,
  'recall': 0.8282137685546777,
  'f1': 0.8249422369974345,
  'confusion_matrix': array([[105,  16,   0],
         [ 23, 522,  31],
         [  9,  70, 194]])},
 '75%': {'accuracy': 0.8422680412371134,
  'precision': 0.8100460883753007,
  'recall': 0.8499131865609139,
  'f1': 0.8275668802816778,
  'confusion_matrix': array([[106,  13,   2],
         [ 25, 483,  68],
         [  9,  36, 228]])},
 '100%': {'accuracy': 0.8556701030927835,
  'precision'

Now we can start to train the BERT model on the dataset.