In [24]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd

from torch.nn import BCEWithLogitsLoss
# Loss applied to the model's raw logits; the training cell further down re-creates
# this same object before the loop starts.
loss_fn = BCEWithLogitsLoss()

In [25]:
# Load the BERT tokenizer for the same 'bert-base-uncased' checkpoint that the
# model-initialisation cell uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [26]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load data

In [27]:
# Read the spreadsheet into a DataFrame.
df = pd.read_excel("dataset_for_transformer.xlsx")

# Pull the "text" and "label" columns out as plain Python lists, in that order.
texts, labels = (df[column].tolist() for column in ("text", "label"))


In [28]:
import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt' package for NLTK. As the last expression of the cell, the
# call's return value is echoed as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\juan_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# Length of the longest text, counted in NLTK word tokens (this is a word count,
# not the number of tokens the BERT tokenizer will produce).
max_length = max(map(len, map(word_tokenize, texts)))
print(max_length)


237


### Tokenize Data

In [30]:

# Tokenize every text in one call and return PyTorch tensors.
# padding='longest' pads only up to the longest encoded text in the corpus instead of
# always padding to 512; truncation=True with max_length=512 still caps sequences at the
# model's 512 position embeddings. Padded positions are excluded through attention_mask,
# so the shorter tensors only remove wasted attention compute.
input_data = tokenizer(texts, padding='longest', truncation=True, max_length=512, return_tensors="pt")

### Dataloader

In [31]:
from torch.utils.data import DataLoader, TensorDataset, random_split

In [32]:
# Labels as an int64 tensor, one entry per text.
label_tensor = torch.tensor(labels, dtype=torch.long)

# Bundle the tokenizer outputs with the labels so that every dataset item is the tuple
# (input_ids, attention_mask, token_type_ids, label).
dataset = TensorDataset(
    *[input_data[field] for field in ("input_ids", "attention_mask", "token_type_ids")],
    label_tensor,
)

In [33]:
from torch.utils.data import random_split

# Define the sizes: 80% train, 10% validation, and whatever remains (about 10%) for test,
# so the three sizes always add up to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Split the dataset with a fixed seed. Without it every kernel restart produces a
# different partition, so a model saved in one session and reloaded in another would be
# evaluated on a "test" set that can contain examples it was trained on.
SPLIT_SEED = 42
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


In [34]:

# Build one DataLoader per split. Only the training split is reshuffled each epoch;
# validation and test keep a fixed order.
batch_size = 32
train_dataloader, validation_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size, shuffle=reshuffle)
    for split, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)


### Init model

In [35]:
# Model initialization (for binary classification).
# num_labels=1 requests a single output logit per example; the training cell passes that
# logit to BCEWithLogitsLoss together with the 0/1 label cast to float.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

# Move the model to the selected device. Being the cell's last expression, this call's
# return value (the model) is also echoed as the cell output.
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Training loop

In [36]:
from torch.nn import BCEWithLogitsLoss

# Initialize the optimizer and loss function.
# torch.optim.AdamW is used instead of the AdamW class formerly exported by transformers:
# that class is deprecated and no longer importable from recent transformers releases.
# eps and weight_decay are pinned to the old class's defaults (1e-6 and 0.0) so the
# optimisation behaves as it did before the switch.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = BCEWithLogitsLoss()


def _batch_loss(batch):
    """Return the BCE-with-logits loss of `model` on one batch.

    `batch` is the (input_ids, attention_mask, token_type_ids, labels) tuple yielded by
    the DataLoaders; every tensor is moved to `device` before the forward pass.
    """
    b_input_ids, b_attention_mask, b_token_type_ids, b_labels = (t.to(device) for t in batch)
    outputs = model(b_input_ids, attention_mask=b_attention_mask, token_type_ids=b_token_type_ids)
    # squeeze(-1) removes only the trailing num_labels=1 axis. A bare squeeze() would also
    # drop the batch axis when the final batch holds a single example, and the loss would
    # then fail on the logits/labels shape mismatch.
    logits = outputs.logits.squeeze(-1)
    return loss_fn(logits, b_labels.float())


best_val_loss = float("inf")
patience_counter = 0
patience_limit = 1  # Adjust based on your requirements

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Zero out any previously calculated gradients
        optimizer.zero_grad()

        # Forward pass and loss for this batch; accumulate the scalar for the epoch mean
        loss = _batch_loss(batch)
        total_loss += loss.item()

        # Backward pass to calculate gradients, then update the parameters
        loss.backward()
        optimizer.step()

    # Average loss over the training epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch: {epoch + 1}, Train Loss: {avg_train_loss:.4f}")

    # Validation loop
    model.eval()  # Set the model to evaluation mode
    total_val_loss = 0

    # No gradients are needed while scoring the validation split
    with torch.no_grad():
        for batch in validation_dataloader:
            total_val_loss += _batch_loss(batch).item()

    avg_val_loss = total_val_loss / len(validation_dataloader)
    print(f"Epoch: {epoch + 1}, Validation Loss: {avg_val_loss:.4f}")

    # Early stopping: reset the counter on any improvement, otherwise count the stale epoch
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience_limit:
        print("Early stopping triggered!")
        break




Epoch: 1, Train Loss: 0.4144
Epoch: 1, Validation Loss: 0.0059
Epoch: 2, Train Loss: 0.0164
Epoch: 2, Validation Loss: 0.0011
Epoch: 3, Train Loss: 0.0013
Epoch: 3, Validation Loss: 0.0005
Epoch: 4, Train Loss: 0.0041
Epoch: 4, Validation Loss: 0.0003
Epoch: 5, Train Loss: 0.0046
Epoch: 5, Validation Loss: 0.0003


### Save model

In [None]:
# Re-select the device (GPU when CUDA is available, otherwise CPU) and echo it as the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device 

In [None]:
# Directory where model will be saved
save_directory = "./model"

# Save the model so the load cell can restore it with from_pretrained(save_directory)
model.save_pretrained(save_directory)

# Save the tokenizer into the same directory; as the last expression of the cell, the
# call's return value is echoed as the cell output
tokenizer.save_pretrained(save_directory)


#### Load the saved model

In [None]:
# Pick the device first: GPU when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Restore the tokenizer and the fine-tuned model from the save directory, and place the
# model on the selected device (Module.to returns the module itself).
tokenizer = BertTokenizer.from_pretrained(save_directory)
model = BertForSequenceClassification.from_pretrained(save_directory).to(device)


### Evaluation

In [None]:
from sklearn.metrics import f1_score, confusion_matrix

model.to(device)

# Step 1: Predictions and True Labels, gathered over the held-out test split
all_predictions = []
all_true_labels = []
total_eval_accuracy = 0  # running count of correct predictions

model.eval()
for batch in test_dataloader:
    b_input_ids, b_attention_mask, b_token_type_ids, b_labels = batch
    b_input_ids = b_input_ids.to(device)
    b_attention_mask = b_attention_mask.to(device)
    b_token_type_ids = b_token_type_ids.to(device)
    b_labels = b_labels.to(device)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask, token_type_ids=b_token_type_ids)

    # squeeze(-1) removes only the num_labels=1 axis, so a final batch of one example
    # stays 1-D; a 0-d tensor could not be passed to list.extend below.
    logits = outputs.logits.squeeze(-1)
    # A logit above 0 corresponds to a sigmoid probability above 0.5
    predictions = (logits > 0).int()

    all_predictions.extend(predictions.cpu().numpy())
    all_true_labels.extend(b_labels.cpu().numpy())

    # Count the correct predictions in this batch
    total_eval_accuracy += (predictions == b_labels).sum().item()

# Accuracy = correct predictions / number of test examples. The loop iterates over
# test_dataloader, so the denominator must be the test set, not the validation set.
# The variable keeps its original name so any later cell that reads it still works.
avg_val_accuracy = total_eval_accuracy / len(test_dataloader.dataset)


# Step 2: Compute F1-Score
f1 = f1_score(all_true_labels, all_predictions)

# Step 3: Compute Confusion Matrix
conf_matrix = confusion_matrix(all_true_labels, all_predictions)

# The label says "Test" because these numbers come from the test split
print(f"Test Accuracy: {avg_val_accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, log_loss, cohen_kappa_score, matthews_corrcoef, balanced_accuracy_score
)

# Scores computed directly from the hard 0/1 predictions gathered by the evaluation cell.
accuracy = accuracy_score(all_true_labels, all_predictions)
precision = precision_score(all_true_labels, all_predictions)
recall = recall_score(all_true_labels, all_predictions)
f1 = f1_score(all_true_labels, all_predictions)

# Specificity = TN / (TN + FP), read off the flattened confusion matrix.
tn, fp, fn, tp = confusion_matrix(all_true_labels, all_predictions).ravel()
specificity = tn / (tn + fp)

# Agreement-style scores.
mcc = matthews_corrcoef(all_true_labels, all_predictions)
kappa = cohen_kappa_score(all_true_labels, all_predictions)
bal_acc = balanced_accuracy_score(all_true_labels, all_predictions)

# ROC AUC and log loss stay disabled: both need the positive-class probability of every
# sample (an `all_probabilities` list), which the evaluation cell does not collect.
# roc_auc = roc_auc_score(all_true_labels, all_probabilities)
# logloss = log_loss(all_true_labels, all_probabilities)

# Print the metrics, one per line
print("\n".join([
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
    # f"ROC AUC: {roc_auc:.4f}",  # Uncomment once you have all_probabilities
    f"Specificity: {specificity:.4f}",
    f"Matthews Correlation Coefficient: {mcc:.4f}",
    # f"Log Loss: {logloss:.4f}",  # Uncomment once you have all_probabilities
    f"Cohen's Kappa: {kappa:.4f}",
    f"Balanced Accuracy: {bal_acc:.4f}",
]))
