In [1]:
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast

In [4]:
from datasets import load_dataset

In [7]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [8]:
import nltk
nltk.download('wordnet')  # Needed for synonym replacement
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')  # Needed for sentence tokenization

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/greentea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/greentea/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/greentea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
from transformers import get_scheduler

In [14]:
import os

In [2]:
# Load BERT-Mini model & tokenizer
MODEL_NAME = "prajjwal1/bert-mini"  # Smallest BERT model to fit Mac memory
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# Set device to MPS (Mac GPU) or CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

print(f"Using device: {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: mps


In [5]:
suicide_dataset = load_dataset("vibhorag101/suicide_prediction_dataset_phr")  # Suicide detection dataset

In [6]:
print(suicide_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 185574
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 46394
    })
})


In [5]:
def map_labels(example):
    example["label"] = 1 if example["label"] == "suicide" else 0
    return example

suicide_dataset = suicide_dataset.map(map_labels)


In [6]:
def tokenize_function(batch):
    tokenized = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = batch["label"]  # Keep labels
    return tokenized

# Apply tokenization
suicide_dataset = suicide_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

Map:   0%|          | 0/46394 [00:00<?, ? examples/s]

In [9]:
import nlpaug.augmenter.word as naw
from nltk.corpus import wordnet  # Import WordNet after downloading

# Define the augmentation function using WordNet
aug = naw.SynonymAug(aug_src='wordnet', aug_max=2)  # Augment text

def augment_text(example):
    example["text"] = aug.augment(example["text"])
    return example

# Apply augmentation to the Suicide dataset
suicide_dataset = suicide_dataset.map(augment_text)

Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

Map:   0%|          | 0/46394 [00:00<?, ? examples/s]

In [10]:
from torch.nn import CrossEntropyLoss

# More weight for the underrepresented class
class_weights = torch.tensor([0.5, 1.5]).to(device)  # Adjust as needed
loss_fn = CrossEntropyLoss(weight=class_weights)

In [11]:
class SuicideDataset(Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(item["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(item["labels"], dtype=torch.long)
        }

# Convert to PyTorch dataset
train_suicide_dataset = SuicideDataset(suicide_dataset["train"])

# Create DataLoader
train_suicide_loader = DataLoader(train_suicide_dataset, batch_size=8, shuffle=True)


In [13]:
# Define optimizer and scheduler
optimizer = optim.AdamW(model.parameters(), lr=3e-5)  # Reduce learning rate from 5e-5

# Learning rate scheduler
num_training_steps = len(train_suicide_loader) * 5  # 5 epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps
)

In [15]:
# Checkpoint directory
CHECKPOINT_DIR = "./checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

In [16]:
EPOCHS = 5

# Training Loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    print(f"\nEpoch {epoch + 1}/{EPOCHS} - Training...")

    for batch_idx, batch in enumerate(train_suicide_loader):
        optimizer.zero_grad()

        inputs = {key: val.to(device) for key, val in batch.items() if key in ["input_ids", "attention_mask"]}
        labels = batch["labels"].to(device)

        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, labels)

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

        # Print loss every 100 batches
        if (batch_idx + 1) % 100 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            print(f"Batch {batch_idx + 1}/{len(train_suicide_loader)} - Avg Loss: {avg_loss:.4f}")

        # Save model every 500 batches
        if (batch_idx + 1) % 500 == 0:
            checkpoint_path = f"{CHECKPOINT_DIR}/checkpoint_epoch{epoch}_batch{batch_idx}.pth"
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Checkpoint saved at {checkpoint_path}")
    
    avg_epoch_loss = total_loss / len(train_suicide_loader)
    print(f"Epoch {epoch+1} completed. Avg Loss: {avg_epoch_loss:.4f}")

    # Save entire model at the end of epoch
    MODEL_PATH = f"{CHECKPOINT_DIR}/model_epoch{epoch}"
    model.save_pretrained(MODEL_PATH)
    tokenizer.save_pretrained(MODEL_PATH)
    print(f"Full model saved for Epoch {epoch+1} at {MODEL_PATH}")

print("\nTraining complete!")


Epoch 1/5 - Training...
Batch 100/23197 - Avg Loss: 0.5940
Batch 200/23197 - Avg Loss: 0.5391
Batch 300/23197 - Avg Loss: 0.4750
Batch 400/23197 - Avg Loss: 0.4314
Batch 500/23197 - Avg Loss: 0.3931
Checkpoint saved at ./checkpoints/checkpoint_epoch0_batch499.pth
Batch 600/23197 - Avg Loss: 0.3747
Batch 700/23197 - Avg Loss: 0.3554
Batch 800/23197 - Avg Loss: 0.3349
Batch 900/23197 - Avg Loss: 0.3216
Batch 1000/23197 - Avg Loss: 0.3143
Checkpoint saved at ./checkpoints/checkpoint_epoch0_batch999.pth
Batch 1100/23197 - Avg Loss: 0.3063
Batch 1200/23197 - Avg Loss: 0.2979
Batch 1300/23197 - Avg Loss: 0.2901
Batch 1400/23197 - Avg Loss: 0.2828
Batch 1500/23197 - Avg Loss: 0.2762
Checkpoint saved at ./checkpoints/checkpoint_epoch0_batch1499.pth
Batch 1600/23197 - Avg Loss: 0.2709
Batch 1700/23197 - Avg Loss: 0.2660
Batch 1800/23197 - Avg Loss: 0.2613
Batch 1900/23197 - Avg Loss: 0.2559
Batch 2000/23197 - Avg Loss: 0.2522
Checkpoint saved at ./checkpoints/checkpoint_epoch0_batch1999.pth
Ba

Traceback (most recent call last):
  File "/Users/greentea/Documents/DetectionBot/virtual_env/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3549, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/lm/l86zt3r51rq7110601lpcfj80000gn/T/ipykernel_45764/2896336531.py", line 23, in <module>
    total_loss += loss.item()
                  ^^^^^^^^^^^
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/greentea/Documents/DetectionBot/virtual_env/lib/python3.12/site-packages/pygments/styles/__init__.py", line 45, in get_style_by_name
ModuleNotFoundError: No module named 'pygments.styles.default'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/greentea/Documents/DetectionBot/virtual_env/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 2173, in showtraceback
    s

In [2]:
# Load latest model and test