In [1]:
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast

In [2]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [3]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [11]:
import nltk
# Download the NLTK resources this notebook asks for. NLTK reports
# "already up-to-date" for packages that are present locally.
nltk.download('wordnet')  # Needed for synonym replacement
nltk.download('averaged_perceptron_tagger_eng')  # presumably the POS tagger the synonym augmenter relies on — TODO confirm
nltk.download('punkt')  # Needed for sentence tokenization

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/greentea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/greentea/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt to /Users/greentea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
from transformers import get_scheduler

In [5]:
# Load the BERT-Small checkpoint with a 2-class sequence-classification head
# (the head weights are newly initialised and must be trained), together with
# the matching fast tokenizer.
MODEL_NAME = "prajjwal1/bert-small"  # Upgrading from BERT-Mini
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# Pick the best available accelerator: a CUDA GPU first, then Apple-silicon
# MPS (Mac M1/M2/M3), and finally the CPU so the notebook runs on any machine.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)

print(f"Using device: {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: mps


In [6]:
# Hub identifiers of the two corpora loaded by this notebook.
SENTIMENT_DATASET_ID = "imdb"
SUICIDE_DATASET_ID = "vibhorag101/suicide_prediction_dataset_phr"

sentiment_dataset = load_dataset(SENTIMENT_DATASET_ID)  # Sentiment analysis dataset
suicide_dataset = load_dataset(SUICIDE_DATASET_ID)  # Suicide detection dataset

In [7]:
def map_labels(example):
    """Convert a record's ``label`` field to an integer class id, in place.

    The string ``"suicide"`` becomes ``1`` and any other string becomes ``0``.
    A label that is already ``0`` or ``1`` is kept unchanged, so re-running
    the mapping cell on an already-converted dataset does not collapse every
    label to ``0``.

    Args:
        example: Mapping with a ``"label"`` key.

    Returns:
        The same ``example`` object, with ``example["label"]`` set to 0 or 1.
    """
    label = example["label"]
    if isinstance(label, int) and label in (0, 1):
        # Already converted by an earlier run of the mapping cell.
        example["label"] = int(label)
    else:
        example["label"] = 1 if label == "suicide" else 0
    return example

# Run map_labels over suicide_dataset and rebind the name to the mapped result.
suicide_dataset = suicide_dataset.map(map_labels)

In [9]:
def tokenize_function(batch):
    """Tokenize a batch of records and carry their labels along.

    The texts in ``batch["text"]`` are truncated and padded to a fixed length
    of 512 tokens. The values of ``batch["label"]`` are stored under the
    ``"labels"`` key of the returned encoding.
    """
    encoding = tokenizer(
        batch["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    encoding["labels"] = batch["label"]  # Keep labels
    return encoding

# Tokenize both corpora in batched mode, suicide corpus first
suicide_dataset, sentiment_dataset = (
    corpus.map(tokenize_function, batched=True)
    for corpus in (suicide_dataset, sentiment_dataset)
)


Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

Map:   0%|          | 0/46394 [00:00<?, ? examples/s]

In [12]:
import nlpaug.augmenter.word as naw
from nltk.corpus import wordnet  # Import WordNet after downloading

# Define the augmentation function using WordNet
aug = naw.SynonymAug(aug_src='wordnet', aug_max=2)  # Augment text

def augment_text(example):
    """Replace words of ``example["text"]`` with WordNet synonyms.

    The augmenter is configured with ``aug_max=2``. Recent nlpaug releases
    return a list from ``augment`` while older ones return a plain string;
    both are normalised to a single string so the text can be tokenized
    again. If the augmenter returns nothing (e.g. for empty input), the
    original text is kept.

    Args:
        example: Record with a ``"text"`` field.

    Returns:
        The same record with ``"text"`` replaced by its augmented version.
    """
    original = example["text"]
    augmented = aug.augment(original)
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else original
    example["text"] = augmented
    return example

# Augment the training split only, so the held-out split stays untouched for
# evaluation. The dataset was already tokenized before this cell runs, so the
# augmented text is tokenized again here; otherwise input_ids would still
# encode the original text and the augmentation would never reach the model.
suicide_dataset["train"] = (
    suicide_dataset["train"]
    .map(augment_text)
    .map(tokenize_function, batched=True)
)

Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

Map:   0%|          | 0/46394 [00:00<?, ? examples/s]

In [13]:
# Drop the raw text column from both corpora to save memory
suicide_dataset = suicide_dataset.remove_columns("text")
sentiment_dataset = sentiment_dataset.remove_columns("text")

In [14]:
# If dataset has more non-suicidal samples, adjust the loss function so it pays
# attention to suicide-related messages
from torch.nn import CrossEntropyLoss

# Per-class loss weights, created directly on the training device:
# class 0 -> 0.5, class 1 -> 1.5 (adjust based on dataset imbalance).
class_weights = torch.tensor([0.5, 1.5], device=device)
loss_fn = CrossEntropyLoss(weight=class_weights)

In [15]:
# Wrap in Pytorch Dataset
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style PyTorch dataset over an indexable collection of tokenized records.

    Every record must provide ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"``; indexing returns those three fields as ``torch.long``
    tensors and ignores any other field.
    """

    # Record fields exposed to the caller, in output order.
    _TENSOR_FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, hf_dataset):
        """Keep a reference to the wrapped dataset (no copy is made)."""
        self.dataset = hf_dataset

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the record at ``idx`` as a dict of ``torch.long`` tensors."""
        record = self.dataset[idx]
        return {
            field: torch.tensor(record[field], dtype=torch.long)
            for field in self._TENSOR_FIELDS
        }

# Wrap the training splits so that indexing yields tensors
train_suicide_dataset = CustomDataset(suicide_dataset["train"])
train_sentiment_dataset = CustomDataset(sentiment_dataset["train"])

# Both loaders share the same settings: shuffled mini-batches of 8 examples
loader_options = {"batch_size": 8, "shuffle": True}
train_suicide_loader = DataLoader(train_suicide_dataset, **loader_options)
train_sentiment_loader = DataLoader(train_sentiment_dataset, **loader_options)


In [19]:
# AdamW over all model parameters; learning rate lowered from 5e-5
optimizer = optim.AdamW(model.parameters(), lr=3e-5)

# Linear schedule with 500 warmup steps, sized for 5 passes over the suicide
# loader. The 5 must match the number of epochs the training loop runs.
steps_per_epoch = len(train_suicide_loader)
num_training_steps = steps_per_epoch * 5  # 5 epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps,
)

In [None]:
EPOCHS = 5

# Fine-tune the classifier on the suicide-detection loader for EPOCHS passes
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    num_batches = len(train_suicide_loader)

    print(f"\nEpoch {epoch + 1}/{EPOCHS} - Training...")

    for step, batch in enumerate(train_suicide_loader, start=1):
        optimizer.zero_grad()

        # Only the model inputs go through the forward pass; the labels are
        # kept aside because the loss comes from the class-weighted loss_fn.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

        # Report the running average loss every 100 batches
        if step % 100 == 0:
            print(f"Batch {step}/{num_batches} - Avg Loss: {total_loss / step:.4f}")

    # Report the mean loss over the whole epoch
    avg_epoch_loss = total_loss / num_batches
    print(f"Epoch {epoch+1} completed. Avg Loss: {avg_epoch_loss:.4f}")

print("\nTraining complete!")



Epoch 1/5 - Training...
Batch 100/23197 - Avg Loss: 0.5307
