In [2]:
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast

In [3]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [4]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/greentea/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Load BERT-Small model and tokenizer
MODEL_NAME = "prajjwal1/bert-small"  # Upgrading from BERT-Mini
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# Set device (Mac M1/M2/M3)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

print(f"Using device: {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: mps


In [7]:
sentiment_dataset = load_dataset("imdb")  # Sentiment analysis dataset
suicide_dataset = load_dataset("vibhorag101/suicide_prediction_dataset_phr")  # Suicide detection dataset

In [9]:
def map_labels(example):
    example["label"] = 1 if example["label"] == "suicide" else 0
    return example

suicide_dataset = suicide_dataset.map(map_labels)

Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

Map:   0%|          | 0/46394 [00:00<?, ? examples/s]

In [11]:
def tokenize_function(batch):
    tokenized = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = batch["label"]  # Keep labels
    return tokenized

# Apply tokenization
suicide_dataset = suicide_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

Map:   0%|          | 0/46394 [00:00<?, ? examples/s]

In [12]:
sentiment_dataset = sentiment_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [13]:
# Remove text column to save memory
suicide_dataset = suicide_dataset.remove_columns(["text"])
sentiment_dataset = sentiment_dataset.remove_columns(["text"])

In [14]:
# If dataset has more non-suicidal samples, adjust the loss function so it pays
# attention to suicide-related messages
from torch.nn import CrossEntropyLoss

class_weights = torch.tensor([0.5, 1.5]).to(device)  # Adjust based on dataset imbalance
loss_fn = CrossEntropyLoss(weight=class_weights)

In [15]:
import nlpaug.augmenter.word as naw
from nltk.corpus import wordnet  # Import WordNet after downloading

# Define the augmentation function using WordNet
aug = naw.SynonymAug(aug_src='wordnet', aug_max=2)  # Augment text

def augment_text(example):
    example["text"] = aug.augment(example["text"])
    return example

# Apply augmentation to the Suicide dataset
suicide_dataset = suicide_dataset.map(augment_text)


Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

KeyError: 'text'