In [1]:
import pandas as pd

In [2]:
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [3]:
from torch.utils.data import DataLoader

In [4]:
from torch.cuda.amp import autocast, GradScaler

In [5]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

In [6]:
import os

In [7]:
MODEL = "prajjwal1/bert-mini"  # Optimized small model
tokenizer = BertTokenizerFast.from_pretrained(MODEL)

# Load Model
model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=2)  # Binary Classification

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
sentiment_dataset = load_dataset("imdb")  # Sentiment analysis dataset
suicide_dataset = load_dataset("vibhorag101/suicide_prediction_dataset_phr")  # Suicide detection dataset

In [9]:
print(suicide_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 185574
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 46394
    })
})


In [10]:
print(sentiment_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [11]:
# Function to convert string labels to numeric values
def map_labels(example):
    # SuicideWatch dataset: Convert "suicide" → 1, "non-suicide" → 0
    if example["label"] == "suicide":
        example["label"] = 1
    elif example["label"] == "non-suicide":
        example["label"] = 0

    return example

In [12]:
suicide_dataset = suicide_dataset.map(map_labels)

In [15]:
# Tokenize and Rename Labels Efficiently
def preprocess_function(batch):
    tokenized = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = [int(label) for label in batch["label"]]
    return tokenized

# Apply tokenization to both datasets
suicide_dataset = suicide_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

Map:   0%|          | 0/46394 [00:00<?, ? examples/s]

In [19]:
sentiment_dataset = sentiment_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [20]:
# Remove text column (no longer needed)
suicide_dataset = suicide_dataset.remove_columns(["text"])
sentiment_dataset = sentiment_dataset.remove_columns(["text"])

In [21]:
print(suicide_dataset["train"][0])

{'label': 1, 'input_ids': [101, 2342, 2203, 6114, 4895, 4783, 5400, 6321, 5051, 27469, 2425, 2994, 2113, 2131, 2488, 3984, 2025, 2025, 2113, 2514, 2051, 16873, 5920, 14337, 16592, 2135, 2025, 16592, 2135, 2025, 2191, 2514, 2488, 2215, 2203, 9826, 2699, 2673, 2052, 2191, 2488, 2920, 2542, 2498, 2499, 2215, 3280, 2342, 3280, 2025, 10107, 9015, 2172, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [22]:
# Pytorch Dataset Wrapper
class MultiTaskDataset(Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(item["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(item["labels"], dtype=torch.long),
        }

# Wrap datasets
train_suicide_dataset = MultiTaskDataset(suicide_dataset["train"])
test_suicide_dataset = MultiTaskDataset(suicide_dataset["test"])

train_sentiment_dataset = MultiTaskDataset(sentiment_dataset["train"])
test_sentiment_dataset = MultiTaskDataset(sentiment_dataset["test"])


In [23]:
print(train_suicide_dataset)

<__main__.MultiTaskDataset object at 0x137503e30>


In [24]:
# Create efficient dataloaders
BATCH_SIZE = 8  # Increase batch size for efficiency

train_suicide_loader = DataLoader(train_suicide_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_suicide_loader = DataLoader(test_suicide_dataset, batch_size=BATCH_SIZE, shuffle=False)

train_sentiment_loader = DataLoader(train_sentiment_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_sentiment_loader = DataLoader(test_sentiment_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [25]:
# Set Device (Supports Mac MPS and CUDA)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

In [26]:
print(f"Using device: {device}")

Using device: mps


In [27]:
# Define Optimizer & Loss Function
optimizer = optim.AdamW(model.parameters(), lr=5e-5)  # Higher learning rate for BERT-Mini
loss_fn = nn.CrossEntropyLoss()  # Binary classification loss

In [28]:
# Model optimizations for less memory usage and better training

# Less dropout layers
for module in model.modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = 0.05  # Reduce dropout

In [29]:
model.classifier = nn.Sequential(
    nn.LayerNorm(256),  # Normalize before classification
    nn.Linear(256, 2)   # Keep original classifier
)

In [30]:
model.half()  # Convert model weights to float16

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.05, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.05, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e

In [31]:
# Training Configurations
EPOCHS = 3  # More epochs compensate for smaller model

In [32]:
batch = next(iter(train_suicide_loader))
print(batch)

{'input_ids': tensor([[  101,  2131, 24209,  ...,     0,     0,     0],
        [  101,  5962,  7477,  ...,     0,     0,     0],
        [  101,  2699,  4830,  ...,     0,     0,     0],
        ...,
        [  101,  2601,  2617,  ...,     0,     0,     0],
        [  101,  3087,  2842,  ...,     0,     0,     0],
        [  101,  2267, 24665,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 0, 0, 1, 1, 1, 1])}


In [33]:
print(type(model))

<class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'>


In [34]:
# Ensure model is fully on the correct device
model.to(device)
for param in model.parameters():
    param.data = param.data.to(device)
    if param.grad is not None:
        param.grad.data = param.grad.data.to(device)

In [38]:
# Train model
for epoch in range(EPOCHS):
    model.train()  # Set model to training mode
    total_loss = 0
    num_batches = min(len(train_suicide_loader), len(train_sentiment_loader))  # Ensure equal batches

    print(f"Epoch {epoch + 1}/{EPOCHS} - Training...")

    for batch_idx, (batch_suicide, batch_sentiment) in enumerate(zip(train_suicide_loader, train_sentiment_loader)):
        optimizer.zero_grad()

        # Suicide Task
        inputs = {key: val.to(device) for key, val in batch_suicide.items() if key in ["input_ids", "attention_mask"]}
        labels = batch_suicide["labels"].to(device)
        outputs = model(**inputs)
        loss_suicide = loss_fn(outputs.logits, labels)

        # Sentiment Task
        inputs = {key: val.to(device) for key, val in batch_sentiment.items() if key in ["input_ids", "attention_mask"]}
        labels = batch_sentiment["labels"].to(device)
        outputs = model(**inputs)
        loss_sentiment = loss_fn(outputs.logits, labels)

        # Combine Losses
        total_loss = (loss_suicide + loss_sentiment) / 2

        # Backpropagation
        total_loss.backward()  
        optimizer.step()

        # Logging Progress
        if batch_idx % 100 == 0:
            print(f"Batch {batch_idx}/{num_batches} - Loss: {total_loss.item():.4f}")

    print(f"Epoch {epoch+1} completed. Avg Loss: {total_loss.item():.4f}")

print("Training complete!")

Epoch 1/3 - Training...
Batch 0/3125 - Loss: nan


KeyboardInterrupt: 

In [23]:
# Save model
MODEL_PATH = "./trained_suicide_detection_model_bertmini"  # saved in same folder
# Ensure directory exists
os.makedirs(MODEL_PATH, exist_ok=True)

In [24]:
# Save model and tokenizer
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

('./trained_suicide_detection_model_bertmini/tokenizer_config.json',
 './trained_suicide_detection_model_bertmini/special_tokens_map.json',
 './trained_suicide_detection_model_bertmini/vocab.txt',
 './trained_suicide_detection_model_bertmini/added_tokens.json',
 './trained_suicide_detection_model_bertmini/tokenizer.json')

In [26]:
# Load saved model
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)

In [27]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-1

In [28]:
print("Model successfully loaded from:", MODEL_PATH)

Model successfully loaded from: ./trained_suicide_detection_model_bertmini


In [61]:
# Test on new messages
def predict_suicide_risk(text):
    """Runs a prediction on a single text input"""
    
    model.eval()  # Set model to evaluation mode

    # Tokenize input text
    inputs = tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

    # Move to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert logits to probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get prediction label
    predicted_label = torch.argmax(probs, dim=-1).item()
    
    # Map label to class
    label_map = {0: "Non-Suicidal", 1: "High Suicide Risk"}
    return label_map[predicted_label], probs.cpu().numpy()

# Example Test
text_input = "I want to hurt myself."
prediction, confidence = predict_suicide_risk(text_input)

print(f"Prediction: {prediction}")
print(f"Confidence: {confidence}")


Prediction: Non-Suicidal
Confidence: [[0.5663725  0.43362752]]


In [63]:
# Load Model for Testing
from transformers import pipeline

suicide_classifier = pipeline("text-classification", model=MODEL_PATH, top_k=1)
sentiment_classifier = pipeline("text-classification", model=MODEL_PATH, top_k=1)

# Test
test_message = "I feel hopeless and don't want to live."
suicide_result = suicide_classifier(test_message)
sentiment_result = sentiment_classifier(test_message)

print(f"Suicide Classification: {suicide_result}")
print(f"Sentiment Classification: {sentiment_result}")

Device set to use mps:0
Device set to use mps:0


Suicide Classification: [[{'label': 'LABEL_0', 'score': 0.5520013570785522}]]
Sentiment Classification: [[{'label': 'LABEL_0', 'score': 0.5520013570785522}]]


In [1]:
# Test on suicide test data
import time
from sklearn.metrics import accuracy_score, classification_report

# Start timing
start_time = time.time()

# Set model to evaluation mode
model.eval()

true_labels = []
predictions = []

# Total number of batches in test dataset
num_batches = len(test_suicide_loader)

print(f"Evaluating on {num_batches} batches...\n")

with torch.no_grad():
    for batch_idx, batch in enumerate(test_suicide_loader):
        inputs = {key: val.to(device) for key, val in batch.items() if key in ["input_ids", "attention_mask"]}
        labels = batch["labels"].to(device)

        outputs = model(**inputs)
        logits = outputs.logits

        # Get predicted class (0 = Non-Suicidal, 1 = Suicidal)
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        labels = labels.cpu().numpy()

        predictions.extend(preds)
        true_labels.extend(labels)

        # 🔹 Print progress every 100 batches
        if (batch_idx + 1) % 100 == 0 or (batch_idx + 1) == num_batches:
            print(f"Processed {batch_idx + 1}/{num_batches} batches...")

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
end_time = time.time()

print(f"\nTest Accuracy: {accuracy:.4f}")
print(f"Total Time for Testing: {end_time - start_time:.2f} seconds")

# Print classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-Suicidal", "Suicidal"]))

In [29]:
# Test on new messages
def predict_suicide_risk(text):
    """Runs a prediction on a single text input"""
    
    model.eval()  # Set model to evaluation mode

    # Tokenize input text
    inputs = tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

    # Move to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert logits to probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get prediction label
    predicted_label = torch.argmax(probs, dim=-1).item()
    
    # Map label to class
    label_map = {0: "Non-Suicidal", 1: "High Suicide Risk"}
    return label_map[predicted_label], probs.cpu().numpy()

# Example Test
text_input = "I want to hurt myself."
prediction, confidence = predict_suicide_risk(text_input)

print(f"Prediction: {prediction}")
print(f"Confidence: {confidence}")

Prediction: Non-Suicidal
Confidence: [[0.63923585 0.3607641 ]]
