In [1]:
import pandas as pd

In [2]:
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [3]:
from torch.utils.data import DataLoader

In [4]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

In [5]:
MODEL = "prajjwal1/bert-mini"  # Optimized small model
tokenizer = BertTokenizerFast.from_pretrained(MODEL)

# Load Model
model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=2)  # Binary Classification

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
sentiment_dataset = load_dataset("imdb")  # Sentiment analysis dataset
suicide_dataset = load_dataset("vibhorag101/suicide_prediction_dataset_phr")  # Suicide detection dataset

In [7]:
# Function to convert string labels to numeric values
def map_labels(example):
    # SuicideWatch dataset: Convert "suicide" → 1, "non-suicide" → 0
    if example["label"] == "suicide":
        example["label"] = 1
    elif example["label"] == "non-suicide":
        example["label"] = 0

    return example

In [12]:
suicide_dataset = suicide_dataset.map(map_labels)

Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

Map:   0%|          | 0/46394 [00:00<?, ? examples/s]

In [8]:
# Tokenize and Rename Labels Efficiently
def preprocess_function(batch):
    tokenized = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = [int(label) for label in batch["label"]]
    return tokenized

In [13]:
suicide_dataset = suicide_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/185574 [00:00<?, ? examples/s]

Map:   0%|          | 0/46394 [00:00<?, ? examples/s]

In [15]:
sentiment_dataset = sentiment_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [16]:
# Remove text column (no longer needed)
suicide_dataset = suicide_dataset.remove_columns(["text"])
sentiment_dataset = sentiment_dataset.remove_columns(["text"])

In [18]:
print(suicide_dataset["train"][0])

{'label': 1, 'input_ids': [101, 2342, 2203, 6114, 4895, 4783, 5400, 6321, 5051, 27469, 2425, 2994, 2113, 2131, 2488, 3984, 2025, 2025, 2113, 2514, 2051, 16873, 5920, 14337, 16592, 2135, 2025, 16592, 2135, 2025, 2191, 2514, 2488, 2215, 2203, 9826, 2699, 2673, 2052, 2191, 2488, 2920, 2542, 2498, 2499, 2215, 3280, 2342, 3280, 2025, 10107, 9015, 2172, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [19]:
# Pytorch Dataset Wrapper
class MultiTaskDataset(Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(item["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(item["labels"], dtype=torch.long),
        }

In [20]:
# Wrap datasets
train_suicide_dataset = MultiTaskDataset(suicide_dataset["train"].select(range(50000)))
test_suicide_dataset = MultiTaskDataset(suicide_dataset["test"])

In [21]:
train_sentiment_dataset = MultiTaskDataset(sentiment_dataset["train"])
test_sentiment_dataset = MultiTaskDataset(sentiment_dataset["test"])

In [22]:
# Create efficient dataloaders
BATCH_SIZE = 8  # Increase batch size for efficiency

train_suicide_loader = DataLoader(train_suicide_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_suicide_loader = DataLoader(test_suicide_dataset, batch_size=BATCH_SIZE, shuffle=False)

train_sentiment_loader = DataLoader(train_sentiment_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_sentiment_loader = DataLoader(test_sentiment_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [23]:
# Set Device (Supports Mac MPS and CUDA)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

In [24]:
print(f"Using device: {device}")

Using device: mps


In [25]:
# Define Optimizer & Loss Function
optimizer = optim.AdamW(model.parameters(), lr=5e-5)  # Higher learning rate for BERT-Mini
loss_fn = nn.CrossEntropyLoss()  # Binary classification loss

In [26]:
# Model optimizations for less memory usage and better training

# Less dropout layers
for module in model.modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = 0.05  # Reduce dropout

In [27]:
model.classifier = nn.Sequential(
    nn.LayerNorm(256),  # Normalize before classification
    nn.Linear(256, 2)   # Keep original classifier
)

In [28]:
model.half()  # Convert model weights to float16

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.05, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.05, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e

In [29]:
print(type(model))

<class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'>


In [30]:
model = torch.compile(model)

In [31]:
print(type(model))

<class 'torch._dynamo.eval_frame.OptimizedModule'>


In [32]:
# Training Configurations
EPOCHS = 3  # More epochs compensate for smaller model

In [33]:
batch = next(iter(train_suicide_loader))
print(batch)

{'input_ids': tensor([[ 101, 6016, 3893,  ...,    0,    0,    0],
        [ 101, 2471, 2095,  ...,    0,    0,    0],
        [ 101, 2052, 2066,  ...,    0,    0,    0],
        ...,
        [ 101, 2025, 2360,  ...,    0,    0,    0],
        [ 101, 2183, 3892,  ...,    0,    0,    0],
        [ 101, 2048, 2154,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 0, 0, 1, 0, 1, 1])}


In [34]:
# Ensure model is fully on the correct device
model.to(device)
for param in model.parameters():
    param.data = param.data.to(device)
    if param.grad is not None:
        param.grad.data = param.grad.data.to(device)

In [35]:
# Train model
for epoch in range(EPOCHS):
    model.train()  # Set model to training mode
    total_loss = 0
    num_batches = min(len(train_suicide_loader), len(train_sentiment_loader))  # Ensure equal batches

    print(f"Epoch {epoch + 1}/{EPOCHS} - Training...")

    for batch_idx, (batch_suicide, batch_sentiment) in enumerate(zip(train_suicide_loader, train_sentiment_loader)):
        optimizer.zero_grad()

        # Suicide Task
        inputs = {key: val.to(device) for key, val in batch_suicide.items() if key in ["input_ids", "attention_mask"]}
        labels = batch_suicide["labels"].to(device)
        outputs = model(**inputs)
        loss_suicide = loss_fn(outputs.logits, labels)

        # Sentiment Task
        inputs = {key: val.to(device) for key, val in batch_sentiment.items() if key in ["input_ids", "attention_mask"]}
        labels = batch_sentiment["labels"].to(device)
        outputs = model(**inputs)
        loss_sentiment = loss_fn(outputs.logits, labels)

        # Combine Losses
        total_loss = (loss_suicide + loss_sentiment) / 2

        # Backpropagation
        total_loss.backward()  
        optimizer.step()

        # Logging Progress
        if batch_idx % 100 == 0:
            print(f"Batch {batch_idx}/{num_batches} - Loss: {total_loss.item():.4f}")

    print(f"Epoch {epoch+1} completed. Avg Loss: {total_loss.item():.4f}")

print("Training complete!")

Epoch 1/3 - Training...


BackendCompilerFailed: backend='inductor' raised:
LoweringException: TypeError: 'NoneType' object is not callable
  target: aten.var_mean.correction
  args[0]: TensorBox(StorageBox(
    Pointwise(
      'mps',
      torch.float32,
      def inner_fn(index):
          i0, i1, i2 = index
          tmp0 = ops.load(buf0, i2 + 256 * i1 + 131072 * i0)
          tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.float16)
          return tmp1
      ,
      ranges=[8, 512, 256],
      origin_node=convert_element_type,
      origins=OrderedSet([convert_element_type])
    )
  ))
  args[1]: [2]
  kwargs: {'correction': 0, 'keepdim': True}

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True
