<a href="https://colab.research.google.com/github/OmdenaAI/Bhutan-Mental-Health/blob/ml_nlp/Model_classification_Tsovinar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Lets try to get the libraries we need for the data explaratory and modelling


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
warnings.filterwarnings('ignore')


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_csv("/content/drive/MyDrive/combined_data.csv")

In [4]:
ndata = data.dropna().reset_index(drop=True)  # Reset index here

text = ndata['text']
labels = ndata['mental_state'].values
label_map = {emotion: idx for idx, emotion in enumerate(np.unique(labels))}
num_labels = len(label_map)
y_encoded = np.array([label_map[l] for l in labels])

In [5]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# With this:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels).to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Split data
X_train, X_val, y_train, y_val = train_test_split(
    text, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

In [7]:
# ===== ENHANCED TOKENIZATION STRATEGIES =====

from tqdm import tqdm
import torch
def adaptive_tokenize(text, tokenizer, max_len=512):
    """
    Adaptive tokenization that preserves important content based on text length
    """
    tokens = tokenizer.encode(text, add_special_tokens=True)

    if len(tokens) <= max_len:
        return tokens

    # Strategy 1: For very long texts, use sliding window approach
    if len(tokens) > max_len * 3:
        # Take beginning, middle, and end chunks
        chunk_size = max_len // 3
        beginning = tokens[:chunk_size]
        middle_start = len(tokens) // 2 - chunk_size // 2
        middle = tokens[middle_start:middle_start + chunk_size]
        end = tokens[-chunk_size:]
        return beginning + middle + end

    # Strategy 2: For moderately long texts, use head-tail with bias toward beginning
    else:
        head_ratio = 0.6  # Give more weight to beginning
        head_len = int(max_len * head_ratio)
        tail_len = max_len - head_len
        head = tokens[:head_len]
        tail = tokens[-tail_len:]
        return head + tail

def enhanced_pre_tokenize(texts, labels, tokenizer, max_len=512):
    """Enhanced tokenization with multiple strategies"""
    input_ids = []
    attention_masks = []

    for text in tqdm(texts, desc="Tokenizing"):
        tokens = adaptive_tokenize(text, tokenizer, max_len)

        # Ensure exact length
        if len(tokens) < max_len:
            tokens = tokens + [tokenizer.pad_token_id] * (max_len - len(tokens))
        else:
            tokens = tokens[:max_len]

        mask = [1 if t != tokenizer.pad_token_id else 0 for t in tokens]
        input_ids.append(tokens)
        attention_masks.append(mask)

    return {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(attention_masks),
        'labels': torch.tensor(labels)
    }

# Enhanced tokenization
MAX_LEN = 512
train_data = enhanced_pre_tokenize(X_train.tolist(), y_train, tokenizer, MAX_LEN)
val_data = enhanced_pre_tokenize(X_val.tolist(), y_val, tokenizer, MAX_LEN)


Tokenizing:   0%|          | 0/98595 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (613 > 512). Running this sequence through the model will result in indexing errors
Tokenizing: 100%|██████████| 98595/98595 [06:41<00:00, 245.35it/s]
Tokenizing: 100%|██████████| 24649/24649 [01:33<00:00, 262.59it/s]


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Handle class imbalance
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

In [9]:
# Weighted Focal Loss
class WeightedFocalLoss(nn.Module):
    def __init__(self, alpha=0.5, gamma=2, weights=None, device='cuda'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.weights = weights
        self.device = device

    def forward(self, inputs, targets):
        targets = targets.to(self.device)
        ce_loss = F.cross_entropy(inputs, targets, reduction="none")
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.weights is not None:
            weights = self.weights[targets]
            weighted_loss = focal_loss * weights
            return weighted_loss.mean()
        return focal_loss.mean()

# Initialize loss function
loss_fn = WeightedFocalLoss(alpha=0.5, gamma=3, weights=weights_tensor, device=device)


In [10]:

from torch.optim import AdamW
def get_optimizer_grouped_parameters(model, base_lr=2e-5, lr_decay=0.95):
    # For DistilBERT, there are 6 transformer layers: layer.0 ... layer.5
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = []
    # Classifier head
    optimizer_grouped_parameters.append({
        "params": [p for n, p in model.named_parameters() if "classifier" in n],
        "weight_decay": 0.01,
        "lr": base_lr,
    })
    # Transformer layers (from last to first, higher lr for higher layers)
    for i in range(5, -1, -1):
        lr = base_lr * (lr_decay ** (5 - i))
        optimizer_grouped_parameters.append({
            "params": [p for n, p in model.named_parameters() if f"layer.{i}." in n and not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
            "lr": lr,
        })
        optimizer_grouped_parameters.append({
            "params": [p for n, p in model.named_parameters() if f"layer.{i}." in n and any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
            "lr": lr,
        })
    # Embeddings
    optimizer_grouped_parameters.append({
        "params": [p for n, p in model.named_parameters() if "embeddings" in n],
        "weight_decay": 0.01,
        "lr": base_lr * (lr_decay ** 6),
    })
    return optimizer_grouped_parameters

# Usage example:
optimizer_grouped_parameters = get_optimizer_grouped_parameters(model, base_lr=2e-5, lr_decay=0.95)
optimizer = AdamW(optimizer_grouped_parameters, eps=1e-8)

In [11]:
# Replace TensorDataset with this custom dictionary-style dataset
class PrecomputedDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }




In [12]:
# ===== FIXED UNFREEZING STRATEGY =====
def unfreeze_layers(model, current_epoch, freeze_at_epoch=3):
    """Gradually unfreeze layers during training"""
    # Initial freezing - freeze all except classifier and last layer
    if current_epoch == 0:
        print("Initial freezing: Only classifier and last layer are trainable")
        for name, param in model.named_parameters():
            if 'classifier' not in name and 'layer.5' not in name:
                param.requires_grad = False

    # Unfreeze additional layers after certain epochs
    if current_epoch >= freeze_at_epoch:
        print(f"Epoch {current_epoch}: Unfreezing layers 4 and 5")
        for name, param in model.named_parameters():
            if 'layer.4' in name or 'layer.5' in name or 'classifier' in name:
                param.requires_grad = True

    return model
class CustomLossTrainer(Trainer):
    def __init__(self, *args, loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn
        self.label_names = ["labels"]
        self._current_epoch = -1  # Track current epoch internally

    def create_optimizer(self):
        if self.optimizer is None:
            optimizer_grouped_parameters = get_optimizer_grouped_parameters(
                self.model, base_lr=self.args.learning_rate, lr_decay=0.95
            )
            self.optimizer = AdamW(optimizer_grouped_parameters, eps=1e-8)
        return self.optimizer

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = self.loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

    def train(self, *args, **kwargs):
        self._current_epoch += 1
        if hasattr(self, 'model'):
            self.model = unfreeze_layers(self.model, self._current_epoch)
        return super().train(*args, **kwargs)



In [13]:
# Create datasets using the new class
train_dataset = PrecomputedDataset(
    train_data['input_ids'],
    train_data['attention_mask'],
    train_data['labels']
)

val_dataset = PrecomputedDataset(
    val_data['input_ids'],
    val_data['attention_mask'],
    val_data['labels']
)

In [14]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    eval_strategy='epoch',  # Corrected from 'eval_strategy'
    save_strategy='epoch',
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    report_to='none',
    dataloader_pin_memory=True,
    gradient_accumulation_steps=1,
    ddp_find_unused_parameters=False,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True
)

# Trainer initialization
trainer = CustomLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1)),
        'f1': f1_score(p.label_ids, np.argmax(p.predictions, axis=1), average='weighted'),
        'precision': precision_score(p.label_ids, np.argmax(p.predictions, axis=1), average='weighted'),
        'recall': recall_score(p.label_ids, np.argmax(p.predictions, axis=1), average='weighted')
    },
    loss_fn=loss_fn
)

# Train & Evaluate
trainer.train()
results = trainer.evaluate()
print("Validation Results:", results)

# Save model and tokenizer
trainer.save_model('./results')
tokenizer.save_pretrained('./results')
save_path = "/content/drive/MyDrive/bert_results_jiuhu"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

Initial freezing: Only classifier and last layer are trainable


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3072,0.272885,0.736582,0.736482,0.755229,0.736582
2,0.2387,0.252421,0.760599,0.761481,0.771681,0.760599
3,0.2246,0.225215,0.752607,0.752667,0.774269,0.752607
4,0.1724,0.259248,0.755852,0.756294,0.785415,0.755852


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3072,0.272885,0.736582,0.736482,0.755229,0.736582
2,0.2387,0.252421,0.760599,0.761481,0.771681,0.760599
3,0.2246,0.225215,0.752607,0.752667,0.774269,0.752607
4,0.1724,0.259248,0.755852,0.756294,0.785415,0.755852
5,0.1346,0.285355,0.767414,0.768021,0.785258,0.767414
6,0.1238,0.28317,0.772567,0.773532,0.785364,0.772567


Validation Results: {'eval_loss': 0.28317025303840637, 'eval_accuracy': 0.7725668384112946, 'eval_f1': 0.7735318389182788, 'eval_precision': 0.7853637185507724, 'eval_recall': 0.7725668384112946, 'eval_runtime': 168.3507, 'eval_samples_per_second': 146.415, 'eval_steps_per_second': 9.154, 'epoch': 6.0}


('/content/drive/MyDrive/bert_results_jiuhu/tokenizer_config.json',
 '/content/drive/MyDrive/bert_results_jiuhu/special_tokens_map.json',
 '/content/drive/MyDrive/bert_results_jiuhu/vocab.json',
 '/content/drive/MyDrive/bert_results_jiuhu/merges.txt',
 '/content/drive/MyDrive/bert_results_jiuhu/added_tokens.json')

In [None]:

from transformers import RobertaTokenizer, RobertaModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/content/drive/MyDrive/bert_results_2026"

tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaModel.from_pretrained(model_path).to(device)


Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/bert_results_2026 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def get_embeddings_mean_max_chunked(texts, tokenizer, model, device, max_length=512, stride=256):
    if isinstance(texts, pd.Series):
        texts = texts.tolist()
    if isinstance(texts, str):
        texts = [texts]
    all_embeddings = []
    hidden_size = getattr(model.config, "dim", None) or getattr(model.config, "hidden_size", 768)
    for text in texts:
        # Use tokenizer's built-in chunking
        inputs = tokenizer(
            text,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
            padding="max_length",
            stride=stride,
            return_overflowing_tokens=True
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        chunk_embeddings = []
        for i in range(input_ids.size(0)):
            with torch.no_grad():
                outputs = model(input_ids=input_ids[i:i+1], attention_mask=attention_mask[i:i+1])
            last_hidden = outputs.last_hidden_state
            mask = attention_mask[i:i+1].unsqueeze(-1).expand(last_hidden.size()).float()
            mean_pool = torch.sum(last_hidden * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
            max_pool = torch.max(last_hidden.masked_fill(mask == 0, -1e9), 1)[0]
            pooled = torch.cat([mean_pool, max_pool], dim=1)
            chunk_embeddings.append(pooled.cpu().numpy())
        if chunk_embeddings:
            all_embeddings.append(np.mean(np.vstack(chunk_embeddings), axis=0))
        else:
            all_embeddings.append(np.zeros((hidden_size * 2,)))
    return np.vstack(all_embeddings)

embeddings = get_embeddings_mean_max_chunked(text, tokenizer, model, device)
np.save('/content/drive/MyDrive/roberta_embeddings.npy', embeddings)
pd.DataFrame(embeddings).to_csv('/content/drive/MyDrive/roberta_embeddings.csv', index=False)

In [None]:
embeddings = np.load('/content/drive/MyDrive/roberta_embeddings.npy')
