In [1]:
import torch
# Report whether a CUDA device is visible to PyTorch.
print("CUDA:", torch.cuda.is_available())
# Only query the device name when CUDA is available: get_device_name(0)
# raises on a CPU-only runtime, which would hide the real problem
# (no GPU attached) behind a traceback.
print(
    "GPU:",
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else "none (CPU only)",
)


CUDA: True
GPU: Tesla T4


Mount Drive

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project paths in the config cell
# (BASE_DIR and everything derived from it) live under this mount point.
drive.mount("/content/drive")


Mounted at /content/drive


In [3]:
import os
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score, classification_report


Paths & config

In [4]:
# Project root on the mounted Drive; every other path is derived from it.
BASE_DIR = "/content/drive/MyDrive/Mental_Health_Sentiment"

# Labeled train / validation splits (CSV).
_PROCESSED_DIR = BASE_DIR + "/data/processed"
TRAIN_FILE = _PROCESSED_DIR + "/train_5class.csv"
VAL_FILE = _PROCESSED_DIR + "/val_5class.csv"

# Input checkpoint (MLM-adapted RoBERTa) and output location for the classifier.
_MODELS_DIR = BASE_DIR + "/models/v1.0"
MLM_MODEL_DIR = _MODELS_DIR + "/roberta_mlm_adapted"
OUTPUT_DIR = _MODELS_DIR + "/roberta_classifier"

# Fine-tuning hyperparameters.
NUM_LABELS, MAX_LENGTH = 5, 256
BATCH_SIZE, EPOCHS = 16, 4
LEARNING_RATE = 2e-5


Load labeled data

In [5]:
# Read the labeled splits from CSV: training first, then validation.
train_df, val_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, VAL_FILE))

# Preview the first rows of the training split.
train_df.head()


Unnamed: 0,text,label
0,@agpublic have you seen this link? me thinks y...,normal
1,"hey guys, i need advice on a situation that ha...",depression
2,life is fucking hard that s it we care which h...,depression
3,"heart attacks, can be avoided by these importa...",anxiety
4,suicide i do not know what to do i was having ...,depression


In [6]:
# Note: DataFrame.size is the total number of cells (rows x columns),
# not the number of rows.
train_df.size

77998

Data preparation for the weighted-loss Trainer (the custom Trainer itself is defined further below)

Label encoding

In [7]:
# Build a deterministic label <-> id mapping: the distinct training labels
# are sorted and numbered 0..N-1 in that order.
labels = sorted(train_df["label"].unique())
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

# Display both mappings.
label2id, id2label


({'anxiety': 0, 'depression': 1, 'normal': 2, 'stress': 3, 'suicidal': 4},
 {0: 'anxiety', 1: 'depression', 2: 'normal', 3: 'stress', 4: 'suicidal'})

Convert to HF Datasets

In [8]:
# Wrap each pandas split in a Hugging Face Dataset (train first, then validation).
train_ds, val_ds = map(Dataset.from_pandas, (train_df, val_df))


In [25]:
import torch
from sklearn.utils.class_weight import compute_class_weight

# Integer class id for every training example.
class_labels = train_df["label"].map(label2id).values

# "balanced" weighting: the printed tensor shows the less frequent classes
# receiving the larger weights.
present_classes = np.unique(class_labels)
weights = compute_class_weight(class_weight="balanced", classes=present_classes, y=class_labels)

# Float tensor, ordered by class id, for use as the CrossEntropyLoss weight.
class_weights = torch.tensor(weights, dtype=torch.float)

class_weights


tensor([2.5382, 0.6330, 0.5993, 3.7680, 0.9154])

Tokenizer (from MLM-adapted model)

In [9]:
# Load the tokenizer stored alongside the MLM-adapted checkpoint so the
# classifier is fine-tuned with the same tokenizer files as that checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained(MLM_MODEL_DIR)


Tokenization function

In [10]:
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch to a fixed length.

    Sequences longer than ``MAX_LENGTH`` are truncated and shorter ones are
    padded up to ``MAX_LENGTH``.

    Args:
        batch: Mapping with a ``"text"`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["text"], **encode_options)


Tokenize datasets

In [11]:
def _label_to_id(example):
    """Return the integer class id of one example under the ``labels`` key."""
    return {"labels": label2id[example["label"]]}


# 1) Tokenize both splits in batches.
train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# 2) Add the integer ``labels`` column derived from the string ``label`` column.
train_ds = train_ds.map(_label_to_id)
val_ds = val_ds.map(_label_to_id)

# 3) Drop the raw string columns now that they have been encoded.
_raw_columns = ["text", "label"]
train_ds = train_ds.remove_columns(_raw_columns)
val_ds = val_ds.remove_columns(_raw_columns)

# 4) Have both datasets return torch tensors.
for _split in (train_ds, val_ds):
    _split.set_format("torch")


Map:   0%|          | 0/38999 [00:00<?, ? examples/s]

Map:   0%|          | 0/9750 [00:00<?, ? examples/s]

Map:   0%|          | 0/38999 [00:00<?, ? examples/s]

Map:   0%|          | 0/9750 [00:00<?, ? examples/s]

Load classification model

In [12]:
# Initialise the sequence classifier from the MLM-adapted checkpoint.
# The load report below lists the classifier.* parameters as MISSING, i.e. the
# classification head is newly initialised and has to be trained here; the
# lm_head.* parameters of the checkpoint are reported as UNEXPECTED and unused.
# The label mappings are stored in the model config via id2label / label2id.
model = RobertaForSequenceClassification.from_pretrained(
    MLM_MODEL_DIR,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id
)


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: /content/drive/MyDrive/Mental_Health_Sentiment/models/v1.0/roberta_mlm_adapted
Key                        | Status     | 
---------------------------+------------+-
lm_head.bias               | UNEXPECTED | 
lm_head.dense.weight       | UNEXPECTED | 
lm_head.dense.bias         | UNEXPECTED | 
lm_head.layer_norm.weight  | UNEXPECTED | 
lm_head.layer_norm.bias    | UNEXPECTED | 
classifier.out_proj.bias   | MISSING    | 
classifier.dense.bias      | MISSING    | 
classifier.dense.weight    | MISSING    | 
classifier.out_proj.weight | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Custom Trainer with weighted loss

In [26]:
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The per-class weights are read from the notebook-level ``class_weights``
    tensor (one weight per class id).
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch mapping containing ``"labels"`` plus the model inputs.
            return_outputs: If true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with recent
                ``transformers`` releases, whose ``Trainer.training_step``
                passes it as a keyword argument; it is not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``
            is true.
        """
        labels = inputs.get("labels")

        # Forward without the labels so the model does not additionally compute
        # its own (unweighted) loss. A filtered copy is used so the caller's
        # batch mapping is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        loss_fct = torch.nn.CrossEntropyLoss(
            weight=class_weights.to(logits.device)
        )

        # Take the class count from the logits themselves so this does not
        # depend on ``model`` exposing a ``.config`` attribute.
        loss = loss_fct(
            logits.view(-1, logits.size(-1)),
            labels.view(-1)
        )

        return (loss, outputs) if return_outputs else loss


Metrics

In [13]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of its logits along axis 1.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predicted)
    macro_f1 = f1_score(labels, predicted, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}


Training arguments

In [20]:
# Training configuration; the numeric hyperparameters come from the config cell.
# NOTE(review): no evaluation strategy is passed here, and the training log
# below shows only the training loss - confirm whether per-epoch validation
# (which would exercise compute_metrics during training) was intended.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    fp16=True,  # 16-bit mixed-precision training
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    logging_steps=200,  # matches the 200-step spacing of the loss table below
    report_to="none"  # presumably disables external experiment loggers - confirm
)



Trainer

In [33]:
# Build the trainer: WeightedTrainer supplies the class-weighted loss, and
# compute_metrics reports accuracy and macro-F1 whenever evaluation or
# prediction runs against labeled data.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
)




Train classifier

In [34]:
# Fine-tune the classifier; as the cell's last expression, the returned
# TrainOutput summary is displayed below the loss table.
trainer.train()


Step,Training Loss
200,0.479889
400,0.529213
600,0.458394
800,0.468263
1000,0.404671
1200,0.47174
1400,0.493997
1600,0.486408
1800,0.511103
2000,0.424137


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=9752, training_loss=0.37282707849616786, metrics={'train_runtime': 2052.3368, 'train_samples_per_second': 76.009, 'train_steps_per_second': 4.752, 'total_flos': 2.052268887590707e+16, 'train_loss': 0.37282707849616786, 'epoch': 4.0})

Final evaluation

In [35]:
# Run the trained model over the validation set and derive hard predictions.
preds = trainer.predict(val_ds)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)

# Pass ``labels`` explicitly so the report rows stay aligned with
# ``target_names``. Without it, classification_report raises a ValueError if
# any class is absent from both y_true and y_pred, because the number of
# observed classes no longer matches the number of target names.
class_ids = list(range(NUM_LABELS))

print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=[id2label[i] for i in class_ids],
    digits=4
))


              precision    recall  f1-score   support

     anxiety     0.8542    0.8542    0.8542       768
  depression     0.7451    0.6435    0.6906      3080
      normal     0.9620    0.8943    0.9269      3254
      stress     0.5831    0.7195    0.6442       517
    suicidal     0.5942    0.7414    0.6597      2131

    accuracy                         0.7692      9750
   macro avg     0.7477    0.7706    0.7551      9750
weighted avg     0.7845    0.7692    0.7731      9750



Save final model

In [36]:
# Persist the fine-tuned classifier and the tokenizer it was trained with
# side by side in OUTPUT_DIR, so the directory can be loaded on its own.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('/content/drive/MyDrive/Mental_Health_Sentiment/models/v1.0/roberta_classifier/tokenizer_config.json',
 '/content/drive/MyDrive/Mental_Health_Sentiment/models/v1.0/roberta_classifier/tokenizer.json')

In [1]:
from transformers import RobertaTokenizerFast

# NOTE(review): this saves the stock "roberta-base" tokenizer into the
# classifier directory, whereas the classifier was trained with the tokenizer
# loaded from MLM_MODEL_DIR. Confirm the two tokenizers are identical before
# relying on these files, since they replace what the training run saved.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
tokenizer.save_pretrained("../models/v1.0/roberta_classifier")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

('../models/v1.0/roberta_classifier\\tokenizer_config.json',
 '../models/v1.0/roberta_classifier\\special_tokens_map.json',
 '../models/v1.0/roberta_classifier\\vocab.json',
 '../models/v1.0/roberta_classifier\\merges.txt',
 '../models/v1.0/roberta_classifier\\added_tokens.json',
 '../models/v1.0/roberta_classifier\\tokenizer.json')