In [1]:
!pip install -q "transformers>=4.44.0" "datasets" "accelerate" "evaluate"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import transformers
from transformers import TrainingArguments

print("Transformers:", transformers.__version__)
print("Has evaluation_strategy:", "evaluation_strategy" in TrainingArguments.__init__.__code__.co_varnames)
print("Init signature:", TrainingArguments.__init__)

Transformers: 4.57.2
Has evaluation_strategy: False
Init signature: <function TrainingArguments.__init__ at 0x79b7f9c62b60>


In [2]:
from pathlib import Path

import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)
import evaluate

In [4]:
# =========================
# 1. 全局配置
# =========================

# 预训练模型名称（可以换成 "roberta-base" 等）
BASE_MODEL_NAME = "distilroberta-base"

# Hugging Face 数据集名称
DATASET_NAME = "tcabanski/mental_health_counseling_conversations_rated"

# 输出模型保存路径（相对项目根目录）
OUTPUT_DIR = Path("src/models/empathy_scorer")

# 随机种子
SEED = 42


def main():
    # 保证可复现
    set_seed(SEED)

    # 确保输出目录存在
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # =========================
    # 2. 加载数据集
    # =========================
    print(f"[INFO] Loading dataset: {DATASET_NAME}")
    ds = load_dataset(DATASET_NAME)

    # 这个数据集只有一个 split: "train"
    full_ds = ds["train"]
    print(f"[INFO] Full dataset size: {len(full_ds)}")

    # 划分 train / validation
    split_ds = full_ds.train_test_split(test_size=0.1, seed=SEED)
    train_ds = split_ds["train"]
    val_ds = split_ds["test"]

    print(f"[INFO] Train size: {len(train_ds)}, Val size: {len(val_ds)}")

    # =========================
    # 3. 初始化 tokenizer
    # =========================
    print(f"[INFO] Loading tokenizer: {BASE_MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

    # =========================
    # 4. 数据预处理函数
    # =========================
    def preprocess(example):
        """
        将 context + response 拼接为模型输入，
        使用 avg_empathy_score 作为回归标签。
        """
        # 有些样本可能 context 为空，做个容错
        context = example.get("context") or ""
        response = example.get("response") or ""

        # 你也可以改成只用 response，看你想让模型更关注哪一部分
        text = f"User: {context}\nCounselor: {response}"

        tokenized = tokenizer(
            text,
            truncation=True,
            max_length=512,
        )

        # 使用 avg_empathy_score 作为回归标签 (float)
        empathy_score = float(example["avg_empathy_score"])
        tokenized["labels"] = empathy_score

        return tokenized

    print("[INFO] Tokenizing train dataset...")
    train_tok = train_ds.map(preprocess, batched=False)

    print("[INFO] Tokenizing validation dataset...")
    val_tok = val_ds.map(preprocess, batched=False)

    # 可选：移除原始文本字段，只保留模型需要的字段
    remove_cols = [c for c in ["context", "response"] if c in train_tok.column_names]
    if remove_cols:
        train_tok = train_tok.remove_columns(remove_cols)
        val_tok = val_tok.remove_columns(remove_cols)

    # 动态 padding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # =========================
    # 5. 定义回归模型
    # =========================
    print(f"[INFO] Loading base model: {BASE_MODEL_NAME}")
    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL_NAME,
        num_labels=1,  # 回归任务
    )
    model.config.problem_type = "regression"

    # =========================
    # 6. 评估指标 (MSE + Pearson)
    # =========================
    mse_metric = evaluate.load("mse")
    pearson_metric = evaluate.load("pearsonr")

    def compute_metrics(eval_pred):
        preds, labels = eval_pred
        # preds shape: (batch_size, 1)
        preds = preds.squeeze()
        labels = labels.squeeze()

        mse = mse_metric.compute(predictions=preds, references=labels)["mse"]
        pearson = pearson_metric.compute(predictions=preds, references=labels)["pearsonr"]

        return {
            "mse": mse,
            "pearsonr": pearson,
        }

    # =========================
    # 7. 训练参数（兼容版本，不使用 evaluation_strategy 等）
    # =========================
    training_args = TrainingArguments(
        output_dir=str(OUTPUT_DIR / "checkpoints"),
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=50,
        seed=SEED,
        fp16=torch.cuda.is_available(),  # 有 GPU 就用 FP16
    )

    # =========================
    # 8. Trainer
    # =========================
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # =========================
    # 9. 开始训练
    # =========================
    print("[INFO] Start training...")
    trainer.train()

    # 训练结束后在验证集上跑一次评估
    print("[INFO] Running final evaluation on validation set...")
    metrics = trainer.evaluate()
    print("[INFO] Eval metrics:", metrics)

    # =========================
    # 10. 保存模型到 OUTPUT_DIR
    # =========================
    print(f"[INFO] Saving model and tokenizer to: {OUTPUT_DIR}")
    trainer.save_model(str(OUTPUT_DIR))
    tokenizer.save_pretrained(str(OUTPUT_DIR))

    print("[INFO] Training finished.")


if __name__ == "__main__":
    main()


[INFO] Loading dataset: tcabanski/mental_health_counseling_conversations_rated


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

rated_dataset.json:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

[INFO] Full dataset size: 3512
[INFO] Train size: 3160, Val size: 352
[INFO] Loading tokenizer: distilroberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

[INFO] Tokenizing train dataset...


Map:   0%|          | 0/3160 [00:00<?, ? examples/s]

[INFO] Tokenizing validation dataset...


Map:   0%|          | 0/352 [00:00<?, ? examples/s]

[INFO] Loading base model: distilroberta-base


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


[INFO] Start training...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mzhoub1[0m ([33mzhoub1-university-of-florida[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,3.249
100,0.5501
150,0.4889
200,0.4903
250,0.4681
300,0.3478
350,0.349
400,0.3829
450,0.3167
500,0.2733


[INFO] Running final evaluation on validation set...


[INFO] Eval metrics: {'eval_loss': 0.23029747605323792, 'eval_mse': 0.23029748689044605, 'eval_pearsonr': 0.8521368391898538, 'eval_runtime': 0.4656, 'eval_samples_per_second': 756.059, 'eval_steps_per_second': 47.254, 'epoch': 3.0}
[INFO] Saving model and tokenizer to: src/models/empathy_scorer
[INFO] Training finished.


In [5]:
!zip -r empathy_scorer.zip src/models/empathy_scorer
from google.colab import files
files.download("empathy_scorer.zip")

  adding: src/models/empathy_scorer/ (stored 0%)
  adding: src/models/empathy_scorer/tokenizer_config.json (deflated 75%)
  adding: src/models/empathy_scorer/training_args.bin (deflated 53%)
  adding: src/models/empathy_scorer/config.json (deflated 50%)
  adding: src/models/empathy_scorer/vocab.json (deflated 59%)
  adding: src/models/empathy_scorer/model.safetensors (deflated 7%)
  adding: src/models/empathy_scorer/tokenizer.json (deflated 82%)
  adding: src/models/empathy_scorer/merges.txt (deflated 53%)
  adding: src/models/empathy_scorer/special_tokens_map.json (deflated 52%)
  adding: src/models/empathy_scorer/checkpoints/ (stored 0%)
  adding: src/models/empathy_scorer/checkpoints/runs/ (stored 0%)
  adding: src/models/empathy_scorer/checkpoints/runs/Dec01_01-08-38_b33ebb5b2795/ (stored 0%)
  adding: src/models/empathy_scorer/checkpoints/runs/Dec01_01-08-38_b33ebb5b2795/events.out.tfevents.1764551320.b33ebb5b2795.2124.0 (deflated 62%)
  adding: src/models/empathy_scorer/checkpoin

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>