<a href="https://colab.research.google.com/github/rrikku628129/EmpathiAI-Data-Management/blob/main/model_development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
BASE_DIR = "/content/drive/MyDrive/AI System"
print("已挂载，工作目录：", BASE_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
已挂载，工作目录： /content/drive/MyDrive/AI System


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("/content/drive/MyDrive/AI System/cleaned_data.csv")

# 简化标签构造
y = df["avg_empathy_score"].apply(lambda x: 1 if x >= 3.5 else 0)  # empathetic or not
X = df["context_clean"] + " [SEP] " + df["response_clean"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)


In [3]:
!pip install pytorch-tabnet



In [4]:
!pip install --upgrade transformers datasets accelerate evaluate



In [5]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/AI System/cleaned_data.csv")

# 1. 选择可用的数值列当特征
feature_cols = [
    "avg_empathy_score",
    "avg_appropriateness_score",
    "avg_relevance_score",
    "empathy_llama-3-2-1b",
    "empathy_llama-3-2-3b",
    "empathy_llama-3-1-8b",
    "empathy_qwen-2-5-7b",
    "appropriateness_llama-3-2-1b",
    "appropriateness_llama-3-2-3b",
    "appropriateness_llama-3-1-8b",]



X_tab_full = df[feature_cols].copy()
y_tab_full = (df["avg_empathy_score"] >= 4.0).astype(int)

from sklearn.model_selection import train_test_split
import numpy as np

X_tab_train, X_tab_test, y_tab_train, y_tab_test = train_test_split(
    X_tab_full.values.astype(np.float32),
    y_tab_full.values.astype(np.int64),
    test_size=0.2,
    stratify=y_tab_full,
    random_state=42)

In [6]:
!pip install pytorch-tabnet >/dev/null

from pytorch_tabnet.tab_model import TabNetClassifier

clf = TabNetClassifier(verbose=1)
clf.fit(
    X_tab_train, y_tab_train,
    eval_set=[(X_tab_test, y_tab_test)],
    eval_name=["valid"],
    eval_metric=["accuracy"],
    max_epochs=100,
    patience=10,
    batch_size=256,
    virtual_batch_size=128)



epoch 0  | loss: 0.28834 | valid_accuracy: 0.32575 |  0:00:02s
epoch 1  | loss: 0.10408 | valid_accuracy: 0.32575 |  0:00:04s
epoch 2  | loss: 0.04881 | valid_accuracy: 0.32717 |  0:00:05s
epoch 3  | loss: 0.01705 | valid_accuracy: 0.33855 |  0:00:06s
epoch 4  | loss: 0.03277 | valid_accuracy: 0.36842 |  0:00:06s
epoch 5  | loss: 0.01311 | valid_accuracy: 0.41963 |  0:00:07s
epoch 6  | loss: 0.01275 | valid_accuracy: 0.41963 |  0:00:07s
epoch 7  | loss: 0.00718 | valid_accuracy: 0.46515 |  0:00:08s
epoch 8  | loss: 0.00162 | valid_accuracy: 0.50213 |  0:00:08s
epoch 9  | loss: 0.01697 | valid_accuracy: 0.50356 |  0:00:09s
epoch 10 | loss: 0.00327 | valid_accuracy: 0.55761 |  0:00:09s
epoch 11 | loss: 0.00316 | valid_accuracy: 0.57468 |  0:00:10s
epoch 12 | loss: 0.00155 | valid_accuracy: 0.58037 |  0:00:10s
epoch 13 | loss: 0.00091 | valid_accuracy: 0.64154 |  0:00:11s
epoch 14 | loss: 0.01007 | valid_accuracy: 0.64296 |  0:00:11s
epoch 15 | loss: 0.02539 | valid_accuracy: 0.66145 |  0



In [7]:
from sklearn.metrics import classification_report

y_tab_pred = clf.predict(X_tab_test)
print(classification_report(y_tab_test, y_tab_pred, target_names=["low_empathy","high_empathy"]))

              precision    recall  f1-score   support

 low_empathy       1.00      1.00      1.00       474
high_empathy       1.00      1.00      1.00       229

    accuracy                           1.00       703
   macro avg       1.00      1.00      1.00       703
weighted avg       1.00      1.00      1.00       703



In [8]:
import json, time
from sklearn.metrics import precision_recall_fscore_support

prec, rec, f1, _ = precision_recall_fscore_support(
    y_tab_test, y_tab_pred, average=None, labels=[1]
)

risk_entry_tabnet = {
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    "event": "tabnet_eval_high_empathy",
    "metrics": {
        "precision_high_empathy": float(prec[0]),
        "recall_high_empathy": float(rec[0]),
        "f1_high_empathy": float(f1[0])
    },
    "note": "High-empathy recall is critical because the system must identify supportive responses for clinical use."}

with open("/content/drive/MyDrive/AI System/risk_log.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(risk_entry_tabnet, ensure_ascii=False) + "\n")

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("/content/drive/MyDrive/AI System/cleaned_data.csv")

# 1. 构造文本输入
# 有的行可能 context_clean / response_clean 缺失，先用空串补
df["context_clean"]  = df["context_clean"].fillna("").astype(str)
df["response_clean"] = df["response_clean"].fillna("").astype(str)

df["text_input"] = df["context_clean"] + " [SEP] " + df["response_clean"]

# 2. 构造标签，举例把同理心分数高/低当成分类目标
df["label_binary"] = (df["avg_empathy_score"] >= 3.5).astype(int)

# 3. 去掉那些 text_input 真的是空的 或 label 是 NaN 的行
valid_mask = (df["text_input"].str.strip() != "") & df["label_binary"].notna()
df_model = df.loc[valid_mask, ["text_input", "label_binary"]].copy()

# 4. train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_model["text_input"],
    df_model["label_binary"],
    test_size=0.2,
    stratify=df_model["label_binary"],
    random_state=42
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))
print("Class balance (train):")
print(y_train.value_counts(normalize=True))

Train size: 2809
Test size: 703
Class balance (train):
label_binary
1    0.58811
0    0.41189
Name: proportion, dtype: float64


In [10]:
from datasets import Dataset

train_dict = {
    "text":   X_train.astype(str).tolist(),
    "labels": y_train.astype(int).tolist()}

test_dict = {
    "text":   X_test.astype(str).tolist(),
    "labels": y_test.astype(int).tolist()}

train_dataset = Dataset.from_dict(train_dict)
test_dataset  = Dataset.from_dict(test_dict)

print(train_dataset[:2])
print(test_dataset[:2])

{'text': ["I'm transgender, I know I am, but I've only told a few friends. I know I can't tell my family because of previous conversations we've had. They just wouldn't accept it.\n My gender dysphoria is getting really difficult to deal with on my own. I need some strategies for dealing with it. What should I do? [SEP] Hi. It can be difficult to handle such a transition on your own. I work with clients to understand their needs and wants. This can involve how to communicate effectively with friends, family, and other loved ones; or, learning how to have self-acceptance. I strongly recommend speaking with a licensed clinician one on one to help facilitate the change you are looking for.", "I feel so alone. I have so many people around me, but it seems as they just listen and dont understand.  They say it will all be okay, or they don't listen to me st all.  Everyone says they are here for me but it doesn't feel like they are. Why do I feel so alone? [SEP] We feel alone because we are n

In [11]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256,)

train_dataset = train_dataset.map(tokenize_fn, batched=True)
test_dataset  = test_dataset.map(tokenize_fn, batched=True)

# HuggingFace Trainer
train_dataset = train_dataset.remove_columns(["text"])
test_dataset  = test_dataset.remove_columns(["text"])

train_dataset = train_dataset.with_format("torch")
test_dataset  = test_dataset.with_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/2809 [00:00<?, ? examples/s]

Map:   0%|          | 0/703 [00:00<?, ? examples/s]

In [12]:
!pip install --upgrade transformers datasets accelerate evaluate

from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer
import numpy as np

model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/AI System/roberta_outputs",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mzhoub1[0m ([33mzhoub1-university-of-florida[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.4967,0.486932
2,0.3849,0.583426
3,0.2739,0.48823


TrainOutput(global_step=1056, training_loss=0.43391442095691507, metrics={'train_runtime': 695.0157, 'train_samples_per_second': 12.125, 'train_steps_per_second': 1.519, 'total_flos': 1108618431759360.0, 'train_loss': 0.43391442095691507, 'epoch': 3.0})

In [13]:
from sklearn.metrics import classification_report
import json, time

preds = trainer.predict(test_dataset)
y_pred = preds.predictions.argmax(axis=-1)
y_true = np.array(y_test.tolist())

rep = classification_report(
    y_true,
    y_pred,
    target_names=["low_empathy", "high_empathy"],
    output_dict=True)

print(rep)

risk_entry = {
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    "event": "roberta_eval_high_empathy",
    "metrics": {
        "precision_high": rep["high_empathy"]["precision"],
        "recall_high":    rep["high_empathy"]["recall"],
        "f1_high":        rep["high_empathy"]["f1-score"],
        "macro_f1":       rep["macro avg"]["f1-score"]
    },
    "note": "We prioritize recall on high_empathy to ensure the assistant reliably surfaces supportive/empathetic responses."}

with open("/content/drive/MyDrive/AI System/risk_log.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(risk_entry, ensure_ascii=False) + "\n")

{'low_empathy': {'precision': 0.8801498127340824, 'recall': 0.8103448275862069, 'f1-score': 0.8438061041292639, 'support': 290.0}, 'high_empathy': {'precision': 0.8738532110091743, 'recall': 0.9225181598062954, 'f1-score': 0.8975265017667845, 'support': 413.0}, 'accuracy': 0.8762446657183499, 'macro avg': {'precision': 0.8770015118716283, 'recall': 0.8664314936962512, 'f1-score': 0.8706663029480242, 'support': 703.0}, 'weighted avg': {'precision': 0.876450671180189, 'recall': 0.8762446657183499, 'f1-score': 0.8753658825422027, 'support': 703.0}}
