This is a performance testing with the dataset of adolescent on two model. A dual model setup is done here.

The first model ***BERT*** is going to train on the emotion and the intent of the user and the ***T5*** model is going to generate text based on the questions.

The datasets used here -> **casual_lm_train.jsonl** and **masked_lm_train.jsonl**

*casual_lm_train* is used for the **T5** training and the *masked_lm* train is for the **Electra**.

*T5*'s performance is going to evaluate by its perplexity that is how certain it's response is or other word what is it's accuracy to predict the next word.

*BERT*'s performance is evaluated by it's accuracy to the prediction of the emotion and intention of the questions.

In [1]:
import os
import re
import json
import random
import math
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset as TorchDataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from transformers import DataCollatorForSeq2Seq

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import evaluate
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

rouge_metric = evaluate.load('rouge')
bleu_metric = evaluate.load('bleu')

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7fccaf7c7050>

**Config**

In [2]:
@dataclass
class Config:
    jsonl_path = "/home/ghost-ed/Documents/Model_Tuning/adolescent_chatbot_train.jsonl"
    work_dir = "/home/ghost-ed/Documents/Model_Tuning"

    # label space
    intents = (
        "seek_help", "venting", "ask_question", "share_success", "neutral"
    )
    emotions = (
        "anxious", "sad", "angry", "lonely", "neutral"
    )

    clf_model_name = "bert-base-uncased"
    gen_model_name = "t5-small"

    clf_epochs = 4
    gen_epochs = 4
    batch_size = 8
    lr = 5e-5
    weight_decay = 0.01

    # generation config
    max_new_token = 96
    do_sample = True
    temperature = 0.9
    top_p = 0.92
    repetition_penalty = 1.08


CFG = Config()
os.makedirs(CFG.work_dir, exist_ok=True)

**Dataset loading and heuristic labeling**

In [3]:
def load_jsonl_messages(path) -> pd.DataFrame:
    rows = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            msgs = obj.get('messages', [])
            user = None
            assistant = None
            for m in msgs:
                if m.get('role') == 'user' and user is None:
                    user = m.get('content', '').strip()
                if m.get('role') == 'assistant' and assistant is None:
                    assistant = m.get('content', '').strip()
            if user and assistant:
                rows.append({"text": user, "response": assistant})
    return pd.DataFrame(rows)


# intent/emontion heuristics
INTENT_PATTERNS = {
    "seek_help": [r"help", r"what should i", r"how do i", r"can you", r"advice"],
    "venting": [r"no one", r"nobody", r"so tired", r"fed up", r"i hate"],
    "ask_question": [r"\?$", r"why ", r"what ", r"how ", r"when ", r"where "],
    "share_success": [r"i did (it|well)", r"i feel proud", r"happy to say"],
}


EMOTION_PATTERNS = {
    "anxious": [r"anxious|nervous|worried|on edge|overthink"],
    "sad": [r"sad|down|depress|worthless|cry"],
    "angry": [r"angry|mad|furious|annoy|irritat"],
    "lonely": [r"alone|lonely|isolated|no one"],
}


CRISIS_PATTERNS = {
    'self_harm': [r"kill myself", r"suicide", r"end my life", r"hurt myself"],
    'abuse': [r"abused", r"violence at home", r"beaten", r"forced"],
    'harm_others': [r"hurt someone", r"kill them", r"revenge"],
}


def first_match(text, patterns, default="neutral") -> str:
    t = text.lower()
    for label, pats in patterns.items():
        for p in pats:
            if re.search(p, t):
                return label
    return default


# synonyms for diversity
SYNONYMS = {
    'exam': ['test', 'assessment', 'paper'],
    'school': ['class', 'college', 'campus'],
    'anxious': ['nervous', 'worried', 'on edge'],
    'angry': ['upset', 'frustrated', 'mad'],
    'sad': ['down', 'blue', 'low'],
    'friend': ['peer', 'classmate', 'buddy'],
    'parents': ['family', 'mom and dad', 'guardians'],
}


def simple_augment(text, p=0.25, max_replacement=2) -> str:
    tokens = re.findall(r"\w+|\W", text)
    replaced = 0
    for i, toks in enumerate(tokens):
        low = toks.lower()
        if low in SYNONYMS and random.random() < p and replaced < max_replacement:
            tokens[i] = random.choice(SYNONYMS[low])
            replaced += 1
    return ''.join(tokens)


# load
if os.path.exists(CFG.jsonl_path):
    df = load_jsonl_messages(CFG.jsonl_path)
else:
    df = pd.DataFrame([
        {"text": "I feel so anxious about my exams.",
            "response": "It's okay to feel this way. Let's try planning small study steps."},
        {"text": "No one understands me at school.",
            "response": "That can feel isolating. I'm here to listen—want to share more?"},
        {"text": "How do I stop overthinking everything?",
            "response": "We can try a quick grounding exercise. Want to try together?"},
    ])
    print("df column: ", df.columns)

# heuristics label
df['intent'] = df['text'].apply(lambda x: first_match(x, INTENT_PATTERNS, default="neutral"))
df['emotion'] = df['text'].apply(lambda x: first_match(x, EMOTION_PATTERNS, default="neutral"))

# label maps
intent_list = list(dict.fromkeys(list(CFG.intents) + sorted(df['intent'].unique().tolist())))
emotion_list = list(dict.fromkeys(list(CFG.emotions) + sorted(df['emotion'].unique().tolist())))

intent_label2id = {l:i for i, l in enumerate(intent_list)}
intent_id2label = {l:i for i, l in intent_label2id.items()}
emotion_label2id = {l:i for i, l in enumerate(emotion_list)}
emotion_id2label = {l:i for i, l in emotion_label2id.items()}

# numeric labels
df['intent_label'] = df['intent'].map(intent_label2id)
df['emotion_label'] = df['emotion'].map(emotion_label2id)

print("Sample: ")
print(df.head(5))
print("\nIntent labels: ", intent_id2label)
print("\nEmotion labels: ", emotion_id2label)

#split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df[['intent_label','emotion_label']])

Sample: 
                                                text  \
0  How does your relationship with your mother af...   
1  What does a high personal growth score mean du...   
2    Why is purpose in life important for teenagers?   
3  How does your relationship with your mother af...   
4  What does a high personal growth score mean du...   

                                            response        intent  emotion  \
0  In this case, the respondent scored 30, 27, an...  ask_question  neutral   
1  A personal growth score of 22 indicates that t...  ask_question  neutral   
2  In this response, the adolescent scored 27 on ...  ask_question  neutral   
3  In this case, the respondent scored 30, 34, an...  ask_question  neutral   
4  A personal growth score of 24 indicates that t...  ask_question  neutral   

   intent_label  emotion_label  
0             2              4  
1             2              4  
2             2              4  
3             2              4  
4             

In [9]:
clf_tokenizer = AutoTokenizer.from_pretrained(CFG.clf_model_name)


class ClfDataset(TorchDataset):
    def __init__(self, dataframe, label_col):
        self.df = dataframe.reset_index(drop=True)
        self.label_col = label_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        enc = clf_tokenizer(
            row['text'],
            truncation=True,
            padding=False,
            max_length=256,
            return_tensors='pt'
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item['labels'] = torch.tensor(
            int(row[self.label_col]), dtype=torch.long)
        return item


train_intent = ClfDataset(train_df, 'intent_label')
train_emotion = ClfDataset(train_df, 'emotion_label')
test_intent = ClfDataset(test_df, 'intent_label')
test_emotion = ClfDataset(test_df, 'emotion_label')

data_collator = DataCollatorWithPadding(clf_tokenizer)

clf_args = TrainingArguments(
    output_dir=os.path.join(CFG.work_dir, 'clf'),
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size,
    num_train_epochs=CFG.clf_epochs,
    logging_steps=50,
    learning_rate=CFG.lr,
    seed=SEED,
)

# intent classifier
clf_intent = AutoModelForSequenceClassification.from_pretrained(
    CFG.clf_model_name, num_labels=len(intent_label2id)
)

trainer_intent = Trainer(
    model=clf_intent,
    args=clf_args,
    train_dataset=train_intent,
    eval_dataset=test_intent,
    tokenizer=clf_tokenizer,
    data_collator=data_collator,
)

trainer_intent.train()
intent_eval = trainer_intent.evaluate()
print("Intent eval: ", intent_eval)

# emotion classifier
clf_emotion = AutoModelForSequenceClassification.from_pretrained(
    CFG.clf_model_name, num_labels=len(emotion_label2id)
)

trainer_emotion = Trainer(
    model=clf_intent,
    args=clf_args,
    train_dataset=train_emotion,
    eval_dataset=test_emotion,
    tokenizer=clf_tokenizer,
    data_collator=data_collator,
)

trainer_emotion.train()
emotion_eval = trainer_emotion.evaluate()
print("Emotion eval: ", emotion_eval)

# prediction
with torch.no_grad():
    def predict_labels(ds, model) -> List[int]:
        pred = []
        for i in range(len(ds)):
            item = ds[i]
            inputs = {k: v.unsqueeze(0) for k, v in item.items() if k in [
                'input_ids', 'attention_mask', 'token_type_ids'] and v is not None}
            logits = model(**inputs).logits
            pred.append(int(logits.argmax(dim=1).item()))
        return pred
    y_true_intent = [int(test_df.iloc[i]['intent_label'])
                     for i in range(len(test_df))]
    y_pred_intent = predict_labels(test_intent, clf_intent)
    y_true_emotion = [int(test_df.iloc[i]['emotion_label'])
                      for i in range(len(test_df))]
    y_pred_emotion = predict_labels(test_emotion, clf_emotion)

print("\nIntent classification report:\n", classification_report(y_true_intent, y_pred_intent,
      target_names=[intent_id2label[i] for i in range(len(intent_id2label))], digits=3))
print("\nEmotion classification report:\n", classification_report(y_true_emotion, y_pred_emotion,
      target_names=[emotion_id2label[i] for i in range(len(emotion_id2label))], digits=3))


print("Intent confusion matrix:\n",
      confusion_matrix(y_true_intent, y_pred_intent))
print("Emotion confusion matrix:\n", confusion_matrix(
    y_true_emotion, y_pred_emotion))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_intent = Trainer(


Step,Training Loss
50,0.1369
100,0.0007
150,0.0004
200,0.0003


Intent eval:  {'eval_loss': 0.0002452132466714829, 'eval_runtime': 0.1236, 'eval_samples_per_second': 970.808, 'eval_steps_per_second': 121.351, 'epoch': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_emotion = Trainer(


Step,Training Loss
50,0.7136
100,0.0014
150,0.0008
200,0.0007


Emotion eval:  {'eval_loss': 0.0004573689366225153, 'eval_runtime': 0.1323, 'eval_samples_per_second': 907.025, 'eval_steps_per_second': 113.378, 'epoch': 4.0}


RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)