This is a performance test on an adolescent dataset using two models in a dual-model setup.

The first model, ***BERT***, is trained to classify the emotion and the intent of the user, and the second model, ***T5***, generates text in response to the user's questions.

The datasets used here -> **casual_lm_train.jsonl** and **masked_lm_train.jsonl**

*casual_lm_train* is used for training **T5**, and *masked_lm_train* is used for training **BERT**.

*T5*'s performance is evaluated by its perplexity, i.e. how certain it is of its responses or, in other words, how well it predicts the next word.

*BERT*'s performance is evaluated by its accuracy in predicting the emotion and intent of the questions.

In [None]:
import os
import re
import json
import random
import math
from dataclasses import dataclass

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset as TorchDataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from transformers import DataCollatorForSeq2Seq

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import evaluate
import nltk
# Make sure the NLTK "punkt" tokenizer data is present; fetch it quietly if not.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# Text-generation metrics. The metric id is spelled "rouge"; the previous
# id 'rogue' does not name an existing metric. The variable keeps its
# original name so any later cell that references `rogue_metric` still works.
rogue_metric = evaluate.load('rouge')
bleu_metric = evaluate.load('bleu')

# Seed the Python, NumPy and PyTorch random number generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

**Config**

In [None]:
@dataclass
class Config:
    """Central settings for data paths, label sets, models and training.

    Every setting is an annotated dataclass field, so individual values can be
    overridden at construction time (e.g. ``Config(batch_size=16)``) while
    ``Config()`` still yields the defaults below. Without annotations
    ``@dataclass`` would generate no fields and reject keyword overrides.
    """

    # Input dataset (JSONL) and the working directory.
    jsonl_path: str = "/home/ghost-ed/Documents/Model_Tuning/adolescent_chatbot_train.jsonl"
    work_dir: str = "/home/ghost-ed/Documents/Model_Tuning"

    # label space (tuples are immutable, so they are safe as field defaults)
    intents: tuple = (
        "seek_help", "venting", "ask_question", "share_success", "neutral"
    )
    emotions: tuple = (
        "anxious", "sad", "angry", "lonely", "neutral"
    )

    # Model identifiers for the classifier and the generator
    # (presumably Hugging Face hub names -- confirm against the loading code).
    clf_model_name: str = "bert-base-uncased"
    gen_model_name: str = "t5-small"

    # Training hyperparameters.
    clf_epochs: int = 4
    gen_epochs: int = 4
    batch_size: int = 8
    lr: float = 5e-5
    weight_decay: float = 0.01

    # generation config
    max_new_token: int = 96
    do_sample: bool = True
    temperature: float = 0.9
    top_p: float = 0.92
    repetition_penalty: float = 1.08


# Shared configuration instance built from the defaults declared on Config.
CFG = Config()

# Create the working directory up front; an already-existing one is fine.
os.makedirs(name=CFG.work_dir, exist_ok=True)

**Dataset loading and heuristic labeling**

In [None]:
def load_jsonl_messages(path) -> pd.DataFrame:
    """Load (user, assistant) text pairs from a chat-style JSONL file.

    Each non-blank line is parsed as a JSON object holding a ``messages`` list
    of ``{"role": ..., "content": ...}`` entries. The first ``user`` message
    and the first ``assistant`` message of a record form one output row.
    Lines that are blank, are not valid JSON, are not JSON objects, or lack a
    non-empty user or assistant message are skipped.

    Args:
        path: Location of the JSONL file; it is read as UTF-8.

    Returns:
        A DataFrame with the columns ``text`` (user message) and ``response``
        (assistant message). The columns are present even when no row was
        found, so column access on the result does not fail for empty input.
    """
    rows = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A valid JSON line may still be a list/scalar; only objects
            # can carry a "messages" key.
            if not isinstance(obj, dict):
                continue
            user = None
            assistant = None
            for m in obj.get('messages') or []:
                if not isinstance(m, dict):
                    continue
                content = m.get('content')
                # Non-string content (e.g. null) is treated as empty text.
                text = content.strip() if isinstance(content, str) else ''
                role = m.get('role')
                # Keep only the first message of each role. The earlier
                # `is not None` checks were inverted and never assigned.
                if role == 'user' and user is None:
                    user = text
                elif role == 'assistant' and assistant is None:
                    assistant = text
            if user and assistant:
                rows.append({"text": user, "response": assistant})
    return pd.DataFrame(rows, columns=["text", "response"])


# Intent/emotion heuristics.
# Each table maps a label to a list of regular expressions. Key order is
# significant: first_match() walks the dict in insertion order and returns
# the first label that has a matching pattern.
INTENT_PATTERNS = {
    "seek_help": [
        r"help",
        r"what should i",
        r"how do i",
        r"can you",
        r"advice",
    ],
    "venting": [
        r"no one",
        r"nobody",
        r"so tired",
        r"fed up",
        r"i hate",
    ],
    "ask_question": [
        r"\?$",
        r"why ",
        r"what ",
        r"how ",
        r"when ",
        r"where ",
    ],
    "share_success": [
        r"i did (it|well)",
        r"i feel proud",
        r"happy to say",
    ],
}


# Emotion labels; each label uses a single alternation pattern.
EMOTION_PATTERNS = {
    "anxious": [r"anxious|nervous|worried|on edge|overthink"],
    "sad": [r"sad|down|depress|worthless|cry"],
    "angry": [r"angry|mad|furious|annoy|irritat"],
    "lonely": [r"alone|lonely|isolated|no one"],
}


# Phrases grouped by crisis category.
CRISIS_PATTERNS = {
    "self_harm": [
        r"kill myself",
        r"suicide",
        r"end my life",
        r"hurt myself",
    ],
    "abuse": [
        r"abused",
        r"violence at home",
        r"beaten",
        r"forced",
    ],
    "harm_others": [
        r"hurt someone",
        r"kill them",
        r"revenge",
    ],
}


def first_match(text, patterns, default="neutral") -> str:
    """Return the first label in *patterns* whose regexes hit *text*.

    The text is lower-cased before searching. Labels are tried in the
    mapping's iteration order, and a label wins as soon as any one of its
    patterns is found anywhere in the text.

    Args:
        text: String to classify.
        patterns: Mapping of label -> iterable of regular-expression strings.
        default: Value returned when no pattern matches.

    Returns:
        The winning label, or *default* if nothing matched.
    """
    lowered = text.lower()
    hits = (
        label
        for label, regexes in patterns.items()
        if any(re.search(rx, lowered) for rx in regexes)
    )
    return next(hits, default)

# synonyms for diversity
# Lower-case word -> candidate replacements used by simple_augment().
SYNONYMS = {
    'exam': ['test', 'assessment', 'paper'],
    'school': ['class', 'college', 'campus'],
    'anxious': ['nervous', 'worried', 'on edge'],
    'angry': ['upset', 'frustrated', 'mad'],
    'sad': ['down', 'blue', 'low'],
    'friend': ['peer', 'classmate', 'buddy'],
    'parents': ['family', 'mom and dad', 'guardians'],
}

def simple_augment(text, p=0.25, max_replacement=2) -> str:
    """Randomly swap known words in *text* for a synonym.

    The text is split into word runs and single non-word characters, so
    joining the tokens reproduces the original spacing and punctuation.
    A word whose lower-cased form is a key of ``SYNONYMS`` is replaced by a
    randomly chosen synonym with probability *p*, until *max_replacement*
    replacements have been made. The replacement is inserted as listed in
    ``SYNONYMS``; the original word's capitalisation is not carried over.

    Args:
        text: Input string.
        p: Per-candidate replacement probability.
        max_replacement: Upper bound on replacements per call.

    Returns:
        The augmented string (identical to *text* when nothing was replaced).
    """
    tokens = re.findall(r"\w+|\W", text)
    replaced = 0
    for i, tok in enumerate(tokens):
        low = tok.lower()
        # The counter was previously misspelled here, which raised NameError
        # as soon as the first replacement was attempted.
        if low in SYNONYMS and random.random() < p and replaced < max_replacement:
            tokens[i] = random.choice(SYNONYMS[low])
            replaced += 1
    return ''.join(tokens)

: 