In [None]:
import torch

# Report which PyTorch build this kernel is running
print("PyTorch version:", torch.__version__)

# Describe the CUDA runtime and the first GPU, or say that none is visible
if not torch.cuda.is_available():
    print("CUDA is not available")
else:
    print("CUDA is available")
    print("CUDA version:", torch.version.cuda)
    print("GPU device name:", torch.cuda.get_device_name(0))

In [None]:
!pip install "unsloth[cu126-torch280]"

In [None]:
!pip install rouge_score
!pip install bert_score
!pip install sacrebleu
!pip install evaluate

In [None]:
import re
import torch
import random
import pandas as pd
import numpy as np
from datasets import Dataset, Features, Sequence, Value, DatasetDict
import torch.nn.functional as F
from unsloth import FastLanguageModel
from trl import SFTConfig, SFTTrainer
import unittest
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import f1_score
import gc
import json
from collections import deque
from tqdm.auto import tqdm
import os
from transformers import EarlyStoppingCallback

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Data Setup

In [None]:
import pandas as pd, re, ast
from tqdm.auto import tqdm
from typing import List, Tuple
from datasets import Dataset

# ───────────────────── helpers ────────────────────────────────────
# re.S lets the body group span newlines, so multi-line messages are kept whole
# instead of being cut off after their first line.
ROLE_PATTERN = re.compile(r"^(customer|assistant|klant|medewerker):\s*(.*)", re.I | re.S)

def split_role_text(msg: str) -> Tuple[str, str]:
    """Split a raw ``"<role>: <text>"`` message into a chat role and prefixed text.

    Parameters
    ----------
    msg : the raw message; it is converted with ``str()`` and stripped first.

    Returns
    -------
    (role, text) where ``role`` is ``"user"`` for customer/klant prefixes and
    ``"assistant"`` otherwise, and ``text`` is the stripped body re-prefixed
    with ``"klant: "`` or ``"medewerker: "`` respectively.

    Raises
    ------
    ValueError : when the message does not start with one of the known
        role prefixes (matched case-insensitively).
    """
    m = ROLE_PATTERN.match(str(msg).strip())
    if m is None:
        raise ValueError(f"Bad role prefix in: {msg!r}")
    role_raw, body = m.groups()
    role     = "user" if role_raw.lower() in ("customer", "klant") else "assistant"
    visible  = "klant" if role == "user" else "medewerker"
    return role, f"{visible}: {body.strip()}"

def _collapse_with_idx(df: pd.DataFrame, col: str):
    cols = ["role", col, "intent"]
    out, cur, buf_t, buf_i, last = [], None, [], [], None
    for idx, role, text, intent in df[cols].itertuples(index=True, name=None):
        if role != cur and buf_t:
            out.append((cur, buf_t, buf_i, last))
            buf_t, buf_i = [], []
        cur, last = role, idx
        buf_t.append(str(text))
        buf_i.append("" if pd.isna(intent) else str(intent))
    if buf_t:
        out.append((cur, buf_t, buf_i, last))
    return out
# ───────────────────── main builder ───────────────────────────────
def build_instruction_dataframe(
    df: pd.DataFrame,
    tokenizer,
    system_prompt: str,
    max_seq_len: int = 2048,
    output_mode: str = "intent+response",
    use_rag: bool = False,
    rag_col: str = "retrieved_passages",
    rag_top_k: int = 3,
):
    """
    Turn a per-message conversation frame into chat-formatted training samples.

    One sample is produced per assistant turn (consecutive rows of the same
    role are merged into one turn): the prompt is the conversation so far and
    the target is that assistant turn.

    • No separate system message is emitted.
    • system_prompt is prepended to the FIRST user message, or put in a
      synthetic user stub if the assistant speaks first.
    • RAG passages (if any) are injected ABOVE the user turn they belong to,
      inside the same user message.
    • A sample whose tokenized length is >= max_seq_len is skipped, but its
      assistant turn is still added to the running history so that later
      prompts of the same conversation keep alternating user/assistant turns.
    • The history always stores the raw assistant text (without the
      "Intents:" header), also in "intent+response" mode.

    Parameters
    ----------
    df            : frame with the columns "conversation", "conversation_number",
                    "message_number" and "intent" (plus `rag_col` when RAG is used).
    tokenizer     : object exposing `apply_chat_template`.
    system_prompt : instruction text; an empty string disables the prefix.
    max_seq_len   : samples with this many tokens or more are dropped.
    output_mode   : "response" or "intent+response".
    use_rag       : when True, read passages from `rag_col`.
    rag_col       : column holding a list of passages (or its string repr).
    rag_top_k     : number of passages injected per user turn.

    Returns
    -------
    (dataset, frame): a `Dataset` with a "text" column holding the training
    texts, and a DataFrame joining the source row of each sample with the
    generated columns. Both are empty when no sample was produced.

    Raises
    ------
    ValueError : on an unknown `output_mode` (raised when the first assistant
                 turn is processed) or a message without a known role prefix.
    """
    dfc = df.copy().reset_index(drop=True)

    # Nothing to build; also avoids the two-column assignment below failing
    # on an empty frame.
    if dfc.empty:
        return Dataset.from_list([]), pd.DataFrame()

    # Passages read back from CSV arrive as the string repr of a list.
    if use_rag and rag_col in dfc.columns and isinstance(dfc[rag_col].iloc[0], str):
        dfc[rag_col] = dfc[rag_col].apply(
            lambda x: ast.literal_eval(x) if pd.notna(x) else []
        )

    dfc[["role", "prefixed_text"]] = pd.DataFrame(
        [split_role_text(m) for m in dfc["conversation"]], index=dfc.index
    )
    dfc = dfc.sort_values(["conversation_number", "message_number"])

    samples = []

    for _, group in tqdm(dfc.groupby("conversation_number", sort=False),
                         desc="Building dataset"):
        turns    = _collapse_with_idx(group, col="prefixed_text")
        history  : List[dict] = []
        sys_done = False

        for role, texts, intents, idx in turns:
            content = "\n".join(texts)

            # ---------------- USER TURN ----------------
            if role != "assistant":
                # inject system_prompt on the first user turn
                if not sys_done:
                    content = f"{system_prompt}\n\nConversation History:\n{content}" if system_prompt else content
                    sys_done = True

                # optional RAG injection; passages come from the last row of the turn
                if use_rag and rag_col in group.columns:
                    rag_passages = group.loc[idx, rag_col]
                    if isinstance(rag_passages, list) and rag_passages:
                        rag_txt = "\n\n".join(rag_passages[:rag_top_k])
                        content = (
                            "--- RELEVANT KNOWLEDGE ---\n" + rag_txt
                            + "\n\n--- END KNOWLEDGE ---\n\n" + content
                        )

                history.append({"role": "user", "content": content})
                continue

            # if assistant starts the conversation → create stub user w/ system
            if not sys_done:
                history.append({"role": "user", "content": system_prompt})
                sys_done = True

            # ------------- ASSISTANT TURN --------------
            intents_str = "\n".join(f"- {i}" for i in intents if i)
            if output_mode == "response":
                assistant_content = content
            elif output_mode == "intent+response":
                assistant_content = (
                    f"Intents:\n{intents_str}"
                    f"\nResponses:\n{content}"
                )
            else:
                raise ValueError(f"Bad output_mode: {output_mode}")

            full_msgs = history + [{"role": "assistant", "content": assistant_content}]

            # Over-long samples are dropped from the output only.
            if len(tokenizer.apply_chat_template(full_msgs)) < max_seq_len:
                samples.append(
                    {
                        "index": idx,
                        "training_text": tokenizer.apply_chat_template(
                            full_msgs, tokenize=False, add_generation_prompt=False
                        ),
                        # Rendered before this turn is appended to the history.
                        "prompt_text": tokenizer.apply_chat_template(
                            history, tokenize=False, add_generation_prompt=True
                        ),
                        "assistant_intents_only": "\n".join(i for i in intents if i),
                        "assistant_response_raw": content,
                    }
                )

            # Always record the turn, even when its sample was skipped; otherwise
            # later prompts would contain two user turns in a row.
            history.append({"role": "assistant", "content": content})

    if not samples:
        return Dataset.from_list([]), pd.DataFrame()

    df_out = pd.DataFrame(samples).set_index("index")
    df_out = dfc.loc[df_out.index].join(df_out)
    ds_out = Dataset.from_pandas(
        df_out[["training_text"]].rename(columns={"training_text": "text"})
    )

    return ds_out, df_out.reset_index(drop=True)


In [None]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
import gc
from sklearn.metrics import f1_score
import re
from typing import Tuple, List, Optional
from datasets import Dataset
from tqdm.auto import tqdm
import json
import evaluate

def split_conversations(df, conversation_col='conversation_number', val_ratio=0.1, method='sequential', seed=42):
    """Split a message frame into train/validation parts by whole conversations.

    Parameters
    ----------
    df               : frame with one row per message.
    conversation_col : column identifying the conversation of each row.
    val_ratio        : fraction of conversation ids sent to validation
                       (the train part gets ``int(n * (1 - val_ratio))`` ids).
    method           : 'sequential' (ids sorted ascending, the last ones become
                       validation) or 'random' (ids shuffled with `seed`).
    seed             : seed for the 'random' method.

    Returns
    -------
    (train_df, val_df), both with a fresh 0..n-1 index.

    Raises
    ------
    ValueError : for an unknown `method` or a `val_ratio` outside [0, 1].
    """
    conv_ids = df[conversation_col].unique()

    if method == 'random':
        # A private RandomState yields the same permutation as seeding the
        # global generator, without clobbering NumPy's global random state.
        conv_ids = np.random.RandomState(seed).permutation(conv_ids)
    elif method == 'sequential':
        conv_ids = sorted(conv_ids)  # assumes time-ordered
    else:
        raise ValueError("Method must be 'sequential' or 'random'")

    # Outside [0, 1] the slice index below turns negative or exceeds the
    # number of ids and silently yields a nonsensical split.
    if not 0 <= val_ratio <= 1:
        raise ValueError("val_ratio must be between 0 and 1")

    split_idx = int(len(conv_ids) * (1 - val_ratio))
    train_ids = conv_ids[:split_idx]
    val_ids = conv_ids[split_idx:]

    train_df = df[df[conversation_col].isin(train_ids)]
    val_df = df[df[conversation_col].isin(val_ids)]

    return train_df.reset_index(drop=True), val_df.reset_index(drop=True)

ROLE_RE = re.compile(r'^(customer|assistant|klant|medewerker):', re.I)

def filter_mono_role_convs(df: pd.DataFrame,
                           msg_col: str = "conversation",
                           id_col: str  = "conversation_number"
                          ) -> pd.DataFrame:
    """Keep only conversations in which both sides (user and assistant) speak.

    The role prefix of every message in `msg_col` is extracted
    (case-insensitively, ignoring surrounding whitespace) and mapped to its
    side: customer/klant -> user, assistant/medewerker -> assistant.
    Conversations (grouped by `id_col`) with fewer than two distinct sides are
    dropped, so a conversation using only "customer:" and "klant:" prefixes
    counts as mono-role. Messages without a known prefix contribute no side.

    Prints how many conversations were kept and dropped, and returns the
    filtered rows of `df` with their original columns and index.
    """
    side_of = {
        "customer": "user",
        "klant": "user",
        "assistant": "assistant",
        "medewerker": "assistant",
    }

    roles = df[msg_col].str.strip().str.extract(ROLE_RE, expand=False).str.lower()
    sides = roles.map(side_of)

    # Number of distinct sides per conversation; missing sides are not counted.
    sides_per_conv = df.assign(_side=sides).groupby(id_col)["_side"].nunique()
    valid_conv_ids = sides_per_conv[sides_per_conv >= 2].index

    df_filtered = df[df[id_col].isin(valid_conv_ids)]

    kept    = len(valid_conv_ids)
    dropped = sides_per_conv.size - kept
    print(f"kept {kept} conversations, dropped {dropped} (mono-role)")
    return df_filtered

def setup_data(path="data/sample_25k_train.csv", val_ratio=0.1):
    """Load the conversation CSV and return cleaned (train_df, val_df) frames.

    Steps:
      1. read the CSV and rename the "index" column to "message_number";
      2. drop the "Unnamed: 0" column when the file has one;
      3. remove every conversation that contains a message without an intent;
      4. split by conversation with `split_conversations` (default method);
      5. drop mono-role conversations from each split;
      6. sort both splits by conversation and message number.

    Parameters
    ----------
    path      : location of the CSV file.
    val_ratio : fraction of conversations used for validation.
    """
    df = pd.read_csv(path)
    df = df.rename(columns={"index": "message_number"})

    # pandas names a header-less CSV column "Unnamed: 0"; errors="ignore"
    # makes this a no-op when the file has no such column.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    df = df.sort_values(["conversation_number", "message_number"]).reset_index(drop=True)

    bad_conversations = df[df["intent"].isnull()]["conversation_number"].unique()
    df = df[~df["conversation_number"].isin(bad_conversations)]

    train_df, val_df = split_conversations(df.reset_index(drop=True), val_ratio=val_ratio)

    train_df = filter_mono_role_convs(train_df)
    val_df = filter_mono_role_convs(val_df)

    train_df = train_df.sort_values(["conversation_number", "message_number"]).reset_index(drop=True)
    val_df = val_df.sort_values(["conversation_number", "message_number"]).reset_index(drop=True)

    return train_df, val_df

In [None]:
# Load the toy dataset; val_ratio=0.3 sends roughly the last 30% of the
# (sorted) conversation ids to the validation split.
# Original note: no mono-role conversations are expected in this data
# (presumably "sample data" was meant by "same data" — confirm).
train_df, val_df = setup_data(path="/toy_conversations.csv", val_ratio=0.3)

In [None]:
def create_model(model_name = "unsloth/Qwen3-1.7B-bnb-4bit", max_seq_length = 2048):
    """Load a model/tokenizer pair through Unsloth with 4-bit loading enabled.

    `dtype=None` is passed through to `FastLanguageModel.from_pretrained`.
    The tokenizer is switched to left padding before the pair is returned.
    """
    loaded = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )
    model, tokenizer = loaded

    # Left padding is set here for every later use of the tokenizer.
    tokenizer.padding_side = "left"
    return model, tokenizer

In [None]:
# --- sequence / experiment switches ---
max_seq_len = 2048
phase, two_step = 2, False
add_intent_in_reply = False

# --- LoRA hyper-parameters ---
r, lora_alpha = 16, 32

# --- model identity ---
model_name = "Qwen3-1.7B-bnb-4bit"
model_url = f"unsloth/{model_name}"
run_name = f"basis_{model_name}"

# --- optimisation settings ---
per_device_train_batch_size = 16
per_device_eval_batch_size = 64
max_steps = 2000
learning_rate = 2e-4

In [None]:
# Use the configured sequence length (instead of a second hard-coded 2048) so
# the loaded model and the dataset builder share one limit.
model, tokenizer = create_model(model_name=model_url, max_seq_length=max_seq_len)

In [None]:
system_prompt = "You are a helpful assistant. Given the conversation, write the next assistant reply."

# Alternative two-step prompt (intents first, then responses), kept for reference:
# system_prompt = str('''You are an expert customer service assistant for our company. You must always follow a strict two-step process to reply.
#
# 1.  **Intents:** First, you must determine the correct set of response intents needed. Write these on a new line starting with 'Intents:'.
# 2.  **Responses:** After a blank line, you must write one or multiple full, polite, and helpful responses to the customer, starting with 'Responses:'.''')

# Both splits are built with the same settings: plain responses as targets and
# no RAG injection. With use_rag=False the builder does not read the
# retrieved-passages column, so rag_top_k has no effect here.
print("Building training set")
ds_train, df_train_f = build_instruction_dataframe(
    train_df,
    tokenizer,
    system_prompt,
    max_seq_len=max_seq_len,
    output_mode="response",
    use_rag=False,
    rag_top_k=3,
)

print("\nBuilding validation set")
ds_val, df_val_f = build_instruction_dataframe(
    val_df,
    tokenizer,
    system_prompt,
    max_seq_len=max_seq_len,
    output_mode="response",
    use_rag=False,
    rag_top_k=3,
)

Building training set


Building dataset:   0%|          | 0/18 [00:00<?, ?it/s]


Building validation set


Building dataset:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
def model_config(train_ds, valid_ds, model, tokenizer, max_seq_length = 2048,
                 per_device_train_batch_size = 4, per_device_eval_batch_size=4, max_steps = 2000, learning_rate = 2e-4,
                 gradient_accumulation_steps = 2, eval_steps = 1,
                 early_stopping_patience = 2, early_stopping_threshold = 0.005):
    """Build an `SFTTrainer` with step-wise evaluation and early stopping.

    Parameters
    ----------
    train_ds, valid_ds          : training / evaluation datasets.
    model, tokenizer            : model and tokenizer handed to the trainer.
    max_seq_length              : accepted for call-site compatibility; this
                                  function does not forward it to `SFTConfig`.
    per_device_train_batch_size,
    per_device_eval_batch_size,
    max_steps, learning_rate    : forwarded to `SFTConfig`.
    gradient_accumulation_steps : forwarded to `SFTConfig` (default 2).
    eval_steps                  : evaluate every this many steps (default 1,
                                  i.e. after every optimisation step).
    early_stopping_patience     : number of evaluations without improvement
                                  tolerated by the early-stopping callback.
    early_stopping_threshold    : minimum `eval_loss` improvement that counts.

    Returns
    -------
    The configured trainer; training is not started here.

    Notes
    -----
    `load_best_model_at_end` is not set, so (as the trainer's own warning in
    the run log says) the best checkpoint is not reloaded automatically when
    early stopping ends the run.
    """
    cfg = SFTConfig(

        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        max_steps = max_steps,
        learning_rate=learning_rate,
        gradient_accumulation_steps=gradient_accumulation_steps,

        warmup_ratio=0.03,
        lr_scheduler_type="cosine",

        eval_strategy="steps",
        eval_steps = eval_steps,

        remove_unused_columns=False,
        seed=42,

        # lower eval_loss is better; used by the early-stopping callback
        metric_for_best_model="eval_loss",
        report_to=[],
        greater_is_better=False
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        eval_dataset=valid_ds,
        args=cfg,
    )

    early_stop = EarlyStoppingCallback(
        early_stopping_patience = early_stopping_patience,
        early_stopping_threshold = early_stopping_threshold,
    )
    trainer.add_callback(early_stop)

    return trainer

def lora_config(model, r=16, lora_alpha=32):
    """Wrap `model` with LoRA adapters via Unsloth and return the result.

    Adapters target the attention projections (q/k/v/o) and the MLP
    projections (gate/up/down), with zero dropout, rank-stabilised LoRA
    enabled and Unsloth's gradient-checkpointing mode.
    """
    attention_layers = ["q_proj", "k_proj", "v_proj", "o_proj"]
    mlp_layers = ["gate_proj", "up_proj", "down_proj"]

    return FastLanguageModel.get_peft_model(
        model,
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=0,
        target_modules=attention_layers + mlp_layers,
        use_rslora=True,
        use_gradient_checkpointing="unsloth",
    )

In [None]:
# Attach the LoRA adapters, build the trainer and start training.
# NOTE: `model` is rebound to the value returned by lora_config, so this cell
# depends on `model` still being the freshly loaded base model.
model = lora_config(model, r=r, lora_alpha=lora_alpha)
# NOTE: max_steps is fixed at 10 here; the `max_steps` variable from the
# configuration cell is not used by this call.
trainer = model_config(ds_train, ds_val, model, tokenizer,
                       max_seq_length = max_seq_len,
                       per_device_train_batch_size = per_device_train_batch_size,
                       per_device_eval_batch_size=per_device_eval_batch_size,
                       max_steps = 10,
                       learning_rate = learning_rate)
trainer.train()

Unsloth 2025.8.10 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/88 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/28 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 88 | Num Epochs = 4 | Total steps = 10
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 2 x 1) = 32
 "-____-"     Trainable parameters = 17,432,576 of 1,738,007,552 (1.00% trained)
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
1,4.7921,5.152853
2,4.8398,5.152853
3,4.7335,5.152853


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=3, training_loss=4.788473765055339, metrics={'train_runtime': 36.5916, 'train_samples_per_second': 8.745, 'train_steps_per_second': 0.273, 'total_flos': 245462590291968.0, 'train_loss': 4.788473765055339})

In [None]:
def _infer_single_batch(
    batch_df: pd.DataFrame,
    model,
    tokenizer,
    *,
    max_new_tokens: int = 20,
    phase: int = 2
):
    """
    Run text generation on a single dataframe batch and return a list of dicts with predictions.
    """
    # This part is unchanged
    tokenizer.padding_side = "left"
    inputs = tokenizer(
        batch_df["prompt_text"].tolist(),
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    # This part is unchanged
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.9,
            top_p=0.95,
            top_k=20,
        )

    # This part is unchanged
    decoded = tokenizer.batch_decode(
        outputs,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    records = []
    # --- THIS IS THE ONLY CHANGE ---
    for idx, full_decoded_string in enumerate(decoded):

        # Your original logic for getting the 'prediction' to be parsed
        prediction_for_parsing = full_decoded_string.split("\nassistant\n")[-1]

        if phase == 1:
            # Your original phase 1 logic, but we add the new raw column
            record = {
                "conversation_number": batch_df.iloc[idx]["conversation_number"],
                "input_text":          batch_df.iloc[idx]["input_text"],
                "prompt_text":         batch_df.iloc[idx]["prompt_text"],
                "current_user_query":  batch_df.iloc[idx]["text"],
                "target_intent":       batch_df.iloc[idx]["intent"],
                "generated_intent":    prediction_for_parsing.strip(),
                "generated_response_raw": full_decoded_string.strip() # The new raw column
            }
            records.append(record)
        else:  # phase == 2
            # Your original phase 2 logic, but we add the new raw column
            record = {
                "conversation_number":  batch_df.iloc[idx]["conversation_number"],
                "text":                 batch_df.iloc[idx]["text"],
                "input_text":           batch_df.iloc[idx]["input_text"],
                "prompt_text":          batch_df.iloc[idx]["prompt_text"],
                "current_user_query":   batch_df.iloc[idx]["current_user_query"],
                "target_intent":       batch_df.iloc[idx]["intent"],
                "target_response":      batch_df.iloc[idx]["assistant_raw"],
                "target_response_int":  batch_df.iloc[idx]["assistant_with_int"],
                "generated_response":   prediction_for_parsing.strip(),
                "generated_response_raw": full_decoded_string.strip() # The new raw column
            }
            records.append(record)

    return records

def run_batched_inference(
    df: pd.DataFrame,
    model,
    tokenizer,
    *,
    batch_size: int = 4,
    max_new_tokens: int = 20,
    phase: int = 2,
    show_progress: bool = True,
) -> pd.DataFrame:
    """Generate predictions for every row of `df`, `batch_size` rows at a time.

    The frame is re-indexed from zero, cut into consecutive slices and each
    slice is handed to `_infer_single_batch`; the per-row records of all slices
    are concatenated into one DataFrame. With `show_progress` a tqdm bar
    labelled "Generating" tracks the batches.
    """
    frame = df.reset_index(drop=True)
    n_rows = len(frame)

    batch_starts = range(0, n_rows, batch_size)
    if show_progress:
        batch_starts = tqdm(batch_starts, desc="Generating")

    collected = []
    for lo in batch_starts:
        hi = min(lo + batch_size, n_rows)
        collected += _infer_single_batch(
            frame.iloc[lo:hi],
            model,
            tokenizer,
            max_new_tokens=max_new_tokens,
            phase=phase,
        )

    return pd.DataFrame(collected)

def strip_think_blocks(text: str) -> str:
    """Remove <think> reasoning markup from a generated string.

    Non-string values are returned unchanged. For strings, complete
    <think>...</think> blocks are deleted, leftover opening/closing tags are
    removed, runs of newlines are collapsed to a single newline and the result
    is stripped of surrounding whitespace.
    """
    if isinstance(text, str):
        # (pattern, replacement, flags), applied in this order
        substitutions = (
            (r"<think>.*?</think>", "", re.DOTALL),
            (r"</?think>", "", 0),
            (r"\n+", "\n", 0),
        )
        for pattern, replacement, flags in substitutions:
            text = re.sub(pattern, replacement, text, flags=flags)
        return text.strip()
    return text



In [None]:
# Shape the validation frame into the columns `_infer_single_batch` reads.
# The builder output still carries the per-message `intent` column of the raw
# data; renaming `assistant_intents_only` to `intent` on top of it would leave
# two columns with that name (each row lookup then returns a two-entry Series
# instead of a string), so the raw column is dropped first. Guarding on the
# builder column keeps this cell safe to re-run.
if "assistant_intents_only" in df_val_f.columns:
    df_val_f = (
        df_val_f
        .drop(columns=["intent", "assistant_with_int"], errors="ignore")
        .rename(columns={
            "assistant_intents_only": "intent",
            "assistant_response_raw": "assistant_raw",
            "assistant_response_structured": "assistant_with_int",
        })
    )

# Placeholder only when the builder produced no structured response column.
if "assistant_with_int" not in df_val_f.columns:
    df_val_f = df_val_f.assign(assistant_with_int="")

df_val_f = df_val_f.assign(
    text="",
    input_text=df_val_f["training_text"],
    current_user_query="",
)

output = run_batched_inference(
    df=df_val_f,
    model=model,
    tokenizer=tokenizer,
    batch_size=10,
    max_new_tokens=2048
)
output["generated_response"] = output["generated_response"].apply(strip_think_blocks)


Generating:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import evaluate
import numpy as np
import pandas as pd
from typing import Tuple

def create_response_metrics(
    pred_df: pd.DataFrame,
    *,
    gen_col: str = "pred_reply",
    tgt_col: str = "target_response",
    bert_lang: str = "nl",
    bert_model: str = "xlm-roberta-large",
) -> Tuple[dict, pd.DataFrame]:
    """
    Compute metrics on the FULL dataset only, with **BERTScore only** for embeddings.

    Parameters
    ----------
    pred_df    : frame holding generated and reference texts.
    gen_col    : column with the generated text.
    tgt_col    : column with the reference text.
    bert_lang  : `lang` argument passed to BERTScore.
    bert_model : `model_type` argument passed to BERTScore.

    Returns
    -------
    (metrics_dict, enriched_eval_df). Rows whose generated or reference text is
    missing or empty are excluded. The frame has the columns "gen" and "tgt"
    plus per-row BERTScore f1/precision/recall. When no valid pair remains the
    result is ``({"error": "No valid prediction/target pairs."}, empty frame)``
    and no metric is loaded.

    NOTE(review): the sacrebleu score is presumably on a 0–100 scale while the
    other metrics are on 0–1 — confirm before comparing them.
    """
    # Build a clean eval frame
    eval_df = pd.DataFrame({
        "gen": pred_df[gen_col].fillna("").astype(str),
        "tgt": pred_df[tgt_col].fillna("").astype(str),
    })
    has_both = (eval_df["gen"].str.len() > 0) & (eval_df["tgt"].str.len() > 0)
    # Explicit copy: the score columns added below are then written to an
    # independent frame rather than to a slice of the unfiltered one.
    eval_df = eval_df[has_both].copy()
    if eval_df.empty:
        return {"error": "No valid prediction/target pairs."}, eval_df

    # Load metrics
    rouge  = evaluate.load("rouge")
    bleu   = evaluate.load("sacrebleu")
    meteor = evaluate.load("meteor")
    bert   = evaluate.load("bertscore")

    gen, tgt = eval_df["gen"].tolist(), eval_df["tgt"].tolist()

    # Text overlap metrics
    rouge_scores  = rouge.compute(predictions=gen, references=tgt)
    bleu_scores   = bleu.compute(predictions=gen, references=[[t] for t in tgt])
    meteor_scores = meteor.compute(predictions=gen, references=tgt)

    metrics = {
        "bleu_all": bleu_scores["score"],
        "meteor_all": meteor_scores["meteor"],
        **{f"{k}_all": v for k, v in rouge_scores.items()},
    }

    # BERTScore (only embedding-based metric kept)
    bs = bert.compute(predictions=gen, references=tgt, lang=bert_lang, model_type=bert_model)
    for k in ["f1", "precision", "recall"]:
        arr = np.asarray(bs[k], dtype=float)
        metrics[f"bertscore_{k}_all"]     = arr.mean()
        metrics[f"bertscore_{k}_std_all"] = arr.std(ddof=0)

    # Attach per-row BERTScore once
    eval_df["bertscore_f1"]        = bs["f1"]
    eval_df["bertscore_precision"] = bs["precision"]
    eval_df["bertscore_recall"]    = bs["recall"]

    return metrics, eval_df

In [None]:
# Strip <think> blocks from the generated responses (the inference cell already
# applies the same step to this column), then display the prediction frame.
output["generated_response"] = output["generated_response"].apply(strip_think_blocks)
output

Unnamed: 0,conversation_number,text,input_text,prompt_text,current_user_query,target_intent,target_response,target_response_int,generated_response,generated_response_raw
0,18,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent clarify_preference intent clarify...,"medewerker: Sure, I can help. Do you want to u...",,"Sure, let me help you unsubscribe from the new...",user\nYou are a helpful assistant. Given the c...
1,18,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent confirm_action intent confirm_act...,medewerker: Understood. I’ve updated your emai...,,"Sure, I'll help you unsubscribe from the promo...",user\nYou are a helpful assistant. Given the c...
2,18,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent close_conversation intent close_c...,medewerker: Anytime! Your preference is saved.,,You're welcome! Let me know if you need anythi...,user\nYou are a helpful assistant. Given the c...
3,19,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent request_receipt_number intent req...,medewerker: Sorry about that. Could you share ...,,I'm sorry to hear that. Let me check the statu...,user\nYou are a helpful assistant. Given the c...
4,19,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent offer_solution intent...,medewerker: Thanks. I checked and the points d...,,I'll check the system for you. Could you pleas...,user\nYou are a helpful assistant. Given the c...
5,19,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent confirm_action intent confirm_act...,medewerker: Done. The points are now visible i...,,You're welcome! Let me know if there's anythin...,user\nYou are a helpful assistant. Given the c...
6,19,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent close_conversation intent close_c...,medewerker: Glad to help!,,You're very welcome! If you have any more ques...,user\nYou are a helpful assistant. Given the c...
7,20,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent provide_instructions intent provi...,medewerker: Understood. You can change it in S...,,Of course! I'll switch the app language back t...,user\nYou are a helpful assistant. Given the c...
8,20,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent offer_additional_help intent offe...,medewerker: Great to hear! Anything else I can...,,You're welcome! If you need any further assist...,user\nYou are a helpful assistant. Given the c...
9,20,,<|im_start|>user\nYou are a helpful assistant....,<|im_start|>user\nYou are a helpful assistant....,,intent close_conversation intent close_c...,"medewerker: Alright, have a good day!",,You're welcome! If you need any further assist...,user\nYou are a helpful assistant. Given the c...


In [None]:
metrics, eval_df = create_response_metrics(
    output,
    gen_col="generated_response",   # model predictions
    tgt_col="target_response",      # gold responses
    bert_lang="en",                 # or "nl" if Dutch
    bert_model="xlm-roberta-large"
)

print("Aggregate metrics:")
for k, v in metrics.items():
    # When no valid pairs exist the dict holds an "error" message (a string),
    # which cannot be rendered with a float format.
    print(f"{k}: {v}" if isinstance(v, str) else f"{k}: {v:.4f}")

print("\nEval DF with per-row BERTScore:")
print(eval_df.head())


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Aggregate metrics:
bleu_all: 0.6993
meteor_all: 0.2272
rouge1_all: 0.1813
rouge2_all: 0.0340
rougeL_all: 0.1503
rougeLsum_all: 0.1521
bertscore_f1_all: 0.8840
bertscore_f1_std_all: 0.0130
bertscore_precision_all: 0.8681
bertscore_precision_std_all: 0.0207
bertscore_recall_all: 0.9008
bertscore_recall_std_all: 0.0151

Eval DF with per-row BERTScore:
                                                 gen  \
0  Sure, let me help you unsubscribe from the new...   
1  Sure, I'll help you unsubscribe from the promo...   
2  You're welcome! Let me know if you need anythi...   
3  I'm sorry to hear that. Let me check the statu...   
4  I'll check the system for you. Could you pleas...   

                                                 tgt  bertscore_f1  \
0  medewerker: Sure, I can help. Do you want to u...      0.905032   
1  medewerker: Understood. I’ve updated your emai...      0.891434   
2     medewerker: Anytime! Your preference is saved.      0.875368   
3  medewerker: Sorry about that.