In [103]:
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch

import json
import os
from datasets import Dataset
from transformers import AutoTokenizer

import random
from typing import List, Tuple
from unsloth import is_bfloat16_supported

from transformers import TextStreamer

from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from transformers import TextStreamer
from unsloth.chat_templates import train_on_responses_only

In [307]:
def train_test_split(member: str, test_size: float = 0.2, seed: int = 42, data_path: str = '/work/users/s/m/smerrill/Albemarle/dataset') -> Tuple[List[dict], List[dict]]:
    """
    Splits the dataset into training and test sets. Synthetic data is always added to the training set.

    Parameters:
    - member: The name identifier for the board member.
    - test_size: Proportion of the real (non-synthetic) data to include in the test split.
    - seed: Random seed for reproducibility.
    - data_path: Base directory for the dataset files.

    Returns:
    - A tuple (train_data, test_data)
    """
    real_data, synth_data = [], []

    if member == 'acuff':
        real_data = load_chat_dataset(os.path.join(data_path, 'kateacuff.txt'))
        synth_data = load_chat_dataset(os.path.join(data_path, 'synth_kateacuff.txt'))
    elif member == 'osborne':
        real_data = load_chat_dataset(os.path.join(data_path, 'ellenosborne.txt'))
        synth_data = load_chat_dataset(os.path.join(data_path, 'synth_ellenosborne.txt'))
    elif member == 'paige':
        real_data = load_chat_dataset(os.path.join(data_path, 'grahampaige.txt'))
        synth_data = load_chat_dataset(os.path.join(data_path, 'synth_grahampaige.txt'))
    elif member == 'le':
        real_data = load_chat_dataset(os.path.join(data_path, 'judyle.txt'))
        synth_data = load_chat_dataset(os.path.join(data_path, 'synth_judyle.txt'))
    elif member == 'callsen':
        real_data = load_chat_dataset(os.path.join(data_path, 'katrinacallsen.txt'))
    elif member == 'oberg':
        real_data = load_chat_dataset(os.path.join(data_path, 'davidoberg.txt'))
    elif member == 'alcaro':
        real_data = load_chat_dataset(os.path.join(data_path, 'jonnoalcaro.txt'))
    else:
        raise ValueError(f"Unknown member: {member}")

    if not 0 < test_size < 1:
        raise ValueError("test_size must be a float between 0 and 1.")

    # Shuffle and split only the real data
    random.seed(seed)
    shuffled_real = real_data.copy()
    random.shuffle(shuffled_real)

    split_index = int(len(shuffled_real) * (1 - test_size))
    train_data = shuffled_real[:split_index] + synth_data
    test_data = shuffled_real[split_index:]

    return train_data, test_data


def load_chat_dataset(input_path):
    """
    Load a chat-style message dataset from a JSON or JSONL file.
    
    Returns:
        data (list): A list of message dictionaries.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        if input_path.endswith(".jsonl"):
            data = [json.loads(line) for line in f]
        else:  # Assume .json
            data = json.load(f)
    
    print(f"Dataset loaded from: {input_path}")
    print(f"Total examples: {len(data)}")
    return data

def convert_to_chat_format(data):
    result = []
    for item in data:
        result.append({
            "role": "user",
            "content": item['prompt']
        })
        result.append({
            "role": "assistant",
            "content": item['response']
        })
    return result

def combine_conversations(dataset):
    # Combine role and content into a list of dicts (conversation format)
    conversations = []
    # Assuming each row corresponds to a single message, group by conversation if possible
    # If it's just single messages in sequence, we can pair messages by user-assistant or whatever fits your logic
    # Here, I'll assume consecutive rows alternate roles and should be paired into conversations

    new_data = []
    i = 0
    while i < len(dataset):
        # Take two rows at a time: user + assistant
        if i + 1 < len(dataset):
            convo = [
                #{"role": 'system', "content": "A user will pass you dialog history for a conversation.  Your job is to predict the next response by Graham Paige by filling in the response after grahampaige:"},
                {"role": dataset[i]['role'], "content": dataset[i]['content']},
                {"role": dataset[i+1]['role'], "content": dataset[i+1]['content']}
            ]
            new_data.append({"conversations": convo})
            i += 2
        else:
            # If odd number of rows, last one alone
            convo = [{"role": dataset[i]['role'], "content": dataset[i]['content']}]
            new_data.append({"conversations": convo})
            i += 1

    return Dataset.from_list(new_data)


# Function to replace the system message in the text
def replace_system_message(example):
    parts = example["text"].split("<|eot_id|>", 1)
    if len(parts) == 2:
        # Replace system block only if the format is correct
        new_text = custom_system_message + parts[1]
    else:
        new_text = example["text"]  # Fallback: leave unchanged
    return {"text": new_text}

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix(tokenizer.bos_token) for convo in convos]
    return { "text" : texts, }
pass

In [323]:
train_data, test_data = train_test_split('paige')

train_data = convert_to_chat_format(train_data)
test_data = convert_to_chat_format(test_data)

train_data = combine_conversations(train_data)
test_data = combine_conversations(test_data)

train_data = train_data.map(formatting_prompts_func, batched = True,)
test_data = test_data.map(formatting_prompts_func, batched = True,)

Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/grahampaige.txt
Total examples: 896
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/synth_grahampaige.txt
Total examples: 50


Map: 100%|██████████| 766/766 [00:00<00:00, 13936.02 examples/s]
Map: 100%|██████████| 180/180 [00:00<00:00, 11231.90 examples/s]


In [324]:
agent_name = "Graham Paige"
agent_name_lower = agent_name.replace(' ', '').lower()

message = """You are {agent_name}. Fill in the next response to the conversation by continuing the dialogue naturally. Only write what {agent_name} would say next — do not write for other speakers.

Example:
user:  
{agent_name_lower}: Good evening, everyone. Let's get started with the agenda.  
Speaker_1: Thanks, {agent_name}. I just had a quick question about the minutes from last time.  
assistant:  
{agent_name_lower}: Sure, go ahead with your question.""".format(
    agent_name=agent_name, agent_name_lower=agent_name_lower
)

# Define your custom system message
custom_system_message = f"<|start_header_id|>system<|end_header_id|>\n\n{message}\n<|eot_id|>"
train_data = train_data.map(replace_system_message)
test_data = test_data.map(replace_system_message)

Map: 100%|██████████| 766/766 [00:00<00:00, 16750.54 examples/s]
Map: 100%|██████████| 180/180 [00:00<00:00, 17192.90 examples/s]


In [326]:
test_data['text'][0]

"<|start_header_id|>system<|end_header_id|>\n\nYou are Graham Paige. Fill in the next response to the conversation by continuing the dialogue naturally. Only write what Graham Paige would say next — do not write for other speakers.\n\nExample:\nuser:  \ngrahampaige: Good evening, everyone. Let's get started with the agenda.  \nSpeaker_1: Thanks, Graham Paige. I just had a quick question about the minutes from last time.  \nassistant:  \ngrahampaige: Sure, go ahead with your question.\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSpeaker_2: OK. I'm interested in that. Are you? Yes, I mean, if you want to be back on it. I mean, it's three years, so who knows? But I mean, I don't know if that's a concern for these appointments.\nSpeaker_2: OK. So Ms. Lee, are you volunteering for that?\nSpeaker_2: I would like to volunteer for that.\nSpeaker_2: OK.\ngrahampaige: Are there any other volunteers for the three-year term on the KTEC board? If not, thank you very much, Ms. Lee, for ste

### Model

In [314]:
max_seq_length = 2048
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 4, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-PCIE-40GB. Num GPUs = 1. Max memory: 39.495 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu118. CUDA: 8.0. CUDA Toolkit: 11.8. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [315]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_data,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 1e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.15,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n"
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 766/766 [00:01<00:00, 456.97 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Map (num_proc=128): 100%|██████████| 766/766 [00:02<00:00, 264.68 examples/s]


In [316]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are Graham Paige. Fill in the next response to the conversation by continuing the dialogue naturally. Only write what Graham Paige would say next — do not write for other speakers.\n\nExample:\nuser:  \ngrahampaige: Good evening, everyone. Let's get started with the agenda.  \nSpeaker_1: Thanks, Graham Paige. I just had a quick question about the minutes from last time.  \nassistant:  \ngrahampaige: Sure, go ahead with your question.\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\ngrahampaige: We now have a break. It's about 805, no 807, so we'll break until 817. Okay.\nSpeaker_1: Hi, Mr. Page.\ngrahampaige: Hey, Dr. Heston, how are you? Doing well. Good, good. OK, it is now 8.17, and we will proceed with our next item on the agenda. Anti-racism policy update, Dr. Bernard Harrison. Dr. Harrison.\nSpeaker_1: Ms. Johnson, shall I share my screen? Yes. Thank you, ma'am. You see a full screen?\nSpeaker_1: Yeah. Very

In [317]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             grahampaige: That's not appearing. The slide's a little bit off. OK, there it is. It's there now. OK.<|eot_id|>"

In [318]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 766 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 6,078,464/3,000,000,000 (0.20% trained)


Step,Training Loss
1,0.4283
2,0.2878
3,0.4046
4,0.5577
5,0.514
6,1.0037
7,0.4888
8,0.7305
9,0.6875
10,0.3346


## Single Evaluation

In [321]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Step 1: Apply chat template
messages = [train_data['conversations'][1][:-1]]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Required for generation
    return_tensors="pt",
).to("cuda")

# Step 2: Replace old system message (first 26 tokens) with new one
custom_prompt = "<|begin_of_text|>" + custom_system_message
custom_prompt_tokens = tokenizer.encode(custom_prompt, return_tensors="pt").to(inputs.device)

# Slice off the original system message (~first 26 tokens)
inputs = inputs[:, 26:]

# Prepend the new system prompt
inputs = torch.cat([custom_prompt_tokens, inputs], dim=1)

# Step 3: Append your speaker prefix if needed
new_tokens = torch.tensor(tokenizer.encode('grahampaige:')[1:], device=inputs.device).unsqueeze(0)
inputs = torch.cat([inputs, new_tokens], dim=1)

# Step 4: Generate
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1
)

 Ah, Leila Barnes from Albemarle High. Well, Leila, thank you for coming. And welcome to all our guests. Welcome, everyone! Now that we have our representatives, let's move on to the agenda...<|eot_id|>


In [322]:
# Get first example token IDs as a list
input_ids_list = inputs[0].tolist()

# Decode back to text
decoded_text = tokenizer.decode(input_ids_list)

print("Decoded Text:", decoded_text)

# Show tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids_list)
#print("Tokens:", tokens)

Decoded Text: <|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are Graham Paige. Fill in the next response to the conversation by continuing the dialogue naturally. Only write what Graham Paige would say next — do not write for other speakers.

Example:
user:  
grahampaige: Good evening, everyone. Let's get started with the agenda.  
Speaker_1: Thanks, Graham Paige. I just had a quick question about the minutes from last time.  
assistant:  
grahampaige: Sure, go ahead with your question.
<|eot_id|><|start_header_id|>user<|end_header_id|>

grahampaige: Mr. John O. Alcaro. John O. Alcaro at large. Ms. Colson is absent at this time. Ms. Judy Lee.
Speaker_1: Judy Lee, Rivanna District.
Speaker_3: Mr. Dave Oberg. Dave Oberg, Whitehall Magisterial District. Ms. Ellen Osborne.
Speaker_1: Ellen Osborne, Scottsville District.
grahampaige: And I'm Graham Page, Samuel Miller District. Our student rep, I don't think, is here at this time. Is she? I'm here. Hi. Sor

### Full evaluation

In [333]:
import evaluate  # instead of datasets.load_metric
from tqdm import tqdm

# Initialize metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

FastLanguageModel.for_inference(model)  # Enable fast inference

agent_name = "Graham Paige"
agent_name_lower = agent_name.replace(' ', '').lower()
speaker_prefix = tokenizer.encode(f"{agent_name_lower}:", return_tensors="pt").to("cuda")[:, 1:]  # remove BOS

# Store predictions and references
generated_texts = []
reference_texts = []

for example in tqdm(test_data):  # assuming test_set is a list of dicts with 'conversations'
    # === Step 1: Prepare messages ===
    messages = [example['conversations'][:-1]]  # all but final message
    reference = example['conversations'][-1]['content']  # the target response

    # === Step 2: Format prompt ===
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Remove old system prompt (~first 26 tokens)
    inputs = inputs[:, 26:]

    # Insert new system prompt
    custom_prompt = "<|begin_of_text|>" + custom_system_message
    custom_prompt_tokens = tokenizer.encode(custom_prompt, return_tensors="pt").to("cuda")
    inputs = torch.cat([custom_prompt_tokens, inputs], dim=1)

    # Append agent speaker token
    inputs = torch.cat([inputs, speaker_prefix], dim=1)

    # === Step 3: Generate response ===
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.5,
        min_p=0.1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)

    generated_texts.append(generated.strip())
    reference_texts.append(reference.strip())


100%|██████████| 180/180 [05:17<00:00,  1.76s/it]


ValueError: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: [['Thank', 'you,', 'thank', 'you.', "That's", 'all', 'for', 'now.']],
Input references: [['grahampaige:', 'Okay,', 'we', 'now', 'move', 'to', 'the', 'prep', 'committee.', 'And', 'currently,', 'Ms.', 'Lee', 'and', 'me,', 'the', 'two', 'of', 'us', 'are', 'currently', 'on', 'our', 'board.', 'So', 'we', 'could', 'have', 'two', 'new', 'people', 'if', 'we', 'have', 'two', 'volunteers,', 'unless', 'Ms.', 'Lee', 'wants', 'to', 'volunteer', 'again.']]

In [334]:
# BLEU expects list of strings (predictions) and list of lists (references)
bleu_preds = generated_texts  # list of strings
bleu_refs = [[ref] for ref in reference_texts]  # list of lists of strings

bleu_score = bleu.compute(predictions=bleu_preds, references=bleu_refs)

# ROUGE (expects strings)
rouge_score = rouge.compute(predictions=generated_texts, references=reference_texts)

# BERTScore (expects strings)
bertscore_result = bertscore.compute(predictions=generated_texts, references=reference_texts, lang="en")

print("\n--- Evaluation Results ---")
print(f"BLEU: {bleu_score['bleu']:.4f}")
print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")
print(f"BERTScore F1: {sum(bertscore_result['f1']) / len(bertscore_result['f1']):.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Evaluation Results ---
BLEU: 0.0135


AttributeError: 'numpy.float64' object has no attribute 'mid'

In [335]:
print("\n--- Evaluation Results ---")
print(f"BLEU: {bleu_score['bleu']:.4f}")
print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")
print(f"BERTScore F1: {sum(bertscore_result['f1']) / len(bertscore_result['f1']):.4f}")



--- Evaluation Results ---
BLEU: 0.0135
ROUGE-L: 0.0924
BERTScore F1: 0.8318


What do these scores mean?
Metric	Your score	Typical interpretation
BLEU	0.0135	Very low — almost no n-gram overlap with references
ROUGE-L	0.0924	Also quite low — limited overlap in longest common subsequence
BERTScore	0.8318	Pretty good — semantic similarity is fairly strong


Reasonability and context:
BLEU and ROUGE are surface-level overlap metrics. Scores near zero suggest your generated responses differ a lot word-for-word from the references.

BERTScore captures semantic similarity and is more forgiving about wording changes. 0.83 is pretty decent and indicates your model’s output is somewhat close in meaning, even if wording is different.
