In [65]:
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch

import json
import os
from datasets import Dataset
from transformers import AutoTokenizer

import random
from typing import List, Tuple
from unsloth import is_bfloat16_supported

from transformers import TextStreamer

In [132]:
def train_test_split(member: str, test_size: float = 0.2, seed: int = 42, data_path: str = '/work/users/s/m/smerrill/Albemarle/dataset') -> Tuple[List[dict], List[dict]]:
    """
    Splits the dataset into training and test sets. Synthetic data is always added to the training set.

    Parameters:
    - member: The name identifier for the board member.
    - test_size: Proportion of the real (non-synthetic) data to include in the test split.
    - seed: Random seed for reproducibility.
    - data_path: Base directory for the dataset files.

    Returns:
    - A tuple (train_data, test_data)
    """
    real_data, synth_data = [], []

    if member == 'acuff':
        real_data = load_chat_dataset(os.path.join(data_path, 'kateacuff.txt'))
        synth_data = load_chat_dataset(os.path.join(data_path, 'synth_kateacuff.txt'))
    elif member == 'osborne':
        real_data = load_chat_dataset(os.path.join(data_path, 'ellenosborne.txt'))
        synth_data = load_chat_dataset(os.path.join(data_path, 'synth_ellenosborne.txt'))
    elif member == 'paige':
        real_data = load_chat_dataset(os.path.join(data_path, 'grahampaige.txt'))
        synth_data = load_chat_dataset(os.path.join(data_path, 'synth_grahampaige.txt'))
    elif member == 'le':
        real_data = load_chat_dataset(os.path.join(data_path, 'judyle.txt'))
        synth_data = load_chat_dataset(os.path.join(data_path, 'synth_judyle.txt'))
    elif member == 'callsen':
        real_data = load_chat_dataset(os.path.join(data_path, 'katrinacallsen.txt'))
    elif member == 'oberg':
        real_data = load_chat_dataset(os.path.join(data_path, 'davidoberg.txt'))
    elif member == 'alcaro':
        real_data = load_chat_dataset(os.path.join(data_path, 'jonnoalcaro.txt'))
    else:
        raise ValueError(f"Unknown member: {member}")

    if not 0 < test_size < 1:
        raise ValueError("test_size must be a float between 0 and 1.")

    # Shuffle and split only the real data
    random.seed(seed)
    shuffled_real = real_data.copy()
    random.shuffle(shuffled_real)

    split_index = int(len(shuffled_real) * (1 - test_size))
    train_data = shuffled_real[:split_index] + synth_data
    test_data = shuffled_real[split_index:]

    return train_data, test_data


def load_chat_dataset(input_path):
    """
    Load a chat-style message dataset from a JSON or JSONL file.
    
    Returns:
        data (list): A list of message dictionaries.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        if input_path.endswith(".jsonl"):
            data = [json.loads(line) for line in f]
        else:  # Assume .json
            data = json.load(f)
    
    print(f"Dataset loaded from: {input_path}")
    print(f"Total examples: {len(data)}")
    return data

def convert_to_chat_format(data):
    result = []
    for item in data:
        result.append({
            "role": "user",
            "content": item['prompt']
        })
        result.append({
            "role": "assistant",
            "content": item['response']
        })
    return result

def combine_conversations(dataset):
    # Combine role and content into a list of dicts (conversation format)
    conversations = []
    # Assuming each row corresponds to a single message, group by conversation if possible
    # If it's just single messages in sequence, we can pair messages by user-assistant or whatever fits your logic
    # Here, I'll assume consecutive rows alternate roles and should be paired into conversations

    new_data = []
    i = 0
    while i < len(dataset):
        # Take two rows at a time: user + assistant
        if i + 1 < len(dataset):
            convo = [
                {"role": dataset[i]['role'], "content": dataset[i]['content']},
                {"role": dataset[i+1]['role'], "content": dataset[i+1]['content']}
            ]
            new_data.append({"conversations": convo})
            i += 2
        else:
            # If odd number of rows, last one alone
            convo = [{"role": dataset[i]['role'], "content": dataset[i]['content']}]
            new_data.append({"conversations": convo})
            i += 1

    return Dataset.from_list(new_data)


In [154]:
train_data, test_data = train_test_split('paige')
train_data = convert_to_chat_format(train_data)
train_data = combine_conversations(train_data)

Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/grahampaige.txt
Total examples: 896
Dataset loaded from: /work/users/s/m/smerrill/Albemarle/dataset/synth_grahampaige.txt
Total examples: 50


In [192]:
max_seq_length = 2048
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-PCIE-40GB. Num GPUs = 1. Max memory: 39.495 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu118. CUDA: 8.0. CUDA Toolkit: 11.8. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [193]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass


In [194]:
train_data = train_data.map(formatting_prompts_func, batched = True,)

Map: 100%|██████████| 766/766 [00:00<00:00, 17129.65 examples/s]


In [195]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_data,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.2,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 766/766 [00:01<00:00, 495.96 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Map (num_proc=128): 100%|██████████| 766/766 [00:02<00:00, 351.33 examples/s]


In [196]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

"<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSpeaker_1: Hi, Mr. Page.\ngrahampaige: Hey, Dr. Heston, how are you? Doing well. Good, good. OK, it is now 8.17, and we will proceed with our next item on the agenda. Anti-racism policy update, Dr. Bernard Harrison. Dr. Harrison.\nSpeaker_1: Ms. Johnson, shall I share my screen? Yes. Thank you, ma'am. You see a full screen?\nSpeaker_1: Yeah. Very good.\nSpeaker_1: So are we seeing a full screen? All right. Sorry. You see a full screen? Yep. Yes. OK. So good evening board members, Dr. Haas as well, and the community. I am Bernard Hairston, your assistant superintendent of School and Community Empowerment. I will be joined by Jasmine Fernandez this evening, the project manager of the Anti-Racism Steering Committee and Policy Implementation Team. We will present an overview and update to th

In [197]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 grahampaige: That's not appearing. The slide's a little bit off. OK, there it is. It's there now. OK.<|eot_id|>"

In [198]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 766 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 12,156,928/3,000,000,000 (0.41% trained)


Step,Training Loss
1,0.8234
2,0.5429
3,0.6167
4,2.0071
5,0.8457
6,1.1462
7,0.6941
8,0.8987
9,0.9139
10,0.5216


In [206]:
train_data['conversations'][0]

[{'content': "Speaker_26: So Mr. Page, I'll turn it back over to you. It sounds like we have consensus to move forward.\ngrahampaige: Right, we do. So thank you for the out, and before I miss waters wicks leaves. This is just something I heard through the rumor mill so it may not necessarily be true, but I understand that she is only that one of her main incentives for doing this for us is that she expects some new cars over. at Albemarle High School. So we might have to look at some of those one-time funds for her new cars. I'm not sure really what it is. But that's what the rumor mill was saying. I think it's one per high school, Mr. Page.\nSpeaker_1: One per high school.\ngrahampaige: Oh, OK. I thought that was only at Albemarle. But OK, one per high school. So Monticello Western and Albemarle. Yes, sir. Thank you, Ms. Watters-Wicks. We'll see what we can do.\nSpeaker_1: OK. I think the students would like really fast ones.",
  'role': 'user'},
 {'content': 'grahampaige: OK. All rig

In [221]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Speaker_26: So Mr. Page, I'll turn it back over to you. It sounds like we have consensus to move forward.\ngrahampaige: Right, we do. So thank you for the out, and before I miss waters wicks leaves. This is just something I heard through the rumor mill so it may not necessarily be true, but I understand that she is only that one of her main incentives for doing this for us is that she expects some new cars over. at Albemarle High School. So we might have to look at some of those one-time funds for her new cars. I'm not sure really what it is. But that's what the rumor mill was saying. I think it's one per high school, Mr. Page.\nSpeaker_1: One per high school.\ngrahampaige: Oh, OK. I thought that was only at Albemarle. But OK, one per high school. So Monticello Western and Albemarle. Yes, sir. Thank you, Ms. Watters-Wicks. We'll see what we can do.\nSpeaker_1: OK. I think the students would like really fast ones."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

OK. I think the students would like really fast ones.

OK. I think the students would like really fast ones.
OK. I think the students would like really fast ones.

 OK. I think the students would like really fast ones.
 
OK. I think the students would like really fast ones.<|eot_id|>


In [208]:
# inputs is a tensor, shape: (batch_size, seq_length)
print("Inputs shape:", inputs.shape)

# Get first example token IDs as a list
input_ids_list = inputs[0].tolist()

# Decode back to text
decoded_text = tokenizer.decode(input_ids_list)

print("Token IDs:", input_ids_list)
print("Decoded Text:", decoded_text)

# Show tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids_list)
print("Tokens:", tokens)


Inputs shape: torch.Size([1, 278])
Token IDs: [128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1627, 5887, 220, 2366, 19, 271, 128009, 128006, 882, 128007, 271, 83136, 62, 1627, 25, 2100, 4491, 13, 5874, 11, 358, 3358, 2543, 433, 1203, 927, 311, 499, 13, 1102, 10578, 1093, 584, 617, 24811, 311, 3351, 4741, 627, 911, 1494, 23465, 7404, 25, 10291, 11, 584, 656, 13, 2100, 9901, 499, 369, 279, 704, 11, 323, 1603, 358, 3194, 21160, 289, 5908, 11141, 13, 1115, 374, 1120, 2555, 358, 6755, 1555, 279, 59001, 2606, 779, 433, 1253, 539, 14647, 387, 837, 11, 719, 358, 3619, 430, 1364, 374, 1193, 430, 832, 315, 1077, 1925, 36580, 369, 3815, 420, 369, 603, 374, 430, 1364, 25283, 1063, 502, 9515, 927, 13, 520, 32672, 336, 277, 273, 5234, 6150, 13, 2100, 584, 2643, 617, 311, 1427, 520, 1063, 315, 1884, 832, 7394, 10736, 369, 1077, 502, 9515, 13, 358, 2846, 539, 2771, 2216, 1148, 433, 374, 13, 2030, 430, 596, 1148, 279, 59001, 2606, 574, 

In [223]:
import torch
from transformers import TextStreamer

model.eval()
streamer = TextStreamer(tokenizer)

sample_inputs = [
    # No <|begin_of_text|> here
    """<|eot_id|><|start_header_id|>user<|end_header_id|>

Speaker_26: So Mr. Page, I'll turn it back over to you. It sounds like we have consensus to move forward.
grahampaige: Right, we do. So thank you for the out, and before I miss waters wicks leaves. This is just something I heard through the rumor mill so it may not necessarily be true, but I understand that she is only that one of her main incentives for doing this for us is that she expects some new cars over. at Albemarle High School. So we might have to look at some of those one-time funds for her new cars. I'm not sure really what it is. But that's what the rumor mill was saying. I think it's one per high school, Mr. Page.
Speaker_1: One per high school.
grahampaige: Oh, OK. I thought that was only at Albemarle. But OK, one per high school. So Monticello Western and Albemarle. Yes, sir. Thank you, Ms. Watters-Wicks. We'll see what we can do.
Speaker_1: OK. I think the students would like really fast ones.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ngrahampaige:"""
]

for i, prompt in enumerate(sample_inputs):
    print(f"\n--- Example {i + 1} ---")

    formatted_prompt = f"<|begin_of_text|>{prompt}"
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=4096,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.9,
            top_p=0.95,
            repetition_penalty=2.1,
            streamer=streamer,  # Comment this out if you want to capture the result
        )


--- Example 1 ---
<|begin_of_text|><|begin_of_text|><|eot_id|><|start_header_id|>user<|end_header_id|>

Speaker_26: So Mr. Page, I'll turn it back over to you. It sounds like we have consensus to move forward.
grahampaige: Right, we do. So thank you for the out, and before I miss waters wicks leaves. This is just something I heard through the rumor mill so it may not necessarily be true, but I understand that she is only that one of her main incentives for doing this for us is that she expects some new cars over. at Albemarle High School. So we might have to look at some of those one-time funds for her new cars. I'm not sure really what it is. But that's what the rumor mill was saying. I think it's one per high school, Mr. Page.
Speaker_1: One per high school.
grahampaige: Oh, OK. I thought that was only at Albemarle. But OK, one per high school. So Monticello Western and Albemarle. Yes, sir. Thank you, Ms. Watters-Wicks. We'll see what we can do.
Speaker_1: OK. I think the students w