In [57]:
! pip install -q evaluate torch tqdm datasets peft transformers rouge_score hf_transfer colorama
! pip install -q -U bitsandbytes

In [77]:
import os
import yaml
import torch
from transformers import (
    TrainingArguments,
    Trainer,
)
from torch.utils.data import DataLoader
from datasets import load_dataset, load_from_disk, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from pprint import pprint
from IPython.display import display, HTML
import pandas as pd

In [81]:
# Toy dialogue/summary pair reused by the cells below.
sample = dict(
    dialogue="\n".join(
        [
            "A: Hi!",
            "B: Hello! How are you?",
            "A: I'm great, thanks!",
        ]
    ),
    summary="A greets B and says they're doing well.",
)

print(sample)

{'dialogue': "A: Hi!\nB: Hello! How are you?\nA: I'm great, thanks!", 'summary': "A greets B and says they're doing well."}


In [61]:
from IPython.display import display, HTML
def visualize_chat_template_jupyter(text, show_newlines=True):
    """
    Render a Llama-3-style chat-template string as color-coded HTML.

    The text is HTML-escaped, the template's special tokens are colored cyan,
    and the role name inside each
    ``<|start_header_id|>ROLE<|end_header_id|>`` header gets a role-specific
    highlight. Role words that merely occur in message content (for example
    "You are a helpful assistant") are left unstyled.

    Args:
        text: Chat-template string, e.g. the output of
            ``tokenizer.apply_chat_template(..., tokenize=False)``.
        show_newlines: When True, every newline is shown as a pink
            return-arrow glyph followed by a line break; when False only the
            line break is emitted.

    Returns:
        HTML object - use with display()
    """
    import html
    import re

    token_style = "color: #00CED1; font-weight: bold;"
    role_styles = {
        "system": "color: #FFD700; font-weight: bold; background: #3a3a00; padding: 2px 4px;",
        "user": "color: #1E90FF; font-weight: bold; background: #001a33; padding: 2px 4px;",
        "assistant": "color: #32CD32; font-weight: bold; background: #0d2d0d; padding: 2px 4px;",
    }

    def span(style, content):
        return f'<span style="{style}">{content}</span>'

    # Escape HTML special characters; the special tokens therefore have to be
    # matched in their escaped form from here on.
    text = html.escape(text)
    begin_tok, start_tok, end_tok, eot_tok = (
        html.escape(token)
        for token in (
            "<|begin_of_text|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|eot_id|>",
        )
    )

    # One pass over the text: a complete role header is tried first so the role
    # name is only styled when it sits between the two header tokens; any other
    # special token is colored on its own.
    pattern = re.compile(
        "(?P<header>{start}(?P<role>\\w+){end})|(?P<token>{tokens})".format(
            start=re.escape(start_tok),
            end=re.escape(end_tok),
            tokens="|".join(
                re.escape(token) for token in (begin_tok, start_tok, end_tok, eot_tok)
            ),
        )
    )

    def colorize(match):
        role = match.group("role")
        if role is None:
            return span(token_style, match.group("token"))
        role_style = role_styles.get(role)
        # Unknown roles keep their plain text but the header tokens are still colored.
        role_html = span(role_style, role) if role_style else role
        return span(token_style, start_tok) + role_html + span(token_style, end_tok)

    text = pattern.sub(colorize, text)

    # Make newlines visible. The marker is written as a numeric character
    # reference so it renders correctly regardless of the file's encoding.
    line_break = "<br>"
    if show_newlines:
        line_break = span("color: #ff1493; font-weight: bold;", "&#x21B5;") + "<br>"
    text = text.replace("\n", line_break)

    # Wrap in styled div
    html_output = f'''
    <div style="
        font-family: 'Courier New', monospace;
        background: #1e1e1e;
        padding: 20px;
        border-radius: 8px;
        color: #d4d4d4;
        font-size: 13px;
        line-height: 1.6;
        overflow-x: auto;
        border: 2px solid #444;
    ">
        {text}
    </div>
    '''

    return HTML(html_output)


def _extract_role_section(text, role):
    """Return the stripped content of the first ``role`` message in ``text``, or None if that header is absent."""
    header = f"<|start_header_id|>{role}<|end_header_id|>"
    header_pos = text.find(header)
    if header_pos == -1:
        return None
    content_start = header_pos + len(header)
    content_end = text.find("<|eot_id|>", content_start)
    if content_end == -1:
        # Unterminated section (e.g. a generation prompt ending on the assistant
        # header): take everything to the end instead of slicing with -1, which
        # would silently drop the last character.
        content_end = len(text)
    return text[content_start:content_end].strip()


def print_template_breakdown_jupyter(text):
    """
    Structured breakdown for Jupyter with color-coded sections.

    Looks for the first system, user and assistant message in a Llama-3-style
    chat-template string and shows each one found in its own colored box.
    System and user content are truncated to 300 characters; assistant content
    is shown in full. All content is HTML-escaped before being embedded, so
    characters such as ``<`` or ``&`` in a dialogue are displayed literally.

    Args:
        text: Chat-template string containing
            ``<|start_header_id|>ROLE<|end_header_id|>`` headers.

    Returns:
        HTML object - use with display()
    """
    import html

    # Emoji are written as numeric character references so they render
    # correctly regardless of the file's encoding.
    sections_html = '<div style="font-family: Arial, sans-serif;">'
    sections_html += '<h3 style="color: #333;">&#x1F4DD; CHAT TEMPLATE BREAKDOWN</h3>'
    sections_html += '<hr style="border: 1px solid #ddd;">'

    # Extract system
    system_content = _extract_role_section(text, "system")
    if system_content is not None:
        sections_html += f'''
        <div style="margin: 15px 0; padding: 15px; background: #fff9e6; border-left: 4px solid #FFD700; border-radius: 4px;">
            <h4 style="color: #b8860b; margin: 0 0 10px 0;">&#x1F7E1; SYSTEM</h4>
            <pre style="margin: 0; white-space: pre-wrap; font-size: 12px;">{html.escape(system_content[:300])}</pre>
        </div>
        '''

    # Extract user
    user_content = _extract_role_section(text, "user")
    if user_content is not None:
        sections_html += f'''
        <div style="margin: 15px 0; padding: 15px; background: #e6f2ff; border-left: 4px solid #1E90FF; border-radius: 4px;">
            <h4 style="color: #1E90FF; margin: 0 0 10px 0;">&#x1F535; USER (PROMPT)</h4>
            <pre style="margin: 0; white-space: pre-wrap; font-size: 12px;">{html.escape(user_content[:300])}</pre>
        </div>
        '''

    # Extract assistant
    asst_content = _extract_role_section(text, "assistant")
    if asst_content is not None:
        sections_html += f'''
        <div style="margin: 15px 0; padding: 15px; background: #e6ffe6; border-left: 4px solid #32CD32; border-radius: 4px;">
            <h4 style="color: #228b22; margin: 0 0 10px 0;">&#x1F7E2; ASSISTANT (TRAINING TARGET)</h4>
            <pre style="margin: 0; white-space: pre-wrap; font-size: 12px; font-weight: bold;">{html.escape(asst_content)}</pre>
            <p style="margin: 10px 0 0 0; color: #666; font-size: 11px;">&#x2705; Only this section is used for loss calculation</p>
        </div>
        '''

    sections_html += '</div>'

    return HTML(sections_html)


# Build Messages for Each Sample

### Helper functions

In [62]:
def build_user_prompt(dialogue: str, task_instruction: str) -> str:
    """Assemble the user prompt: instruction, blank line, dialogue section, then the summary cue.

    Args:
        dialogue: Conversation text placed under the ``## Dialogue:`` heading.
        task_instruction: Instruction text placed at the very top of the prompt.

    Returns:
        The prompt string, ending with ``## Summary:``.
    """
    template = "{}\n\n## Dialogue:\n{}\n## Summary:"
    return template.format(task_instruction, dialogue)
    

def build_messages_for_sample(sample, task_instruction, include_assistant=False):
    """
    Turn one sample into a chat-style list of role/content messages.

    Args:
        sample: Mapping with a "dialogue" entry; a "summary" entry is read
            only when ``include_assistant`` is True.
        task_instruction: Instruction text passed to ``build_user_prompt``.
        include_assistant: When True, append the sample's summary as an
            assistant message after the user message.

    Returns:
        A list with the user message, optionally followed by the assistant
        message.
    """
    user_turn = {
        "role": "user",
        "content": build_user_prompt(sample["dialogue"], task_instruction),
    }
    if not include_assistant:
        return [user_turn]

    assistant_turn = {"role": "assistant", "content": sample["summary"]}
    return [user_turn, assistant_turn]

### Task Instruction

In [63]:
# Instruction placed at the top of every user prompt. Note the trailing space
# at the end of the string.
task_instruction = "You are a helpful assistant who writes concise, factual summaries of conversations. Summarize the following conversation into a single sentence. "

task_instruction

'You are a helpful assistant who writes concise, factual summaries of conversations. Summarize the following conversation into a single sentence. '

### Sample Messages With Assistant Response

In [82]:
## ORIGINAL SAMPLE
# Print the raw dialogue/summary dict again so it can be compared with the
# chat messages built from it in the following cells.
print(sample)

{'dialogue': "A: Hi!\nB: Hello! How are you?\nA: I'm great, thanks!", 'summary': "A greets B and says they're doing well."}


In [64]:
# Full conversation: the user prompt followed by the reference summary as the
# assistant turn.
sample_messages_w_assistant_resp = build_messages_for_sample(
    sample=sample,
    task_instruction=task_instruction,
    include_assistant=True,
)

pprint(sample_messages_w_assistant_resp)

[{'content': 'You are a helpful assistant who writes concise, factual '
             'summaries of conversations. Summarize the following conversation '
             'into a single sentence. \n'
             '\n'
             '## Dialogue:\n'
             'A: Hi!\n'
             'B: Hello! How are you?\n'
             "A: I'm great, thanks!\n"
             '## Summary:',
  'role': 'user'},
 {'content': "A greets B and says they're doing well.", 'role': 'assistant'}]


### Sample Messages Without Assistant Response

In [65]:
# Prompt-only conversation: just the user turn, with no assistant message.
sample_messages_wo_assistant_resp = build_messages_for_sample(
    sample=sample,
    task_instruction=task_instruction,
    include_assistant=False,
)

pprint(sample_messages_wo_assistant_resp)

[{'content': 'You are a helpful assistant who writes concise, factual '
             'summaries of conversations. Summarize the following conversation '
             'into a single sentence. \n'
             '\n'
             '## Dialogue:\n'
             'A: Hi!\n'
             'B: Hello! How are you?\n'
             "A: I'm great, thanks!\n"
             '## Summary:',
  'role': 'user'}]


# Tokenization and Chat Template

In [66]:
# Load the tokenizer whose chat template is applied to the sample messages in
# the cells below.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

In [85]:
# Render the user + assistant conversation through the chat template as plain
# text (no tokenization, no trailing generation prompt).
chat_full = tokenizer.apply_chat_template(
    sample_messages_w_assistant_resp,
    tokenize=False,
    add_generation_prompt=False,
)

print("original messages: \n{}\n".format(sample_messages_w_assistant_resp))
print("=" * 80)
print("After applying chat template: \n{}".format(chat_full))

original messages: 
[{'role': 'user', 'content': "You are a helpful assistant who writes concise, factual summaries of conversations. Summarize the following conversation into a single sentence. \n\n## Dialogue:\nA: Hi!\nB: Hello! How are you?\nA: I'm great, thanks!\n## Summary:"}, {'role': 'assistant', 'content': "A greets B and says they're doing well."}]

After applying chat template: 
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 10 Nov 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a helpful assistant who writes concise, factual summaries of conversations. Summarize the following conversation into a single sentence. 

## Dialogue:
A: Hi!
B: Hello! How are you?
A: I'm great, thanks!
## Summary:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

A greets B and says they're doing well.<|eot_id|>


In [68]:
# Render only the user turn and ask the template to append the generation
# prompt, so the text ends where the assistant's reply would begin.
chat_prompt_only = tokenizer.apply_chat_template(
    sample_messages_wo_assistant_resp,
    tokenize=False,
    add_generation_prompt=True,
)

chat_prompt_only

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 10 Nov 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are a helpful assistant who writes concise, factual summaries of conversations. Summarize the following conversation into a single sentence. \n\n## Dialogue:\nA: Hi!\nB: Hello! How are you?\nA: I'm great, thanks!\n## Summary:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

In [69]:

# Color-coded view of the full conversation (prompt + assistant answer).
display(visualize_chat_template_jupyter(chat_full))

print("\n{}\n".format("=" * 80))

# Same view for the prompt-only text that ends with the generation prompt.
display(visualize_chat_template_jupyter(chat_prompt_only))





In [70]:
def preprocess_samples(examples, tokenizer, task_instruction, max_length):
    """Tokenize chat-formatted samples and keep labels only for the assistant part.

    For every dialogue/summary pair the full conversation (prompt + summary)
    and the prompt-only text (ending with the generation prompt) are rendered
    through the tokenizer's chat template. The full text is tokenized with
    offsets, and every token whose character start lies before the end of the
    prompt-only text gets the label -100; the remaining tokens keep their ids
    as labels.

    Args:
        examples: Mapping with "dialogue" and "summary" sequences of equal
            length.
        tokenizer: Object providing ``apply_chat_template`` and a call
            interface that returns "input_ids", "attention_mask" and
            "offset_mapping".
        task_instruction: Instruction text forwarded to
            ``build_messages_for_sample``.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        Dict with "input_ids", "labels" and "attention_mask", each a list
        holding one entry per input pair.
    """
    batch = {"input_ids": [], "labels": [], "attention_mask": []}

    for dialogue, summary in zip(examples["dialogue"], examples["summary"]):
        record = {"dialogue": dialogue, "summary": summary}

        # Two views of the same sample: with and without the assistant answer.
        conversation = build_messages_for_sample(
            record, task_instruction, include_assistant=True
        )
        prompt_messages = build_messages_for_sample(
            record, task_instruction, include_assistant=False
        )

        rendered_full = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
        rendered_prompt = tokenizer.apply_chat_template(
            prompt_messages, tokenize=False, add_generation_prompt=True
        )
        # Character position at which the assistant's answer starts.
        boundary = len(rendered_prompt)

        encoded = tokenizer(
            rendered_full,
            max_length=max_length,
            truncation=True,
            padding=False,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        ids = encoded["input_ids"]

        # Index of the first token starting at or after the boundary; when no
        # such token exists (e.g. after truncation) every token is masked.
        first_target = next(
            (
                position
                for position, (char_start, _) in enumerate(encoded["offset_mapping"])
                if char_start >= boundary
            ),
            len(ids),
        )

        batch["input_ids"].append(ids)
        batch["labels"].append([-100] * first_target + ids[first_target:])
        batch["attention_mask"].append(encoded["attention_mask"])

    return batch

In [73]:
# Wrap the single toy sample in a one-row Dataset with the two columns that
# preprocess_samples reads.
samples_dataset = Dataset.from_dict(
    {column: [sample[column]] for column in ("dialogue", "summary")}
)

processed_samples = preprocess_samples(
    examples=samples_dataset,
    tokenizer=tokenizer,
    task_instruction=task_instruction,
    max_length=256,
)

In [78]:
# Rebuild the user prompt with the same helper that preprocess_samples uses
# (via build_messages_for_sample), so the text printed under the banner is
# exactly what went through the chat template and tokenizer.
prompt_text = build_user_prompt(sample["dialogue"], task_instruction)


print("\n============================")
# The emoji is written as an escape sequence so it survives re-encoding of the file.
print("\U0001F4DC ORIGINAL PROMPT SENT TO TOKENIZER")
print("============================")
print(prompt_text)
print("============================\n")
# `sample` is the dict defined at the top of the notebook; the previous
# reference to `example` was never defined and failed on a fresh kernel.
print(f"Ground truth summary: \n{sample['summary']}\n")
print("============================\n")

# ---------------------------------------------------------------------------
# Visualize tokenization and masking
# ---------------------------------------------------------------------------

# Extract first example
ex = {k: v[0] for k, v in processed_samples.items()}
# Decode each id on its own so every row of the table shows a single token.
tokens = [tokenizer.decode([tid]) for tid in ex["input_ids"]]

df = pd.DataFrame({
    "token": tokens,
    "input_id": ex["input_ids"],
    "attention_mask": ex["attention_mask"],
    "label": ex["labels"],
})
# A label of -100 marks a token that preprocess_samples masked out.
df["masked"] = df["label"].eq(-100)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

print(df.head(100))



📜 ORIGINAL PROMPT SENT TO TOKENIZER
You are a helpful assistant who writes concise, factual summaries of conversations. Summarize the following conversation into a single sentence.

## Dialogue:
A: Hi!
B: Hello! How are you?
A: I'm great, thanks!

## Summary:

Ground truth summary: 
A greets B and says they're doing well.


                  token  input_id  attention_mask   label  masked
0     <|begin_of_text|>    128000               1    -100    True
1   <|start_header_id|>    128006               1    -100    True
2                system      9125               1    -100    True
3     <|end_header_id|>    128007               1    -100    True
4                  \n\n       271               1    -100    True
5                   Cut     38766               1    -100    True
6                  ting      1303               1    -100    True
7             Knowledge     33025               1    -100    True
8                  Date      2696               1    -100    True
9         