In [203]:
! pip install -q evaluate torch tqdm datasets peft transformers rouge_score hf_transfer colorama
! pip install -q -U bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
from pprint import pprint
from IPython.display import display
import pandas as pd

### Helper functions

In [205]:
def build_user_prompt(dialogue: str, task_instruction: str) -> str:
    """Assemble the user-facing summarization prompt.

    The prompt is laid out as: the task instruction, a blank line, a
    ``## Dialogue:`` section holding ``dialogue``, and a trailing
    ``## Summary:`` header with no newline after it.

    Args:
        dialogue: Conversation text to be summarized.
        task_instruction: Instruction text placed at the top of the prompt.

    Returns:
        The fully formatted prompt string.
    """
    instruction_part = f"{task_instruction}\n\n"
    dialogue_part = f"## Dialogue:\n{dialogue}\n"
    return instruction_part + dialogue_part + "## Summary:"
    

def build_messages_for_sample(sample, task_instruction, include_assistant=False):
    """Turn one sample into a chat-style list of role/content messages.

    Args:
        sample: Mapping with a "dialogue" entry; a "summary" entry is read
            only when ``include_assistant`` is true.
        task_instruction: Instruction text forwarded to ``build_user_prompt``.
        include_assistant: When true, append an assistant message whose
            content is the sample's summary.

    Returns:
        A list with the user message and, optionally, the assistant message.
    """
    user_turn = {
        "role": "user",
        "content": build_user_prompt(sample["dialogue"], task_instruction),
    }
    if not include_assistant:
        return [user_turn]

    assistant_turn = {"role": "assistant", "content": sample["summary"]}
    return [user_turn, assistant_turn]

def preprocess_samples(examples, tokenizer, task_instruction, max_length):
    """Tokenize a batch of samples and mask everything before the assistant reply.

    For each dialogue/summary pair the conversation is rendered twice with the
    tokenizer's chat template: once in full (user + assistant) and once as the
    prompt only, with the generation prompt appended. The character length of
    the prompt-only rendering is the boundary: every token of the full text
    whose start offset lies before that boundary gets label -100, and the
    remaining tokens keep their input id as the label.

    Args:
        examples: Column-style batch; ``examples["dialogue"]`` and
            ``examples["summary"]`` are iterated in parallel.
        tokenizer: Callable tokenizer exposing ``apply_chat_template``; it is
            called with ``return_offsets_mapping=True`` (presumably this needs
            a fast tokenizer; confirm for the model in use).
        task_instruction: Instruction text placed in every user prompt.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        Dict with "input_ids", "labels" and "attention_mask", each a list
        holding one unpadded sequence per sample.
    """
    batch = {"input_ids": [], "labels": [], "attention_mask": []}

    pairs = zip(examples["dialogue"], examples["summary"])
    for dialogue, summary in pairs:
        record = {"dialogue": dialogue, "summary": summary}

        # Two views of the same sample: with and without the assistant turn.
        conversation = build_messages_for_sample(
            record, task_instruction, include_assistant=True
        )
        prompt_only = build_messages_for_sample(
            record, task_instruction, include_assistant=False
        )

        rendered_full = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
        rendered_prompt = tokenizer.apply_chat_template(
            prompt_only, tokenize=False, add_generation_prompt=True
        )
        # Character position in the full text where the assistant reply begins.
        boundary = len(rendered_prompt)

        encoded = tokenizer(
            rendered_full,
            max_length=max_length,
            truncation=True,
            padding=False,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        ids = encoded["input_ids"]

        # First token starting at or after the boundary; if there is none
        # (e.g. the reply was truncated away) the whole sequence is masked.
        first_target = next(
            (
                position
                for position, (char_start, _) in enumerate(encoded["offset_mapping"])
                if char_start >= boundary
            ),
            len(ids),
        )

        batch["input_ids"].append(ids)
        batch["labels"].append([-100] * first_target + ids[first_target:])
        batch["attention_mask"].append(encoded["attention_mask"])

    return batch

# Example

In [206]:
# A tiny hand-written dialogue/summary pair used to walk through each step.
sample = dict(
    dialogue="A: Hi!\nB: Hello! How are you?\nA: I'm great, thanks!",
    summary="A greets B and says they're doing well.",
)

In [207]:
# Instruction placed at the top of every user prompt. Each sentence ends with
# a space, so the final string keeps a trailing space.
task_instruction = "".join(
    (
        "You are a helpful assistant who writes concise, factual summaries of conversations. ",
        "Summarize the following conversation into a single sentence. ",
    )
)

In [208]:
# STEP 1: Wrap the sample in chat messages (user prompt plus assistant summary).
sample_messages_full = build_messages_for_sample(
    sample, task_instruction, include_assistant=True
)

# STEP 2: Render the messages as plain text using the model's chat template.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
chat_full = tokenizer.apply_chat_template(
    sample_messages_full, tokenize=False, add_generation_prompt=False
)

# STEP 3/4: Tokenize and apply assistant-only masking.
# The single sample is stored as a one-row, column-oriented dataset.
samples_dataset = Dataset.from_dict(
    {column: [sample[column]] for column in ("dialogue", "summary")}
)
processed_samples = preprocess_samples(
    samples_dataset,
    tokenizer,
    task_instruction,
    max_length=256,
)

In [209]:
# Print the example after each pipeline stage so the transformations can be
# compared side by side.
print("=" * 80)
print("📜 ORIGINAL EXAMPLE:\n")
pprint(sample)
print("=" * 80)
print("📜 STEP 1: AFTER CONVERTING TO 'MESSAGES':\n")
pprint(sample_messages_full)
print("=" * 80)
print("STEP 2: AFTER APPLYING CHAT TEMPLATE: \n")
print(chat_full)
print("=" * 80)

# ---------------------------------------------------------------------------
# Visualize tokenization and masking
# ---------------------------------------------------------------------------
print("STEP 3/4: AFTER TOKENIZATION AND MASKING: \n")

# Extract first example: take element 0 of every column in the processed batch.
ex = {k: v[0] for k, v in processed_samples.items()}
# Decode each id on its own so every row shows exactly one token's text.
tokens = [tokenizer.decode([tid]) for tid in ex["input_ids"]]

df = pd.DataFrame({
    "token": tokens,
    "input_id": ex["input_ids"],
    "attention_mask": ex["attention_mask"],
    "label": ex["labels"],
})
# A label of -100 marks a token that was masked out during preprocessing.
df["masked"] = df["label"].eq(-100)

# Show every row and full cell contents; these options stay set for the
# rest of the session.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Plain-text rendering of the full token table.
print(df)

📜 ORIGINAL EXAMPLE:

{'dialogue': "A: Hi!\nB: Hello! How are you?\nA: I'm great, thanks!",
 'summary': "A greets B and says they're doing well."}
📜 STEP 1: AFTER CONVERTING TO 'MESSAGES':

[{'content': 'You are a helpful assistant who writes concise, factual '
             'summaries of conversations. Summarize the following conversation '
             'into a single sentence. \n'
             '\n'
             '## Dialogue:\n'
             'A: Hi!\n'
             'B: Hello! How are you?\n'
             "A: I'm great, thanks!\n"
             '## Summary:',
  'role': 'user'},
 {'content': "A greets B and says they're doing well.", 'role': 'assistant'}]
STEP 2: AFTER APPLYING CHAT TEMPLATE: 

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 11 Nov 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a helpful assistant who writes concise, factual summaries of conversations. Summarize the following conversati