In [1]:
from transformers import AutoTokenizer

student_ckpt = "meta-llama/Llama-3.2-1B-Instruct"
student_tokenizer = AutoTokenizer.from_pretrained(
    student_ckpt,
    use_fast=True,
    device_map="auto",
    padding_side="left",
    extra_special_tokens={"pad_token":"<|pad|>", "end_of_translation_token":"<|end_of_translation|>"}
)

print(f"Tokenization padding side: {student_tokenizer.padding_side}")

src_lang = "en"
tgt_lang = "fr"

def unlabeled_chat_template(example):
    messages = [
        {"role": "system", "content": "You are a professional translator. Translate the provided text from English to French, remaining true to the source text. Do not add any additional commentary or conversational elements to your response."},
        {"role": "user", "content": example["translation"][src_lang]}
    ]
    prompt = student_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return {"prompt": prompt}

def apply_chat_template(example):
    messages = [
        {"role": "system", "content": "You are a professional translator. Translate the provided text from English to French, remaining true to the source text. Do not add any additional commentary or conversational elements to your response."},
        {"role": "user", "content": example["translation"][src_lang]},
        {"role": "assistant", "content": example["translation"][tgt_lang] + student_tokenizer.end_of_translation_token}
    ]
    prompt = student_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return {"prompt": prompt}

Tokenization padding side: left


In [2]:
print(apply_chat_template({"translation":{"en":"Hello", "fr":"Bonjour"}}))

{'prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 31 Mar 2025\n\nYou are a professional translator. Translate the provided text from English to French, remaining true to the source text. Do not add any additional commentary or conversational elements to your response.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBonjour<|end_of_translation|><|eot_id|>'}


In [3]:
print(unlabeled_chat_template({"translation":{"en":"Hello", "fr":"Bonjour"}}))

{'prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 31 Mar 2025\n\nYou are a professional translator. Translate the provided text from English to French, remaining true to the source text. Do not add any additional commentary or conversational elements to your response.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'}


In [4]:
from datasets import load_dataset, Dataset

# Load the dataset

dataset = load_dataset("Helsinki-NLP/europarl", "en-fr")

# Shuffle it so we get a balanced distribution

shuffled_dataset = dataset.shuffle(seed=42)
# shuffled_dataset.save_to_disk("europarl_dataset")

# Select train, val, and test splits

train_dataset = Dataset.from_dict(shuffled_dataset["train"][:32000])
val_dataset = Dataset.from_dict(shuffled_dataset["train"][32000:36571])
test_dataset = Dataset.from_dict(shuffled_dataset["train"][36571:45715])

# train_dataset = load_dataset("europarl_dataset", split='train[:70000]')
# val_dataset = load_dataset("europarl_dataset", split='train[70000:80000]')
# test_dataset = load_dataset("europarl_dataset", split='train[80000:100000]')

# Apply the prompt template

train_prompt = train_dataset.map(apply_chat_template)
val_prompt = val_dataset.map(apply_chat_template)
test_prompt = test_dataset.map(apply_chat_template)

test_prompt_unlabeled = test_dataset.map(unlabeled_chat_template)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4571 [00:00<?, ? examples/s]

Map:   0%|          | 0/9144 [00:00<?, ? examples/s]

Map:   0%|          | 0/9144 [00:00<?, ? examples/s]

In [5]:
def tokenize_function(example):
    tokens = student_tokenizer(example['prompt'], truncation=True, padding='max_length', max_length=256)
    tokens['labels'] = [
        -100 if token == student_tokenizer.pad_token_id else token for token in tokens['input_ids']
    ]
    return tokens

In [6]:
train_tokenized = train_prompt.map(tokenize_function)
test_tokenized = test_prompt.map(tokenize_function)
val_tokenized = val_prompt.map(tokenize_function)

test_unlabeled_tokenized = test_prompt_unlabeled.map(tokenize_function)

train_tokenized.save_to_disk("europarl_dataset/train_tokenized")
val_tokenized.save_to_disk("europarl_dataset/val_tokenized")
test_tokenized.save_to_disk("europarl_dataset/test_tokenized")
test_unlabeled_tokenized.save_to_disk("europarl_dataset/test_tokenized_unlabeled")

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9144 [00:00<?, ? examples/s]

Map:   0%|          | 0/4571 [00:00<?, ? examples/s]

Map:   0%|          | 0/9144 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4571 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9144 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9144 [00:00<?, ? examples/s]

In [7]:
print(student_tokenizer.decode(train_tokenized[100]["input_ids"]))

2025-03-31 23:34:49.580492: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743478489.598461  899774 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743478489.604017  899774 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743478489.618971  899774 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743478489.618981  899774 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743478489.618984  899774 computation_placer.cc:177] computation placer alr

<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 31 Mar 2025

You are a professional translator. Translate the provided text from English to French, remaining true to the source text. Do not add any additional commentary or conversational elements to your response.<|eot_id|><|start_header_id|>user<|end_header_id|>

This institution – which, I would take this opportunity to point out, has absolutely no democratic or electoral legitimacy – can, as and when it sees fit, withdraw or modify a legislative proposal, inform or not inform Parliament of the reasons for its decision and take account, or not take account, of Parliament’s opinion. In short, it can do what it wishes.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Cette institution - dont je rappelle ic