## Task 1: Fine-tune LLM for Language Translation







In [4]:
# Install required libraries
!pip install -q torch                                  # Pytorch
!pip install -q transformers datasets                  # Comes from HuggingFace
!pip install -q bitsandbytes                           # For quantization from HuggingFace
!pip install -q peft                                   # Parameter-efficient Fine-tuning from HuggingFace
!pip install -q trl                                    # For supervised fine-tuning for LLMs from HuggingFace
!pip install -q accelerate                             # For distributed training from HuggingFace
!pip install evaluate                                  # For bleu score

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m112.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
# To load the model, we’ll need a HuggingFace API token.
from google.colab import userdata
HF_API_TOKEN = userdata.get('HF_APITOKEN')

# Set the token as an environment variable
import os
os.environ["HF_TOKEN"] = HF_API_TOKEN

Loading Dataset

In [6]:
# Load the dataset from HuggingFace
from datasets import load_dataset


# Download and load the dataset
dataset = load_dataset("Helsinki-NLP/europarl",'de-fr')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/76.3k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1942666 [00:00<?, ? examples/s]

In [4]:
# Display the dataset features
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1942666
    })
})

In [5]:
# Let's view an example from the training dataset
dataset["train"][0]

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode',
  'fr': 'Reprise de la session'}}

In [7]:
# Randomly select 1,000 sentence pairs
sampled_data = dataset["train"].shuffle(seed=40).select(range(1000))

# Split into train (80%) and validation (20%)
train_size = int(0.8 * len(sampled_data))
train_data = sampled_data.select(range(train_size))
test_data = sampled_data.select(range(train_size, len(sampled_data)))

# Print dataset sizes
print(f"Training set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 800
Test set size: 200


In [8]:
print(train_data['translation'][0])

{'de': 'Während Österreich, Deutschland und die Benelux-Staaten bald keine relevanten EU-Außengrenze mehr sichern müssen, muss Italien allein 7.600 km Seegrenze kontrollieren, davon die Hälfte zur Adria und dem südlichen Mittelmeer.', 'fr': 'Alors que l’Autriche, l’Allemagne et les États du Benelux n’auront bientôt plus de frontière extérieure de l’Union européenne à défendre, l’Italie doit surveiller à elle seule 7\xa0600 kilomètres de frontières maritimes, dont la moitié le long de l’Adriatique et de la Méditerranée du Sud.'}


Loading Model

In [9]:
# Load BitsAndBytes object from HuggingFace Transformers
from transformers import BitsAndBytesConfig
import torch

# Set up the quantization configuration
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # Use 4-bit quantization (Q = 4 bits)
    bnb_4bit_use_double_quant=True,        # Double quantization: quantize the quantization constants to save an additional 0.4 bits per parameter
    bnb_4bit_quant_type="nf4",             # Use 4-bit NormalFloat Quantization (optimal for normal weights; enforces w ∈ [-1,1])
    bnb_4bit_compute_dtype=torch.bfloat16  # Dequantize to 16-bits before computation (as described in the paper)
)

# Pass the quantization configuration when loading the model
from transformers import AutoModelForCausalLM

# Load the quantized model
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct",
                                             quantization_config=quant_config)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Loading Tokenizer

In [11]:
# Get the tokenizer object from HuggingFace
from transformers import AutoTokenizer

# Download and initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

tokenizer.pad_token = tokenizer.eos_token

# Specify that padding should be applied to the right side of the sequences.
# This is the standard behavior for Mistral and many other causal language models.
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [12]:
#Evaluate Model A on Test Data

import evaluate

bleu = evaluate.load("bleu")
def compute_bleu(predictions, references):
    results = bleu.compute(predictions=predictions, references=references)
    return results["bleu"]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [16]:
# Tokenization function


def tokenize_function(examples):
    inputs = [ex[0] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs,return_tensors='pt')
    return model_inputs

dataset_a_test = test_data.map(tokenize_function)
print(dataset_a_test[0])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

{'translation': {'de': 'Verdacht scheint mir nicht ausreichend zu sein.', 'fr': "J'estime qu'il ne suffit pas d'avoir des soupçons."}, 'input_ids': [[128000, 67], [128000, 69]], 'attention_mask': [[1, 1], [1, 1]], 'labels': [[128000, 68], [128000, 81]]}


In [19]:
# # prompt = """
# # Verdacht scheint mir nicht ausreichend zu sein.
# # """
# # target = "J'estime qu'il ne suffit pas d'avoir des soupçons."

# for example in dataset:

#         inputs = tokenizer(example['translation']["de"], return_tensors="pt", padding=True, truncation=True, max_length=128)

# # Tokenize the user prompt
# inputs = tokenizer(prompt,text_target=target, return_tensors='pt')

# # Since the model is loaded on GPU, we need to move the input to the same device
# inputs = inputs.to('cuda')

# # Generate a response from the model
# output_tokens = model.generate(
#     inputs["input_ids"],
#     max_new_tokens=40,
#     attention_mask=inputs["attention_mask"]
# )[0]  # Get the first sequence from the batch of generated tokens

# # Decode the output tokens back into text
# output = tokenizer.decode(output_tokens, skip_special_tokens=True)  # Skip special tokens like EOS, padding, etc.

# # Print the output, wrapped to 80 characters per line for better readability
# import textwrap
# print(textwrap.fill(output, width=80))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 Verdacht scheint mir nicht ausreichend zu sein. Es ist mir nicht gelungen, den
Fall zu lösen, und ich bin mir nicht sicher, ob ich es hätte tun können. Es ist
mir nicht gelungen, den Fall zu lösen,


In [24]:
def evaluate_model(model, dataset):
    model.eval()
    predictions, references = [], []
    for example in dataset:

        inputs = tokenizer(example['translation']["de"], return_tensors="pt", padding=True, truncation=True,)
        inputs = inputs.to('cuda')
        output = model.generate(inputs["input_ids"],attention_mask=inputs["attention_mask"],pad_token_id=tokenizer.eos_token_id,max_new_tokens=40)
        pred_text = tokenizer.decode(output[0], skip_special_tokens=True)
        predictions.append(pred_text)
        references.append(example['translation']["fr"])
    return compute_bleu(predictions, references)

In [25]:
initial_bleu = evaluate_model(model, dataset_a_test)
print(f"Initial Model BLEU Score: {initial_bleu}")


Initial Model BLEU Score: 0.003849090809870262
