In [24]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [1]:
#Run in quiet mode and as a shell command
!pip install -q transformers
!pip install -q evaluate
!pip install -U datasets
!pip install -q sacrebleu

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Succes

In [2]:
from huggingface_hub import login
from google.colab import userdata
HF_TOKEN=userdata.get('HF_TOKEN')

if HF_TOKEN:
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    print("Token is not set. Please save the token first.")

Successfully logged in to Hugging Face!


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
model = model.to("cuda")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

## **Loading the Dataset**

In [58]:
from datasets import load_dataset, Dataset
from tqdm import tqdm

dataset = load_dataset("Darth-Vaderr/English-German", split="train", streaming = True)

samples = []
for i, example in tqdm(enumerate(dataset)):
    samples.append(example)
    if i >= 1499:
        break


ds = Dataset.from_list(samples)
ds = ds.train_test_split(train_size=0.8)

# Test split
test = ds["test"]

# Split the train again
ds = ds["train"].train_test_split(train_size=0.8)

# train and val split
train = ds["train"]
val = ds["test"]

1499it [00:00, 5826.39it/s]


In [60]:
print(train)
print(test)
print(val)

Dataset({
    features: ['German', 'English'],
    num_rows: 960
})
Dataset({
    features: ['German', 'English'],
    num_rows: 300
})
Dataset({
    features: ['German', 'English'],
    num_rows: 240
})


## **Testing Vanila Model**

In [62]:
# Test the model for translation without fine tuning.
english = test[3]["English"]

prompt = f"""
### Instruction:
You are a translation assistant. Translate the following English sentence to German.
### English:
{english}
### German:
"""

print(prompt)


### Instruction:
You are a translation assistant. Translate the following English sentence to German.
### English:
So, as far as this is concerned, well done to the Commission.
### German:



In [11]:
from transformers import pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
                max_new_tokens = 50,
                do_sample = True,eos_token_id=tokenizer.eos_token_id,
                device=0,
                return_full_text=False,
                num_return_sequences=1)
output = pipe(prompt)[0]["generated_text"]
res = output.replace(prompt,"")
print(res)

Device set to use cuda:0


Frau van der Laan's Bericht ist richtig, aber dies führt zu dem Schluss, dass der Kommision noch nicht ausreichend Fortschritte auf dem Personalpolitikablauf für den Bundestag erreicht hat.


# **Evalute Score before fine tuning**

In [10]:
# Load SacreBleu

import evaluate
metric = evaluate.load("sacrebleu")


Downloading builder script: 0.00B [00:00, ?B/s]

In [12]:
# test
predictions = ["Hallo, ich bin sankalp"]
references = [["Hallo, ich bin sankalp", "Hallo, ich heiße sankalp"]]
metric.compute(predictions=predictions, references=references)

{'score': 100.00000000000004,
 'counts': [5, 4, 3, 2],
 'totals': [5, 4, 3, 2],
 'precisions': [100.0, 100.0, 100.0, 100.0],
 'bp': 1.0,
 'sys_len': 5,
 'ref_len': 5}

In [63]:
# Generate References
references = [[dp["German"]] for dp in test]

In [64]:
def format_instruction_test(english):
  return f"""
  ### Instruction:
  You are a translation assistant. Translate the following English sentence to German.
  ### English:
  {english}
  ### German:
  """

In [65]:
def convert_to_instruction_format_test(data_point):
  return {
      "text": format_instruction_test(data_point["English"])
  }


In [66]:
output = convert_to_instruction_format_test(ds["test"][0])
print(output["text"])



  ### Instruction:
  You are a translation assistant. Translate the following English sentence to German.
  ### English:
  It is to the rapporteur' s credit that he constantly draws attention to this, and not just in times such as these.
  ### German:
  


In [67]:
#generate Predections from the vanila model
def predict(model, ds):
  predictions = []
  for dp in tqdm(ds):
    prompt = convert_to_instruction_format_test(dp)["text"]
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    inputs = inputs.to("cuda")

    output_tokens = model.generate(inputs["input_ids"],
                                  max_new_tokens=50,
                                  pad_token_id=tokenizer.eos_token_id,
                                   temperature=0.7,
                                   do_sample=True)[0]
    output = tokenizer.decode(output_tokens, skip_special_tokens=True)
    res = output.replace(prompt,"")
    predictions.append(res)
  return predictions


In [68]:
predictions= predict(model, test)

100%|██████████| 300/300 [08:25<00:00,  1.69s/it]


In [69]:
metric.compute(predictions=predictions, references=references)

{'score': 8.877162257177117,
 'counts': [3213, 1074, 441, 201],
 'totals': [7944, 7644, 7344, 7045],
 'precisions': [40.44561933534743,
  14.050235478806908,
  6.004901960784314,
  2.8530872959545777],
 'bp': 0.8936760915323725,
 'sys_len': 7944,
 'ref_len': 8837}

**A score of <10 is very low, so now we try to Fine tune**

# ***Fine Tuning***

In [81]:
def format_instruction_train(english, german):
  return f"""
  ### Instruction:
  You are a translation assistant. Translate the following English sentence to German.
  ### Input:
  {english.strip()}
  ### German:
  {german.strip()}
  """


In [82]:
def convert_to_instruction_format_train(data_point):
  return {
      "text": format_instruction_train(data_point["English"], data_point["German"])
  }


In [83]:
#pre process each row of the dataset
def process_dataset(data):
  return data.map(
      convert_to_instruction_format_train
  ).remove_columns(["German","English"])

In [84]:
train_data = process_dataset(train.shuffle(seed=42))
validatio_data = process_dataset(val)

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [85]:
print(train_data[0]["text"])


  ### Instruction:
  You are a translation assistant. Translate the following English sentence to German.
  ### Input:
  The next item is the final draft agenda as drawn up by the Conference of Presidents at its meeting of Tuesday 11 January.
  ### German:
  Nach der Tagesordnung folgt die Prüfung des endgültigen Entwurfs der Tagesordnung, wie er von der Konferenz der Präsidenten in der Sitzung am 11. Januar erstellt wurde.
  


# **PEFT Setup**

In [86]:
print(model)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): GemmaRMSNorm((2048,), 