In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
#Run in quiet mode and as a shell command
!pip install -q transformers
!pip install -q torch
!pip install -q evaluate
!pip install -U datasets
!pip install -q sacrebleu
!pip install -q peft                                   # Parameter-efficient Fine-tuning from HuggingFace
!pip install -q trl                                    # For supervised fine-tuning for LLMs from HuggingFace
!pip install -q accelerate
!pip install -q bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from huggingface_hub import login
from google.colab import userdata
HF_TOKEN=userdata.get('HF_TOKEN')

if HF_TOKEN:
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    print("Token is not set. Please save the token first.")

Successfully logged in to Hugging Face!


# **Model Quantization**

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, BitsAndBytesConfig
import torch

torch.cuda.empty_cache()


quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
    )

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it",)
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", quantization_config=quant_config)
model = model.to("cuda")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [7]:
gbs = model.get_memory_footprint() / 1e9
print(f"Number of parameters: {model.num_parameters()}")
print(f"Memory footprint in float 32 : {model.num_parameters() * 4 / 1e9} GB") # 32 bit / 8bytes
print(f"Memory footprint after quantization in 4 bit : {gbs:.2f}GB")

Number of parameters: 2506172416
Memory footprint in float 32 : 10.024689664 GB
Memory footprint after quantization in 4 bit : 2.04GB


## **Loading the Dataset**

In [8]:
from datasets import load_dataset, Dataset
from tqdm import tqdm

dataset = load_dataset("Darth-Vaderr/English-German", split="train", streaming = True)

samples = []
for i, example in tqdm(enumerate(dataset)):
    samples.append(example)
    if i >= 1499:
        break


ds = Dataset.from_list(samples)
ds = ds.train_test_split(train_size=0.8)

# Test split
test = ds["test"]

# Split the train again
ds = ds["train"].train_test_split(train_size=0.8)

# train and val split
train = ds["train"]
val = ds["test"]

README.md: 0.00B [00:00, ?B/s]

1499it [00:00, 1925.05it/s]


In [None]:
print(train)
print(test)
print(val)

Dataset({
    features: ['German', 'English'],
    num_rows: 960
})
Dataset({
    features: ['German', 'English'],
    num_rows: 300
})
Dataset({
    features: ['German', 'English'],
    num_rows: 240
})


## **Testing Vanilla Model**

In [None]:
# Test the model for translation without fine tuning.
english = test[3]["English"]

prompt = f"""
### Instruction:
You are a translation assistant. Translate the following English sentence to German.
### English:
{english}
### German:
"""

print(prompt)


### Instruction:
You are a translation assistant. Translate the following English sentence to German.
### English:
I see the European Central Bank as Caesar's wife and she is respectable, but she must make more of an effort to show the outside world that she is respectable.
### German:



In [None]:
from transformers import pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
                max_new_tokens = 50,
                do_sample = True,eos_token_id=tokenizer.eos_token_id,
                # device=0,
                return_full_text=False,
                num_return_sequences=1)
output = pipe(prompt)[0]["generated_text"]
res = output.replace(prompt,"")
print(res)

Device set to use cuda:0


Ich sehe die Europäische Zentralbank als die Frau Caesars und sie ist respektabel, aber sie muss mehr tun, um außerhalb der eigenen Augen zu zeigen, dass sie respektabel ist.
###


# **Evalute Score before fine tuning**

In [None]:
# Load SacreBleu

import evaluate
metric = evaluate.load("sacrebleu")


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# test
predictions = ["Hallo, ich bin sankalp"]
references = [["Hallo, ich bin sankalp", "Hallo, ich heiße sankalp"]]
metric.compute(predictions=predictions, references=references)

{'score': 100.00000000000004,
 'counts': [5, 4, 3, 2],
 'totals': [5, 4, 3, 2],
 'precisions': [100.0, 100.0, 100.0, 100.0],
 'bp': 1.0,
 'sys_len': 5,
 'ref_len': 5}

In [None]:
# Generate References
references = [[dp["German"]] for dp in test]

In [None]:
def format_instruction_test(english):
  return f"""
  ### Instruction:
  You are a translation assistant. Translate the following English sentence to German.
  ### English:
  {english}
  ### German:
  """

In [None]:
def convert_to_instruction_format_test(data_point):
  return {
      "text": format_instruction_test(data_point["English"])
  }


In [None]:
output = convert_to_instruction_format_test(ds["test"][0])
print(output["text"])



  ### Instruction:
  You are a translation assistant. Translate the following English sentence to German.
  ### English:
  It is worth recalling the objective of the UNESCO Convention, which is to protect heritage and not to enter it on a heritage list.
  ### German:
  


In [None]:
#generate Predections from the vanila model
def predict(model, ds):
  predictions = []
  for dp in tqdm(ds):
    prompt = convert_to_instruction_format_test(dp)["text"]
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    inputs = inputs.to("cuda")

    output_tokens = model.generate(inputs["input_ids"],
                                  max_new_tokens=50,
                                  pad_token_id=tokenizer.eos_token_id,
                                   temperature=0.7,
                                   do_sample=True)[0]
    output = tokenizer.decode(output_tokens, skip_special_tokens=True)
    res = output.replace(prompt,"")
    predictions.append(res)
  return predictions


In [None]:
pre_ft_predictions= predict(model, test)

  0%|          | 0/300 [00:00<?, ?it/s]


NameError: name 'convert_to_instruction_format_test' is not defined

In [None]:
metric.compute(predictions=predictions, references=references)

**A score of <10 is very low, so now we try to Fine tune**

# ***Fine Tuning***

In [None]:
def format_instruction_train(english, german):
  return f"""
  ### Instruction:
  You are a translation assistant. Translate the following English sentence to German.
  ### Input:
  {english.strip()}
  ### German:
  {german.strip()}
  """


In [None]:
def convert_to_instruction_format_train(data_point):
  return {
      "text": format_instruction_train(data_point["English"], data_point["German"])
  }


In [None]:
#pre process each row of the dataset
def process_dataset(data):
  return data.map(
      convert_to_instruction_format_train
  ).remove_columns(["German","English"])

In [None]:
train_data = process_dataset(train.shuffle(seed=42))
validation_data = process_dataset(val)

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [None]:
print(train_data[0]["text"])


  ### Instruction:
  You are a translation assistant. Translate the following English sentence to German.
  ### Input:
  So I fail to see what there is to criticise about something which is within the bounds of both what is acceptable and what has been planned.
  ### German:
  Folglich verstehe ich nicht, warum hier eine Regelung angegriffen wird, die sich im Rahmen des Annehmbaren und Vorhersehbaren bewegt.
  


# **PEFT Setup**

In [None]:
from peft import LoraConfig, TaskType,get_peft_model,prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
# Get the model with unfrozen LoRA layers applied
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,686,400 || all params: 2,509,858,816 || trainable%: 0.1469


# **Fine Tuning**

In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Set up the FSDP config. To enable FSDP via SPMD, set xla_fsdp_v2 to True.
fsdp_config = {
    "fsdp_transformer_layer_cls_to_wrap": ["GemmaDecoderLayer"],
    "xla": True,
    "xla_fsdp_v2": True,
    "xla_fsdp_grad_ckpt": True
}


In [6]:
from trl import SFTConfig

# Set up the training hyperparameters
training_arguments = SFTConfig(
    fp16=True,                           # Use 16-bit precision for training computations (optimizer states, gradients)
    dataset_text_field="text",           # Specify the text field in the dataset for training
    max_seq_length=256,                 # Set the maximum sequence length for the training data (Reduced)

    # Batch-related parameters
    per_device_train_batch_size=4,       # Batch size per device during training (Reduced)
    gradient_accumulation_steps=2,       # Accumulate gradients over 2 steps

    # Optimizer-related parameters
    optim="paged_adamw_32bit",           # Use the paged AdamW optimizer, optimized for 32-bit GPUs
    learning_rate=1e-4,                  # Set the learning rate for training

    # Epochs and saving configuration
    num_train_epochs=2,                  # Number of training epochs (more epochs generally lead to better results)
    save_strategy="epoch",               # Save the model after each epoch
    output_dir="./epoch-finetuned",      # Directory to save the fine-tuned model

    # Validation-related parameters
    eval_strategy="steps",               # Evaluation strategy, performed at specified steps
    eval_steps=0.2,                      # Evaluate after 20% of the training steps

    # Logging-related parameters
    report_to="none",                    # Disable reporting to external tools
    logging_dir="./logs",                # Directory to save the training logs
    logging_steps=20,                    # Number of steps between each log entry
    seed=42,                             # Set a random seed for reproducibility
)

# Enable gradient checkpointing to save memory and recompute during backpropagation
model.gradient_checkpointing_enable()

# Disable attention cache during training; it should be enabled during inference
model.config.use_cache = False

In [None]:
from trl import SFTTrainer

# Initialize the trainer
trainer = SFTTrainer(
    # Assign the model and tokenizer
    model=model,
    processing_class = tokenizer,

    # Provide the training and validation datasets
    train_dataset=train_data,
    eval_dataset=validation_data,

    # Pass the LoRA configuration
    peft_config=lora_config,

    # Set the training hyperparameters
    args=training_arguments,
    dataloader_drop_last = True,  # Required for SPMD.
    fsdp="full_shard",
    fsdp_config=fsdp_config,
)

Adding EOS to train dataset:   0%|          | 0/960 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/960 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/960 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/240 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/240 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/240 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

Step,Training Loss,Validation Loss
48,1.8013,1.725557
96,1.6944,1.65305
144,1.5516,1.626208
192,1.499,1.61067
240,1.5567,1.603283


TrainOutput(global_step=240, training_loss=1.7081714630126954, metrics={'train_runtime': 540.9676, 'train_samples_per_second': 3.549, 'train_steps_per_second': 0.444, 'total_flos': 3367543988502528.0, 'train_loss': 1.7081714630126954})

In [None]:
# Define the save path for the fine-tuned model on Colab
peft_model_path = "./fine-tuned-gemma"

# Save the trained model
trainer.model.save_pretrained(peft_model_path)

# Save the tokenizer
tokenizer.save_pretrained(peft_model_path)

# List the saved files
!ls -lh {peft_model_path}


total 51M
-rw-r--r-- 1 root root  851 Jul 11 14:28 adapter_config.json
-rw-r--r-- 1 root root  15M Jul 11 14:28 adapter_model.safetensors
-rw-r--r-- 1 root root  591 Jul 11 14:28 chat_template.jinja
-rw-r--r-- 1 root root 5.1K Jul 11 14:28 README.md
-rw-r--r-- 1 root root  636 Jul 11 14:28 special_tokens_map.json
-rw-r--r-- 1 root root  40K Jul 11 14:28 tokenizer_config.json
-rw-r--r-- 1 root root  33M Jul 11 14:28 tokenizer.json
-rw-r--r-- 1 root root 4.1M Jul 11 14:28 tokenizer.model


# **Test BLEU Score with Fine Trained model**

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Load the fine-tuned model
peft_model_path = "./fine-tuned-gemma"
tuned_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
    quantization_config=quant_config  # Load with 4-bit quantization
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Specify that padding should be added to the right side of the sequences
tokenizer.padding_side = "right"

# Enable attention cache during inference
tuned_model.config.use_cache = True

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
post_ft_predictions = predict(tuned_model, test)

  0%|          | 0/300 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 300/300 [13:07<00:00,  2.62s/it]


In [None]:
test[0]

{'German': 'Herr Präsident! Ich möchte ein paar Worte zum ausgezeichneten Bericht des Kollegen Staes verlieren.',
 'English': 'Mr President, I would like to say a few words concerning the excellent report by Mr Staes.'}

In [None]:
predictions[0]

'Herr Präsident! Ich möchte sich auch einige Worte über den gutmütigen Bericht von Herrn Staes machen.\n  '

In [None]:
metric.compute(predictions=predictions, references=references)

{'score': 8.333167176648319,
 'counts': [2759, 838, 369, 180],
 'totals': [7231, 6931, 6631, 6332],
 'precisions': [38.15516526068317,
  12.090607415957294,
  5.5647715276730505,
  2.8427037271004423],
 'bp': 0.9016022681360983,
 'sys_len': 7231,
 'ref_len': 7980}