# Setup: Installs and Google Drive Mount

In [None]:
#!pip install transformers datasets peft accelerate
!pip install bitsandbytes datasets trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12

In [None]:
# Mount Google Drive so files under /content/drive (the training CSV) are readable
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# Finetuning the Model

**Loading the Model and Tokenizer**

In [None]:
# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format, DataCollatorForCompletionOnlyLM
import torch

# Load the model and tokenizer
model_name = "aisingapore/gemma2-9b-cpt-sea-lionv3-instruct"

# 4-bit NF4 quantization with bfloat16 compute (QLoRA-style setup) so the
# 9B base model can be fine-tuned with LoRA adapters on a single GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    quantization_config=bnb_config,
    device_map="auto",
    # Transformers strongly recommends eager attention (not the default sdpa)
    # when training Gemma2 models; without this, trainer.train() warns about it.
    attn_implementation="eager",
)

# Tokenizer from the same checkpoint; it carries the chat template applied by SFTTrainer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/37.2k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.57G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.7k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

**Loading the Dataset**

In [None]:
# Location of the TweetTaglish CSV on the mounted Drive
file_path = "/content/drive/MyDrive/TweetTaglish/TweetTaglish-parallel.csv"

# Read the whole CSV as a single "train" split
raw_dataset = load_dataset(
    path="csv",
    data_files={"train": file_path},
    split="train",
)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Wrap each row in an instruction so the model knows which task it is being trained on
def reformat(example):
  """Turn one dataset row into a conversational prompt/completion pair.

  Args:
    example: mapping with an 'input_text' entry (the English tweet) and a
      'target_text' entry (its Tagalog-English code-switched version).

  Returns:
    A new dict with two keys:
      "prompt": a one-element list holding the user message (instruction,
        newline, one space, then the input text);
      "completion": a one-element list holding the assistant message with
        the target text.
    The incoming mapping is not modified.
  """
  user_message = {
      "role": "user",
      "content": f"Translate the following Tweet from English to Tagalog-English code-switching:\n {example['input_text']}",
  }
  assistant_message = {"role": "assistant", "content": example['target_text']}

  return {"prompt": [user_message], "completion": [assistant_message]}

In [None]:
# Convert every row to the prompt/completion chat format and drop the original
# CSV columns. `raw_dataset` is overwritten, so the conversion is skipped when
# the dataset has already been converted; this keeps the cell safe to re-run
# (a second pass would otherwise fail because 'input_text' no longer exists).
if "prompt" not in raw_dataset.column_names:
    raw_dataset = raw_dataset.map(reformat, remove_columns=raw_dataset.column_names)
raw_dataset

Map:   0%|          | 0/3010 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 3010
})

**LoRA Config**

In [None]:
from peft import LoraConfig

# LoRA adapter hyperparameters used for fine-tuning.
# r: rank dimension for LoRA update matrices (smaller = fewer trainable parameters)
rank_dimension = 16
# lora_alpha: scaling factor for LoRA layers (higher = stronger adaptation).
# NOTE(review): alpha is 8 with rank 16, i.e. half the rank rather than the
# commonly quoted 2x rank - confirm this is intentional.
lora_alpha = 8
# lora_dropout: dropout probability for LoRA layers (helps prevent overfitting)
lora_dropout = 0.05

peft_config = LoraConfig(
    r=rank_dimension,  # Rank dimension - typically between 4-32
    lora_alpha=lora_alpha,  # LoRA scaling factor (here 8, i.e. 0.5x the rank of 16)
    lora_dropout=lora_dropout,  # Dropout probability for LoRA layers
    bias="none",  # Bias type for LoRA: "none" trains no bias terms (per PEFT docs, only "all"/"lora_only" update biases)
    target_modules="all-linear",  # Which modules to apply LoRA to ("all-linear" shortcut)
    task_type="CAUSAL_LM",  # Task type for model architecture
)

In [None]:
# Training configuration
# Hyperparameters based on QLoRA paper recommendations
args = SFTConfig(
    # Sequence handling
    # `max_length` is the current name of this option; the older
    # `max_seq_length` spelling is deprecated in TRL and rejected by newer releases.
    max_length=4096,  # Maximum number of tokens kept per example
    packing=False,  # Keep one example per sequence (no sample packing)

    # Output settings
    output_dir="./lora-sealion-finetuned",  # Directory to save model checkpoints

    # Training duration
    num_train_epochs=3,  # Number of training epochs

    # Batch size settings
    per_device_train_batch_size=4,  # Batch size per GPU
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device

    # Memory optimization
    gradient_checkpointing=True,  # Trade compute for memory savings

    # Optimizer settings
    optim="adamw_torch_fused",  # Use fused AdamW for efficiency
    learning_rate=2e-4,  # Learning rate (QLoRA paper)
    max_grad_norm=0.3,  # Gradient clipping threshold

    # Learning rate schedule
    warmup_ratio=0.03,  # Portion of steps for warmup
    lr_scheduler_type="constant",  # Constant learning rate schedule

    # Logging and saving
    logging_steps=10,  # Log metrics every N steps
    save_strategy="epoch",  # Save checkpoint every epoch

    # Precision settings
    bf16=True,  # Use bfloat16 precision

    # Integration settings
    push_to_hub=False,  # Don't push to HuggingFace Hub
    report_to="none",  # Disable external logging
)

In [None]:
# Build the supervised fine-tuning trainer: the quantized base model gets LoRA
# adapters from `peft_config`, and the prompt/completion dataset is tokenized
# with the model's own tokenizer.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=raw_dataset,
    args=args,
    peft_config=peft_config,
)



Converting train dataset to ChatML:   0%|          | 0/3010 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Sanity check: print one tokenized training example token by token, together
# with its label id and attention mask, to see which positions are excluded
# from the loss (label id -100, shown as 'IGN').
train_dataloader = trainer.get_train_dataloader()

index = 0  # position of the example to inspect inside the batch
for batch_data in train_dataloader:
    input_ids, attention_mask, label_ids = (
        batch_data[field][index]
        for field in ('input_ids', 'attention_mask', 'labels')
    )

    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    decoded = tokenizer.decode(input_ids, skip_special_tokens=False)

    print("Tokens:")
    for i, (token, attn, label_id) in enumerate(zip(tokens, attention_mask, label_ids)):
        label_token = (
            'IGN'
            if label_id == -100
            else tokenizer.convert_ids_to_tokens([label_id])[0]
        )
        print(f"{i:2d}: {token:12s} | Label_id: {label_id} | Attention: {attn} | Label: {label_token}")

    print("\nDecoded sentence:")
    print(decoded)
    # Only the first batch is needed for this check
    break

Tokens:
 0: <bos>        | Label_id: -100 | Attention: 1 | Label: IGN
 1: <start_of_turn> | Label_id: -100 | Attention: 1 | Label: IGN
 2: user         | Label_id: -100 | Attention: 1 | Label: IGN
 3: 
            | Label_id: -100 | Attention: 1 | Label: IGN
 4: Translate    | Label_id: -100 | Attention: 1 | Label: IGN
 5: ▁the         | Label_id: -100 | Attention: 1 | Label: IGN
 6: ▁following   | Label_id: -100 | Attention: 1 | Label: IGN
 7: ▁Tweet       | Label_id: -100 | Attention: 1 | Label: IGN
 8: ▁from        | Label_id: -100 | Attention: 1 | Label: IGN
 9: ▁English     | Label_id: -100 | Attention: 1 | Label: IGN
10: ▁to          | Label_id: -100 | Attention: 1 | Label: IGN
11: ▁Tag         | Label_id: -100 | Attention: 1 | Label: IGN
12: alog         | Label_id: -100 | Attention: 1 | Label: IGN
13: -            | Label_id: -100 | Attention: 1 | Label: IGN
14: English      | Label_id: -100 | Attention: 1 | Label: IGN
15: ▁code        | Label_id: -100 | Attention: 1 | Label: I

In [None]:
# Run LoRA fine-tuning with the settings in `args`. The returned TrainOutput
# (global step, mean training loss, runtime metrics) is shown as the cell result.
trainer.train()

It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.4472
20,1.9221
30,1.807
40,1.7645
50,1.6889
60,1.7456
70,1.8722
80,1.6516
90,1.6092
100,1.7677


TrainOutput(global_step=564, training_loss=1.286307155240512, metrics={'train_runtime': 1673.9018, 'train_samples_per_second': 5.395, 'train_steps_per_second': 0.337, 'total_flos': 5.466406524046541e+16, 'train_loss': 1.286307155240512})

In [None]:
from peft import PeftModel

# After training with SFTTrainer: save only the LoRA adapter weights
trainer.model.save_pretrained("lora-sealion-finetuned-1")

# Reload the un-quantized base model and merge the adapter into it.
# `model_name` keeps the base checkpoint identical to the one that was fine-tuned.
# bfloat16 matches the precision used during training (bnb_4bit_compute_dtype
# and bf16=True); float16 has a narrower exponent range and is a known source
# of overflow for Gemma2 checkpoints.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
merged_model = PeftModel.from_pretrained(base_model, "lora-sealion-finetuned-1")
# Fold the LoRA deltas into the base weights; the result is a plain
# transformers model that no longer needs peft at inference time.
merged_model = merged_model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [None]:
# Save model to Hugging Face
import os

from huggingface_hub import login

# Never paste an access token into the notebook. Take it from the HF_TOKEN
# environment variable when set; with token=None, login() prompts for it instead.
login(token=os.environ.get("HF_TOKEN"))

# Target repository for both the merged weights and the tokenizer
hub_repo_id = "charlottepuopolo/sealion-3v-9b-it-taglish"

merged_model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]



Saving checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/charlotte657/lora-sealion-finetuned-1/commit/c386e240c3b48df51153db6d431a89fc466ab91e', commit_message='Upload tokenizer', commit_description='', oid='c386e240c3b48df51153db6d431a89fc466ab91e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/charlotte657/lora-sealion-finetuned-1', endpoint='https://huggingface.co', repo_type='model', repo_id='charlotte657/lora-sealion-finetuned-1'), pr_revision=None, pr_num=None)

# Inference

**Loading the Fine-Tuned Model and Generating a Translation**

In [None]:
import gc

import torch

# empty_cache() can only return GPU memory that no live object still references.
# Drop the training-time objects first (silently skipping any that are not
# defined, e.g. after a runtime restart), collect them, then release the cache.
for _stale_name in ("trainer", "model", "base_model", "merged_model"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

In [None]:
from transformers import pipeline

# Load the merged checkpoint from the Hub for chat-style generation.
# Without an explicit dtype the weights are loaded in the float32 default,
# doubling the memory footprint of this 9B model; bfloat16 matches the
# precision used for training.
pipe = pipeline(
    "text-generation",
    model="charlottepuopolo/sealion-3v-9b-it-taglish",
    torch_dtype=torch.bfloat16,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.8k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Tweet to translate
tweet = "Hey How are you? Today has been crazy omg"

# The prompt must mirror the layout used at training time by `reformat`:
# instruction, newline, ONE space, then the tweet. The previous prompt omitted
# the space, so the model saw a slightly different format than it was trained on.
messages = [
    {"role": "user", "content": f"Translate the following Tweet from English to Tagalog-English code-switching:\n {tweet}"},
]
pipe(messages)


[{'generated_text': [{'role': 'user',
    'content': 'Translate the following Tweet from English to Tagalog-English code-switching:\nHey How are you? Today has been crazy omg'},
   {'role': 'assistant',
    'content': 'hoy kamusta ka? grabe talaga today omg'}]}]