# Headers and Installs

In [None]:
#!pip install transformers datasets peft accelerate
!pip install bitsandbytes datasets trl

In [None]:
# Mount Google Drive into the Colab runtime; the dataset CSV is read from
# this mount point further down.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Finetuning the Model

**Loading the Model and Tokenizer**

In [None]:
# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format, DataCollatorForCompletionOnlyLM
import torch

# Checkpoint that gets fine-tuned; the tokenizer is loaded from the same id.
model_name = "aisingapore/gemma2-9b-cpt-sea-lionv3-instruct"

# Quantization settings: 4-bit NF4 weights with bfloat16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the quantized weights; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

**Loading the Dataset**

In [None]:
# Location of the parallel TweetTaglish CSV on the mounted Drive.
file_path = "/content/drive/MyDrive/TweetTaglish/TweetTaglish-parallel.csv"

# Read the whole CSV as a single "train" split.
csv_files = {"train": file_path}
raw_dataset = load_dataset("csv", split="train", data_files=csv_files)

In [None]:
# Add prompt to finetuning data so the model understands what it's looking at
def reformat(example):
  """Turn one dataset row into a conversational prompt/completion pair.

  Args:
    example: mapping with an ``input_text`` entry (the text to translate) and
      a ``target_text`` entry (the expected translation).

  Returns:
    A new dict with two keys: ``prompt``, a one-element list holding the user
    message (translation instruction followed by the input text), and
    ``completion``, a one-element list holding the assistant message with the
    target text. The incoming mapping is not modified.
  """
  instruction = f"Translate the following Tweet from English to Tagalog-English code-switching:\n {example['input_text']}"
  user_turn = {"role": "user", "content": instruction}
  assistant_turn = {"role": "assistant", "content": example['target_text']}
  return {"prompt": [user_turn], "completion": [assistant_turn]}

In [None]:
# Convert every row to the prompt/completion chat format and drop the original
# CSV columns. The result is written back to `raw_dataset`, so a second run of
# this cell would call `reformat` on rows that no longer carry
# `input_text`/`target_text` and fail with a KeyError. Skip the map when the
# dataset already has exactly the converted columns, keeping the cell re-runnable.
already_reformatted = set(raw_dataset.column_names) == {"prompt", "completion"}
if not already_reformatted:
    raw_dataset = raw_dataset.map(reformat, remove_columns=raw_dataset.column_names)
raw_dataset

**LoRA Config**

In [None]:
from peft import LoraConfig

# LoRA hyperparameters, kept as named values so they are easy to tune.
rank_dimension = 16   # r: rank of the LoRA update matrices
lora_alpha = 8        # scaling factor for the LoRA layers (here half the rank, not the common 2x-rank choice)
lora_dropout = 0.05   # dropout probability applied inside the LoRA layers

lora_settings = {
    "r": rank_dimension,
    "lora_alpha": lora_alpha,
    "lora_dropout": lora_dropout,
    # "none": bias terms are left out of the trainable parameters.
    "bias": "none",
    # Attach adapters to every linear layer rather than a hand-picked list.
    "target_modules": "all-linear",
    # The base model is a causal language model.
    "task_type": "CAUSAL_LM",
}
peft_config = LoraConfig(**lora_settings)

In [None]:
# Training configuration
# Hyperparameters based on QLoRA paper recommendations

# Sequence handling: cap on sequence length, no packing of several examples.
sequence_opts = {"max_seq_length": 4096, "packing": False}

# Duration and batching: 4 examples per device x 4 accumulation steps.
batching_opts = {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
}

# Memory and precision: gradient checkpointing plus bfloat16 training.
memory_opts = {"gradient_checkpointing": True, "bf16": True}

# Optimizer and schedule: fused AdamW, short warmup, then a constant rate.
optimizer_opts = {
    "optim": "adamw_torch_fused",
    "learning_rate": 2e-4,
    "max_grad_norm": 0.3,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "constant",
}

# Output, logging and integrations: a checkpoint per epoch in output_dir,
# metrics every 10 steps, no Hub push and no external reporting.
output_opts = {
    "output_dir": "./lora-sealion-finetuned",
    "logging_steps": 10,
    "save_strategy": "epoch",
    "push_to_hub": False,
    "report_to": "none",
}

args = SFTConfig(
    **sequence_opts,
    **batching_opts,
    **memory_opts,
    **optimizer_opts,
    **output_opts,
)

In [None]:
# Build the SFTTrainer from the loaded model, the training arguments, the
# reformatted dataset, the LoRA configuration and the tokenizer.
trainer_kwargs = {
    "model": model,
    "args": args,
    "train_dataset": raw_dataset,
    "peft_config": peft_config,
    "processing_class": tokenizer,
}
trainer = SFTTrainer(**trainer_kwargs)

In [None]:
# Sanity check: print one tokenized training example position by position
# (token, label id, attention-mask value, label token) before training starts.
train_dataloader = trainer.get_train_dataloader()

index = 0  # which example of the first batch to inspect
batch_data = next(iter(train_dataloader))  # a single batch is enough

# Convert the per-example tensors to plain Python lists so the printout shows
# bare integers instead of tensor reprs.
input_ids = batch_data['input_ids'][index].tolist()
attention_mask = batch_data['attention_mask'][index].tolist()
label_ids = batch_data['labels'][index].tolist()

tokens = tokenizer.convert_ids_to_tokens(input_ids)
decoded = tokenizer.decode(input_ids, skip_special_tokens=False)

print("Tokens:")
for i, (token, attn, label_id) in enumerate(zip(tokens, attention_mask, label_ids)):
    # A label of -100 is reported as 'IGN' (ignored); any other label id is
    # mapped back to its token string.
    if label_id != -100:
        label_token = tokenizer.convert_ids_to_tokens(label_id)
    else:
        label_token = 'IGN'

    print(f"{i:2d}: {token:12s} | Label_id: {label_id} | Attention: {attn} | Label: {label_token}")

print("\nDecoded sentence:")
print(decoded)

In [None]:
# Start fine-tuning with the SFTTrainer instance created earlier in the notebook.
trainer.train()

In [None]:
from peft import PeftModel

# After training with SFTTrainer: save the trained adapter to a local folder.
ADAPTER_DIR = "lora-sealion-finetuned-1"
trainer.model.save_pretrained(ADAPTER_DIR)

# Merge and save full weights: reload the base checkpoint without quantization,
# attach the saved adapter, then fold the adapter into the base weights.
# bfloat16 is used instead of float16 so the merged weights share the dtype the
# adapter was trained with (bf16 compute dtype and bf16=True in the training
# arguments); float16 is a known source of numerical problems for Gemma 2.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,  # same checkpoint id that was fine-tuned
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
merged_model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
merged_model = merged_model.merge_and_unload()

## Save model to Hugging Face

In [None]:
# Save model to Hugging Face
import os
from getpass import getpass

from huggingface_hub import login

# Never hard-code the access token in the notebook. Read it from the HF_TOKEN
# environment variable, or fall back to a hidden prompt that blocks until the
# token has been entered, so the uploads below only run once authenticated.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token (write scope): ")
login(token=hf_token)

# Model and tokenizer go to the same Hub repository.
HUB_REPO_ID = "charlottepuopolo/sealion-3v-9b-it-taglish"
merged_model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)

# Inference

**inference**

In [None]:
import gc

import torch

# torch.cuda.empty_cache() only hands back cached blocks that no live tensor
# occupies. The training-time objects still hold the model weights, so drop
# those references first, collect garbage, and only then empty the cache.
# NOTE(review): this removes the names below from the notebook namespace;
# re-run the training/merge cells if they are needed again.
for _name in ("trainer", "model", "base_model", "merged_model"):
    globals().pop(_name, None)  # tolerate names that were never defined
gc.collect()
torch.cuda.empty_cache()

In [None]:
from transformers import pipeline
# Load the fine-tuned model from the Hub for generation. The dtype and device
# placement are given explicitly so the 9B checkpoint is loaded in bfloat16 and
# dispatched across the available accelerator(s) instead of relying on defaults.
pipe = pipeline(
    "text-generation",
    model="charlottepuopolo/sealion-3v-9b-it-taglish",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Build the request with the same instruction template as the training prompts
# produced by `reformat` -- including the single space after the newline -- so
# the model sees the exact prompt format it was fine-tuned on.
PROMPT_TEMPLATE = "Translate the following Tweet from English to Tagalog-English code-switching:\n {tweet}"
tweet = "Hey How are you? Today has been crazy omg"

messages = [
    {"role": "user", "content": PROMPT_TEMPLATE.format(tweet=tweet)},
]
pipe(messages)
