## Install Deps

In [1]:
!pip install torch==2.4.1 transformers==4.45.1 bitsandbytes==0.44.1 peft==0.13.0 datasets trl



## Import Packages and Data

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    TrainerCallback,
)
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from trl import SFTTrainer
from tqdm import tqdm
import datasets
import os
import json
from typing import Dict

In [3]:
# Function to clear GPU memory
def clear_gpu_memory():
    # Delete any large variables (replace 'variable' with your variable names if applicable)
    # del variable_name  # Uncomment and replace variable_name with any large variable you want to delete

    # Clear PyTorch's cache
    torch.cuda.empty_cache()

    # Optionally, you can print the GPU memory to confirm clearance
    print(f"Memory Allocated: {torch.cuda.memory_allocated()} bytes")
    print(f"Memory Reserved: {torch.cuda.memory_reserved()} bytes")

# Usage
clear_gpu_memory()

Memory Allocated: 0 bytes
Memory Reserved: 0 bytes


In [46]:
!rm -rf ./csie5431-applied-data-learning
!git clone https://github.com/ruby0322/csie5431-applied-data-learning.git
!mv ./csie5431-applied-data-learning/hw3/preprocess.py ./

Cloning into 'csie5431-applied-data-learning'...
remote: Enumerating objects: 346, done.[K
remote: Counting objects: 100% (139/139), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 346 (delta 46), reused 121 (delta 31), pack-reused 207 (from 1)[K
Receiving objects: 100% (346/346), 4.23 MiB | 16.98 MiB/s, done.
Resolving deltas: 100% (140/140), done.


In [5]:
!gdown 1X04jwetkzUnlAtX0W4exMQU1x1Us-yfc

Downloading...
From: https://drive.google.com/uc?id=1X04jwetkzUnlAtX0W4exMQU1x1Us-yfc
To: /content/hw3.zip
  0% 0.00/1.26M [00:00<?, ?B/s]100% 1.26M/1.26M [00:00<00:00, 103MB/s]


In [6]:
import os

# Check if the `./hw3` directory already exists
if not os.path.exists('./hw3'):
    # Unzip the file if `./hw3` does not exist
    !unzip hw3.zip
    # Move the data directory
    !mv ./hw3/data ./data
else:
    print("Directory './hw3' already exists. Skipping unzip.")

Directory './hw3' already exists. Skipping unzip.


In [7]:
!python preprocess.py ./data/train.json ./data/train_preprocessed.json
!python preprocess.py ./data/public_test.json ./data/public_test_preprocessed.json

Data has been saved to ./data/train_preprocessed.json
Data has been saved to ./data/public_test_preprocessed.json


In [8]:
# preprocessed_train_df = pd.read_json('./data/train_preprocessed.json')
preprocessed_train_df = pd.read_json('./data/train.json')
preprocessed_train_df = preprocessed_train_df[preprocessed_train_df['task'] != '']
# preprocessed_test_df = pd.read_json('./data/public_test_preprocessed.json')
preprocessed_test_df = pd.read_json('./data/public_test.json')

In [9]:
preprocessed_train_df

Unnamed: 0,id,instruction,output,task
0,db63fb72-e211-4596-94a4-69617706f7ef,雅裏惱怒地說： 從前在福山田獵時，你誣陷獵官，現在又說這種話。翻譯成文言文：,雅裏怒曰： 昔畋於福山，卿誣獵官，今復有此言。,翻譯成文言文
1,a48b0e8f-dc7a-4130-acc6-a91cc4a81bd1,沒過十天，鮑泉果然被拘捕。翻譯成文言文：,後未旬，果見囚執。,翻譯成文言文
2,f98882de-6962-46cf-a8f0-5d534eddda3a,辛未，命吳堅為左丞相兼樞密使，常楙參知政事。翻譯成白話文：,初五，命令吳堅為左承相兼樞密使，常增為參知政事。,翻譯成白話文
3,c491b5f1-fe54-4276-8dd2-cbc7a8d0f3c3,十八年，奚、契丹侵犯邊界，以皇上為河北道元帥，信安王為副，率禦史大夫李朝隱、京兆尹裴亻由先等...,十八年，奚、契丹犯塞，以上為河北道元帥，信安王禕為副，帥禦史大夫李朝隱、京兆尹裴伷先等八總管...,翻譯成文言文
4,bc8d68a3-cfe2-42ee-9d99-415380975642,正月，甲子朔，鼕至，太後享通天宮；赦天下，改元。翻譯成白話文：,聖曆元年正月，甲子朔，鼕至，太後在通天宮祭祀；大赦天下，更改年號。,翻譯成白話文
...,...,...,...,...
9995,98e706d5-0a33-4b56-9849-fe970f7dfcb2,所派的官員還未到哈密，脫脫便得暴病去世。翻譯成文言文：,未至，而脫脫以暴疾卒。,翻譯成文言文
9996,9f38308a-63b4-46fb-9262-e5442ab23a6b,李聽命令士兵收起武器在野外駐紮，魏州人纔安定下來。翻譯成文言文：,聽敕士櫜兵野次，魏人乃安。,翻譯成文言文
9997,3d714828-0cce-4806-8af0-11fd863446ae,因此忠貞的臣子，並非不想竭盡忠誠，竭盡忠誠實在太難瞭。翻譯成文言文：,故忠貞之臣，非不欲竭誠。竭誠者，乃是極難。,翻譯成文言文
9998,df77d459-7afe-4b50-81f9-fba03ee94ee2,祿山構逆，承嗣與張忠誌等為前鋒，陷河洛。翻譯成白話文：,安祿山叛亂，田承嗣和張忠誌等擔任先鋒，攻陷河洛。,翻譯成白話文


In [10]:
preprocessed_test_df.iloc[6]['instruction']

'從唐末幽、薊二州割據以來，戍守的軍隊廢置散失，契丹因而得以齣來攻陷平、營二州，而幽、薊二州的人每年深受契丹侵犯掠奪之苦。翻譯成文言文：'

In [11]:
preprocessed_test_df = preprocessed_test_df.iloc[:20]
preprocessed_test_df

Unnamed: 0,id,instruction,output,task
0,2fb7d211-978f-41c8-a3ab-e51d9df06280,於是，廢帝讓瀋慶之的堂侄、直將軍瀋攸之賜瀋慶之毒藥，命瀋慶之自殺。翻譯成文言文：,帝乃使慶之從父兄子直閣將軍攸之賜慶之藥。,翻譯成文言文
1,07f75449-94b9-4c3b-a525-e62cdbf85382,靈鑒忽臨，忻歡交集，乃迴燈拂席以延之。翻譯成白話文：,答案：靈仙忽然光臨，趙旭歡欣交集，於是他就把燈點亮，拂拭乾淨床席來延請仙女。,翻譯成白話文
2,7b7ead70-1353-433f-a59f-7704594cce59,希望您以後留意，不要再齣這樣的事，你的小女兒病就會好。翻譯成文言文：,以後幸長官留意，勿令如此。,翻譯成文言文
3,b8adf597-edb9-46d4-a1a6-074ce9724f07,第二年召迴朝廷，改任著作佐郎，直史館，改任左拾遺。翻譯成文言文：,明年召還，改著作佐郎，直史館，改左拾遺。,翻譯成文言文
4,87945a20-f869-4be9-b586-f5ce20ddd78b,中宗與庶人嘗因正月十五日夜幸其第，賜賚不可勝數。翻譯成白話文：,答案：唐中宗與韋庶人曾經在正月十五日夜到韋安石的宅第，並賜賞給他不可勝數的財物。,翻譯成白話文
5,243f36a1-edbf-4f64-b7e8-4b7414ced101,硃全忠聽後哈哈大笑。翻譯成文言文：,全忠大笑。,翻譯成文言文
6,d11ff6fb-4ff7-44b5-b08a-1dfa98249ec5,從唐末幽、薊二州割據以來，戍守的軍隊廢置散失，契丹因而得以齣來攻陷平、營二州，而幽、薊二州的...,自唐末幽、薊割據，戍兵廢散，契丹因得齣陷平、營，而幽、薊之人歲苦寇鈔。,翻譯成文言文
7,d8d4baf4-c634-453e-83a9-08bb113e6547,建武帝蕭鸞繼位做皇帝，沿襲陳舊的一套做法，當時流行風氣不好文學，輔臣宰相沒有學識，學校雖然設...,建武繼立，因循舊緒，時不好文，輔相無術，學校雖設，前軌難追。劉瓛承馬、鄭之後，一時學徒以為師範。,翻譯成文言文
8,72335b5a-ad07-4255-8a9a-0469ee1dd9f7,契丹主以陽城之戰為彥卿所敗，詰之。彥卿曰： 臣當時惟知為晉主竭力，今日死生惟命。翻譯成白話文：,答案：契丹主因陽城之戰被符彥卿打敗，追問符彥卿，彥卿說： 臣當時隻知為晉主竭盡全力，今日死生...,翻譯成白話文
9,d9de64d9-5c6c-4bfd-8e9c-136e43974319,秦領是都人，從江夏都尉升任南陽太守，上任時經過宜城城內，看見一傢朝東的房子，他停車觀看，說：...,頡，鄀人也，以江夏都尉齣為南陽太守。徑宜城中，見一傢東嚮，頡住車視之，曰：此居處可作塚。,翻譯成文言文


In [12]:
train_dataset = datasets.Dataset.from_pandas(preprocessed_train_df)
test_dataset = datasets.Dataset.from_pandas(preprocessed_test_df)

## Util Functions

In [13]:
MAX_SEQ_LENGTH = 512

In [14]:
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0+PTX"
os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["CUDNN_BENCHMARK"] = "0"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

In [15]:
def get_bnb_config() -> BitsAndBytesConfig:
    """Configure quantization for QLoRA 4-bit training"""
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_fp32_cpu_offload=True  # Enable CPU offloading here
    )

def get_prompt(instruction: str) -> str:
    """Format the instruction into a prompt template"""
    return f"你是一位精通古今中文的翻譯專家。只回覆翻譯結果，不可有多餘說明或解釋。\nUSER: {instruction}\nASSISTANT:"

def calculate_perplexity(model, tokenizer, data, max_length=2048):
    """Calculate perplexity using custom loss function"""
    model.eval()
    data_size = len(data)
    instructions = [get_prompt(x["instruction"]) for x in data]
    outputs = [x["output"] for x in data]

    # Tokenize data
    tokenized_instructions = tokenizer(instructions, add_special_tokens=False)
    tokenized_outputs = tokenizer(outputs, add_special_tokens=False)
    output_masks = []

    # Format data
    for i in range(data_size):
        instruction_input_ids = [tokenizer.bos_token_id] + tokenized_instructions["input_ids"][i]
        output_input_ids = tokenized_outputs["input_ids"][i] + [tokenizer.eos_token_id]
        tokenized_instructions["input_ids"][i] = instruction_input_ids + output_input_ids
        tokenized_instructions["attention_mask"][i] = [1] * len(tokenized_instructions["input_ids"][i])
        output_mask = [0] * len(instruction_input_ids) + [1] * len(output_input_ids)

        tokenized_instructions["input_ids"][i] = torch.tensor(
            tokenized_instructions["input_ids"][i][:max_length]
        ).to(model.device)
        tokenized_instructions["attention_mask"][i] = torch.tensor(
            tokenized_instructions["attention_mask"][i][:max_length]
        ).to(model.device)
        output_mask = torch.tensor(output_mask[:max_length]).to(model.device)
        output_masks.append(output_mask)

    # Calculate ppl
    ppls = []
    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")

    for i in tqdm(range(data_size), desc="Calculating perplexity"):
        input_ids = tokenized_instructions["input_ids"][i].unsqueeze(0)
        attn_mask = tokenized_instructions["attention_mask"][i].unsqueeze(0)
        output_mask = output_masks[i].unsqueeze(0)
        label = input_ids

        with torch.no_grad():
            out_logits = model(input_ids, attention_mask=attn_mask).logits

        shift_logits = out_logits[..., :-1, :].contiguous()
        shift_label = label[..., 1:].contiguous()
        shift_output_mask = output_mask[..., 1:].contiguous()

        perplexity_batch = torch.exp(
            (loss_fct(shift_logits.transpose(1, 2), shift_label) * shift_output_mask).sum(1)
            / shift_output_mask.sum(1)
        )
        ppls += perplexity_batch.tolist()

    return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}

def compute_metrics(eval_pred) -> Dict[str, float]:
    """Compute metrics for trainer evaluation"""
    # We don't use the eval_pred argument since we need the full model
    # for proper perplexity calculation. The actual computation is done
    # in the PerplexityCallback.
    return {}


In [16]:
def create_model_and_tokenizer():
    """Initialize the model and tokenizer with proper configurations"""
    model_name = "zake7749/gemma-2-2b-it-chinese-kyara-dpo"

    # Initialize tokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = 'right'
    tokenizer.pad_token = tokenizer.eos_token

    # Initialize model with quantization
    device_map = { "": torch.cuda.current_device() }
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=get_bnb_config(),
        device_map=device_map,
    )
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA
    lora_config = LoraConfig(
        r=8,  # Rank
        lora_alpha=32,  # Alpha scaling
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Attention modules
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)
    return model, tokenizer

In [17]:
torch.cuda.current_device()

0

## Trainer

In [18]:
class PerplexityCallback(TrainerCallback):
    """Custom callback to log perplexity metrics during training"""
    def __init__(self, eval_dataset, tokenizer):
        self.eval_dataset = eval_dataset
        self.tokenizer = tokenizer
        self.best_ppl = float('inf')
        self.metrics_history = []

    def on_evaluate(self, args, state, control, model, **kwargs):
        """Calculate and log perplexity after each evaluation step"""


        # # Save metrics history
        # with open(os.path.join(args.output_dir, "metrics_history.json"), "w") as f:
        #     json.dump(self.metrics_history, f, indent=2)

class CustomSFTTrainer(SFTTrainer):
    """Custom SFTTrainer with integrated perplexity evaluation"""
    def __init__(self, *args, eval_dataset=None, raw_eval_dataset=None, tokenizer=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_dataset = eval_dataset
        self.raw_eval_dataset = raw_eval_dataset
        self.perplexity_callback = PerplexityCallback(raw_eval_dataset, tokenizer)
        self.add_callback(self.perplexity_callback)
    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        # Call the default evaluation method first
        metrics = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        eval_data = [
            {"instruction": item["instruction"], "output": item["output"]}
            for item in self.raw_eval_dataset
        ]
        results = calculate_perplexity(self.model, self.tokenizer, eval_data)
        mean_ppl = results["mean_perplexity"]
        metrics["perplexity"] = mean_ppl

        self.log({f"{metric_key_prefix}_perplexity": mean_ppl})
        self.log({f"train_perplexity": mean_ppl})
        print(f"perplexity:", mean_ppl)

        return metrics

## Training

In [19]:
model, tokenizer = create_model_and_tokenizer()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Unused kwargs: ['bnb_4bit_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
def format_dataset(examples):
    """Format the dataset entries into prompt-completion pairs"""
    prompts = [get_prompt(ex) for ex in examples["instruction"]]
    texts = [f"{p}{ex}" for p, ex in zip(prompts, examples["output"])]
    tokenized = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors=None  # Return list of dict instead of tensors
    )
    print(tokenized)

    return tokenized

def format_dataset_with_masking(examples):
    """Format the dataset entries into input_ids and labels for fine-tuning."""
    prompts = [get_prompt(ex) for ex in examples["instruction"]]

    instruction_ids = [
        [tokenizer.bos_token_id] + tokenizer(prompt, add_special_tokens=False)["input_ids"]
        for prompt in prompts
    ]
    output_ids = [
        tokenizer(output, add_special_tokens=False)["input_ids"] + [tokenizer.eos_token_id]
        for output in examples["output"]
    ]

    combined_ids = [instr_ids + out_ids for instr_ids, out_ids in zip(instruction_ids, output_ids)]

    # masking for labels
    labels = [
        [-100] * len(instr_ids) + out_ids for instr_ids, out_ids in zip(instruction_ids, output_ids)
    ]
    # padding for labels
    labels = [
        label + [-100] * (MAX_SEQ_LENGTH - len(label)) if len(label) < MAX_SEQ_LENGTH else label[:MAX_SEQ_LENGTH]
        for label in labels
    ]

    return {
        "input_ids": combined_ids,
        "labels": labels
    }

def format_dataset_with_masking_and_attention(examples):
    """
    Format the dataset entries into input_ids and labels for fine-tuning.
    Applies proper masking to ensure loss is only calculated on the output tokens.

    Args:
        examples: Dictionary containing 'instruction' and 'output' keys

    Returns:
        Dictionary with 'input_ids' and 'labels' for training
    """
    # Tokenize instructions with BOS token
    prompts = [get_prompt(ex) for ex in examples["instruction"]]
    instruction_ids = [
        [tokenizer.bos_token_id] + tokenizer(prompt, add_special_tokens=False)["input_ids"]
        for prompt in prompts
    ]

    # Tokenize outputs with EOS token
    output_ids = [
        tokenizer(output, add_special_tokens=False)["input_ids"] + [tokenizer.eos_token_id]
        for output in examples["output"]
    ]

    # Combine input_ids and handle padding
    combined_ids = [instr_ids + out_ids for instr_ids, out_ids in zip(instruction_ids, output_ids)]
    combined_ids = [
        ids + [tokenizer.pad_token_id] * (MAX_SEQ_LENGTH - len(ids))
        if len(ids) < MAX_SEQ_LENGTH
        else ids[:MAX_SEQ_LENGTH]
        for ids in combined_ids
    ]

    # Create labels with masking and padding
    labels = [
        [-100] * len(instr_ids) + out_ids
        for instr_ids, out_ids in zip(instruction_ids, output_ids)
    ]
    labels = [
        label + [-100] * (MAX_SEQ_LENGTH - len(label))
        if len(label) < MAX_SEQ_LENGTH
        else label[:MAX_SEQ_LENGTH]
        for label in labels
    ]

    # Create attention masks
    attention_masks = [
        [1] * len(ids) + [0] * (MAX_SEQ_LENGTH - len(ids))
        if len(ids) < MAX_SEQ_LENGTH
        else [1] * MAX_SEQ_LENGTH
        for ids in combined_ids
    ]

    # Verify lengths match
    for i in range(len(combined_ids)):
        assert len(combined_ids[i]) == len(labels[i]) == len(attention_masks[i]) == MAX_SEQ_LENGTH, \
            f"Length mismatch: input_ids={len(combined_ids[i])}, labels={len(labels[i])}, mask={len(attention_masks[i])}"

    return {
        "input_ids": combined_ids,
        "labels": labels,
        "attention_mask": attention_masks
    }

In [21]:
formatted_train = train_dataset.map(
    format_dataset_with_masking_and_attention,
    batched=True,
    remove_columns=train_dataset.column_names
)
formatted_test = test_dataset.map(
    format_dataset_with_masking_and_attention,
    batched=True,
    remove_columns=test_dataset.column_names
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [22]:
for item in formatted_train:
    print(item)
    print(len(item['input_ids']))
    break

{'input_ids': [2, 235608, 203100, 236329, 235767, 236194, 235811, 50039, 235370, 204032, 237938, 235618, 235362, 235918, 181016, 204032, 23541, 235365, 28319, 61755, 239359, 92267, 236132, 140800, 235362, 108, 14053, 235292, 115009, 237568, 242365, 237861, 235597, 236478, 235465, 184433, 235648, 235473, 236421, 112948, 242051, 235716, 235365, 235608, 246747, 238742, 242051, 236538, 235365, 21017, 236111, 236478, 64785, 235997, 235362, 204032, 235636, 235642, 235904, 235642, 235465, 108, 222412, 6100, 235292, 237363, 237568, 237861, 239249, 235465, 235248, 238505, 252830, 236434, 236421, 235822, 235365, 239284, 246747, 242051, 236538, 235365, 235811, 237417, 235461, 235966, 235904, 235362, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [23]:
for item in test_dataset:
    print(item)
    print(item)
    break
for item in formatted_test:
    print(item)
    break

{'id': '2fb7d211-978f-41c8-a3ab-e51d9df06280', 'instruction': '於是，廢帝讓瀋慶之的堂侄、直將軍瀋攸之賜瀋慶之毒藥，命瀋慶之自殺。翻譯成文言文：', 'output': '帝乃使慶之從父兄子直閣將軍攸之賜慶之藥。', 'task': '翻譯成文言文'}
{'id': '2fb7d211-978f-41c8-a3ab-e51d9df06280', 'instruction': '於是，廢帝讓瀋慶之的堂侄、直將軍瀋攸之賜瀋慶之毒藥，命瀋慶之自殺。翻譯成文言文：', 'output': '帝乃使慶之從父兄子直閣將軍攸之賜慶之藥。', 'task': '翻譯成文言文'}
{'input_ids': [2, 235608, 203100, 236329, 235767, 236194, 235811, 50039, 235370, 204032, 237938, 235618, 235362, 235918, 181016, 204032, 23541, 235365, 28319, 61755, 239359, 92267, 236132, 140800, 235362, 108, 14053, 235292, 235248, 126459, 235365, 240553, 237008, 237296, 247489, 238979, 235653, 235370, 236815, 242698, 235394, 235948, 189038, 247489, 244242, 235653, 241111, 247489, 238979, 235653, 237334, 239051, 235365, 236313, 247489, 238979, 235653, 193424, 235362, 204032, 235636, 235642, 235904, 235642, 235465, 108, 222412, 6100, 235292, 237008, 237991, 235755, 238979, 235653, 237322, 236701, 237525, 235535, 235948, 239530, 189038, 244242, 235653, 241111, 238979, 235653, 

In [24]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return f"Trainable model parameters: {trainable_params}\nAll model parameters: {all_param}\nPercentage of trainable model parameters: {100 * trainable_params / all_param:.2f}%"

print(print_trainable_parameters(model))

Trainable model parameters: 3194880
All model parameters: 1605398784
Percentage of trainable model parameters: 0.20%


In [25]:
output_dir = "chinese_translation_model"

LOG_STEPS = 10

# Create training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=32,
    learning_rate=3e-4,
    max_grad_norm=0.4,
    eval_steps=LOG_STEPS,
    save_steps=50,
    logging_steps=LOG_STEPS,
    fp16=True,
    tf32=True,
    # bp16=True,
    save_strategy="steps",
    evaluation_strategy="steps",
    warmup_ratio=0.03,
    weight_decay=0.02,
    max_steps=400,
    load_best_model_at_end=True,
    optim="paged_adamw_8bit",
    greater_is_better=False,
    gradient_checkpointing=True,
    remove_unused_columns=False,
)

# Initialize trainer with custom evaluation
trainer = CustomSFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_train,
    eval_dataset=formatted_test,
    raw_eval_dataset=test_dataset,
    tokenizer=tokenizer,
    max_seq_length=MAX_SEQ_LENGTH,
    # dataset_text_field="text",
    compute_metrics=compute_metrics,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [26]:
torch.cuda.empty_cache()  # Frees up unallocated memory

In [27]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mruby0322[0m ([33mruby0322-national-taiwan-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
10,4.3941,0.686094
20,0.5041,0.431008
30,0.4147,0.401891
40,0.3873,0.379896
50,0.3762,0.371198
60,0.3667,0.365911
70,0.3644,0.363345
80,0.3614,0.360922
90,0.3589,0.358241
100,0.3527,0.354853


Calculating perplexity:   0%|          | 0/20 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.90it/s]


perplexity: 81.16077134609222


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 27.30590078830719


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 25.463362395763397


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 25.81434211730957


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 24.909466552734376


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 23.275531858205795


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 21.40688602924347


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 21.788471114635467


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 20.542650026082992


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 19.26263724565506


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 19.73383839726448


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 18.674848473072053


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 20.261848157644273


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 19.041208803653717


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 20.680968099832533


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 19.173552811145782


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 18.119714826345444


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 17.827914279699325


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 16.915142464637757


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 18.730205327272415


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 16.568819332122803


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 16.17318880558014


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 17.907288694381712


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 17.848120391368866


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 19.849804669618607


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 16.9929045855999


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 16.550563418865202


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 17.017216908931733


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 15.871825677156448


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 15.841205149888992


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 15.96219236254692


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 16.00205546617508


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 16.6025308072567


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 16.168438690900803


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 16.089733123779297


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 15.994008129835128


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 15.900015193223954


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.91it/s]


perplexity: 16.01348750591278


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 15.82367542386055


Calculating perplexity: 100%|██████████| 20/20 [00:10<00:00,  1.92it/s]


perplexity: 15.83280103802681


TrainOutput(global_step=400, training_loss=0.44907969772815703, metrics={'train_runtime': 10104.0075, 'train_samples_per_second': 5.067, 'train_steps_per_second': 0.04, 'total_flos': 3.189316415127552e+17, 'train_loss': 0.44907969772815703, 'epoch': 5.12})

In [28]:
trainer.model.save_pretrained(os.path.join(output_dir, "final_checkpoint"))
print(f"\nBest Perplexity: {trainer.perplexity_callback.best_ppl:.3f}")


Best Perplexity: inf


In [40]:
!gdown 1v55KH4lmNZqb-Hl2vQTdGq5o6AsvrjKS

Downloading...
From: https://drive.google.com/uc?id=1v55KH4lmNZqb-Hl2vQTdGq5o6AsvrjKS
To: /content/no-preprocess-zero-shot-15.13.zip
100% 23.2M/23.2M [00:00<00:00, 35.6MB/s]


In [41]:
!unzip no-preprocess-zero-shot-15.13.zip

Archive:  no-preprocess-zero-shot-15.13.zip
   creating: no-preprocess-zero-shot-15.13/
  inflating: __MACOSX/._no-preprocess-zero-shot-15.13  
  inflating: no-preprocess-zero-shot-15.13/adapter_model.safetensors  
  inflating: __MACOSX/no-preprocess-zero-shot-15.13/._adapter_model.safetensors  
  inflating: no-preprocess-zero-shot-15.13/rng_state.pth  
  inflating: __MACOSX/no-preprocess-zero-shot-15.13/._rng_state.pth  
  inflating: no-preprocess-zero-shot-15.13/tokenizer_config.json  
  inflating: __MACOSX/no-preprocess-zero-shot-15.13/._tokenizer_config.json  
  inflating: no-preprocess-zero-shot-15.13/special_tokens_map.json  
  inflating: __MACOSX/no-preprocess-zero-shot-15.13/._special_tokens_map.json  
  inflating: no-preprocess-zero-shot-15.13/optimizer.pt  
  inflating: __MACOSX/no-preprocess-zero-shot-15.13/._optimizer.pt  
  inflating: no-preprocess-zero-shot-15.13/scheduler.pt  
  inflating: __MACOSX/no-preprocess-zero-shot-15.13/._scheduler.pt  
  inflating: no-preprocess

In [43]:
!python3 ./hw3/ppl.py \
    --base_model_path zake7749/gemma-2-2b-it-chinese-kyara-dpo \
    --peft_path ./no-preprocess-zero-shot-15.13 \
    --test_data_path ./data/public_test.json

Unused kwargs: ['bnb_4bit_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100% 3/3 [00:03<00:00,  1.13s/it]
  0% 0/250 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
2024-11-10 09:21:18.227428: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-10 09:21:18.247161: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-10 09:21:18.252542: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting

In [49]:
!python ./csie5431-applied-data-learning/hw3/inference.py \
    --peft_model_path "./no-preprocess-zero-shot-15.13" \
    --input_file "./data/public_test.json" \
    --output_file "./prediction.json" \

Loading tokenizer from zake7749/gemma-2-2b-it-chinese-kyara-dpo...
Loading base model from zake7749/gemma-2-2b-it-chinese-kyara-dpo...
Unused kwargs: ['bnb_4bit_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100% 3/3 [00:03<00:00,  1.06s/it]
Loading PEFT model from ./no-preprocess-zero-shot-15.13...
Reading test data from ./data/public_test.json...
Generating predictions...
  0% 0/250 [00:00<?, ?it/s]The 'max_batch_size' argument of HybridCache is deprecated and will be removed in v4.46. Use the more precisely named 'batch_size' argument instead.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
2024-11-10 09:48:56.704435: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
20