In [1]:
import torch
import evaluate
import pandas as pd
import numpy as np
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from datasets import Dataset, load_dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, DataCollatorForLanguageModeling
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from pathlib import Path
from tqdm import tqdm

assert torch.cuda.is_available(), "CUDA not available"
device = torch.device("cuda")

model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

dataset_id = "Headline"

## Dataset setup

In [7]:
## load_dataset causes an error, load directly from cached snapshot files
hub_basepath = Path(r"C:\Users\samba\.cache\huggingface\hub")

paths = {
    "FPB": hub_basepath / r"datasets--AdaptLLM--FPB\snapshots\7f203bd82f0b2b01ce391b9451c642dd732cf381",
    "Headline": hub_basepath / r"datasets--AdaptLLM--Headline\snapshots\68cf1056f3ed51d39b945d004259473759555559",
    "FiQA_SA": hub_basepath / r"datasets--AdaptLLM--FiQA_SA\snapshots\302a1fafc3ce5fdaec33548db39e5a80b0d51038"
}

In [None]:
names_mapping = {
    "FPB": ["text", "label"],
    "Headline": ["idx", "text", "question", "label", "subidx"],
    "FiQA_SA": ["Body ID", "text", "ticker", "label"]
}

dataset_path = paths[dataset_id]
train_dataset = pd.read_csv(dataset_path / "train.csv", delimiter="\t", names=names_mapping[dataset_id])
test_dataset  = pd.read_csv(dataset_path / "test.csv",  delimiter="\t", names=names_mapping[dataset_id])

In [9]:
dataset_cols = {
    "FPB": ["text"],
    "Headline": ["text", "question"],
    "FiQA_SA": ["text", "ticker"],
}

prompt_templates = {
    "FPB": "{0}\nQuestion: what is the sentiment?\nOptions:\n- Positive\n- Negative\n- Neutral",
    "Headline": "Headline: \"{0}\" Now answer this question: {1}",
    "FiQA_SA": "{0}\nWhat is the sentiment on \"{1}\" in this sentence?\nOptions:\n- Positive\n- Negative\n- Neutral",
}

id2labels = {
    "FPB": {"neutral": " Neutral", "positive": " Positive", "negative": " Negative"},
    "Headline": {0: " No", 1: " Yes"},
    "FiQA_SA": {0: " Neutral", 2: " Positive", 1: " Negative"},
}

def train_preprocess(example: dict, max_length=512):
    # Create prompt and target text
    args = [example[key] for key in dataset_cols[dataset_id]]
    prompt = prompt_templates[dataset_id].format(*args)

    target = id2labels[dataset_id][example["label"]]
    full_text = prompt + target

    # tokenize text
    tokenized = tokenizer(full_text,
                          truncation=True,
                          padding="max_length",
                          max_length=max_length)
    
    # add padding tokens
    prompt_tokenized = tokenizer(prompt,
                              truncation=True,
                              max_length=max_length)
    prompt_length = len(prompt_tokenized["input_ids"])

    labels = tokenized["input_ids"].copy()
    labels[:prompt_length] = [-100] * prompt_length
    tokenized["labels"] = labels
    return tokenized

In [10]:
train_dataset = Dataset.from_pandas(train_dataset).map(train_preprocess, batched=False).remove_columns(names_mapping[dataset_id])
val_dataset   = Dataset.from_pandas(test_dataset).map(train_preprocess, batched=False).remove_columns(names_mapping[dataset_id])

Map:   0%|          | 0/82161 [00:00<?, ? examples/s]

Map:   0%|          | 0/20547 [00:00<?, ? examples/s]

## LoRA Setup

In [14]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    target_modules=["q_proj", "v_proj"]
)

peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


## Trainer setup

In [15]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

lr = 1e-3
num_epochs = 2
batch_size = 2

out_dir = Path(rf"D:/models/basic-Llama-3_2-LoRA-{dataset_id}")
training_args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=lr,
    weight_decay=0.01,
    warmup_steps=1000,
    save_strategy="epoch",
    do_train=True,
    do_eval=True,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [16]:
trainer.train()

  0%|          | 0/82162 [00:00<?, ?it/s]

{'loss': 1.793, 'grad_norm': 3.2848801612854004, 'learning_rate': 0.0005, 'epoch': 0.01}
{'loss': 1.3802, 'grad_norm': 2.7161777019500732, 'learning_rate': 0.001, 'epoch': 0.02}
{'loss': 1.4455, 'grad_norm': 2.013709545135498, 'learning_rate': 0.0009938394815307656, 'epoch': 0.04}
{'loss': 1.2632, 'grad_norm': 5.033692836761475, 'learning_rate': 0.0009876789630615312, 'epoch': 0.05}
{'loss': 1.1933, 'grad_norm': 2.3909170627593994, 'learning_rate': 0.0009815184445922968, 'epoch': 0.06}
{'loss': 1.1689, 'grad_norm': 2.631848096847534, 'learning_rate': 0.0009753579261230625, 'epoch': 0.07}
{'loss': 1.1381, 'grad_norm': 3.224199056625366, 'learning_rate': 0.0009691974076538281, 'epoch': 0.09}
{'loss': 1.1141, 'grad_norm': 3.8480710983276367, 'learning_rate': 0.0009630368891845938, 'epoch': 0.1}
{'loss': 1.1315, 'grad_norm': 2.9922499656677246, 'learning_rate': 0.0009568763707153594, 'epoch': 0.11}
{'loss': 1.1289, 'grad_norm': 4.0397443771362305, 'learning_rate': 0.000950715852246125, 'ep

TrainOutput(global_step=82162, training_loss=0.8985924755469817, metrics={'train_runtime': 28516.1937, 'train_samples_per_second': 5.762, 'train_steps_per_second': 2.881, 'total_flos': 4.916714880054067e+17, 'train_loss': 0.8985924755469817, 'epoch': 2.0})

# Eval

In [17]:
def eval_preprocess(example, max_length=512):
    zeroshot = example['input'].rsplit("\n\n", maxsplit=1)[-1]
    return tokenizer(zeroshot,
                     truncation=True,
                     padding="max_length",
                     max_length=max_length,
                     return_tensors="pt")

In [18]:
testset = load_dataset("AdaptLLM/finance-tasks", dataset_id, split="test").map(eval_preprocess, batched=False)

Map:   0%|          | 0/20547 [00:00<?, ? examples/s]

In [19]:
ckpt_path = Path(rf"D:/models/basic-Llama-3_2-LoRA-{dataset_id}") / "checkpoint-best"
best_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16").eval().to(device)

In [None]:
def compute_metrics(eval_pred: np.ndarray) -> dict[str, float]:
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    
    accuracy_val = accuracy_metric.compute(predictions=predictions, references=labels)['accuracy']
    f1_val = f1_metric.compute(predictions=predictions, references=labels, average="weighted")['f1']

    return {
        "accuracy": accuracy_val,
        "f1": f1_val,
    }

In [25]:
tok_options = {
    "FPB": [59794, 45003, 51957],    # " Neutral", " Positive", " Negative"
    "Headline": [7566, 2360],        # " Yes", " No"
    "FiQA_SA": [59794, 45003, 51957] # " Neutral", " Positive", " Negative"
}

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16").to(device).eval()

guidance = False
tok_opts = tok_options[dataset_id]

correct = 0

prog_bar = tqdm(testset)
for i, example in enumerate(prog_bar):
    input_ids = torch.tensor(example["input_ids"], device=device)
    attn_mask = torch.tensor(example["attention_mask"])
    gen_idx = attn_mask.sum(dim=1).long() - 1

    out = base_model.forward(input_ids=input_ids, attention_mask=attn_mask.to(device))
    logits = out.logits.cpu()
    
    gen_logits = logits[torch.arange(logits.size(0)), gen_idx, :] # (B, C)
    if guidance:
        subset = gen_logits[0, tok_opts]
        local_argmax = torch.argmax(subset).item()
        gen_tokens = tok_opts[local_argmax]
    else:
        gen_tokens = torch.argmax(gen_logits, dim=-1)

    gen_raw = tokenizer.decode(gen_tokens).strip(" ")
    if example["options"][example["gold_index"]] == gen_raw:
        correct += 1

    prog_bar.set_description(f"{100 * correct / (i+1):.2f}")

perc = correct / len(testset)
print(f"Accuracy {perc*100:.2f}")

4.54:  16%|█▌        | 3303/20547 [06:58<36:24,  7.90it/s]   


KeyboardInterrupt: 