In [1]:
import torch
import evaluate
import pandas as pd
import numpy as np
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from datasets import Dataset, load_dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, DataCollatorForLanguageModeling, PreTrainedModel
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from pathlib import Path
from tqdm import tqdm
from typing import Optional

assert torch.cuda.is_available(), "CUDA not available"
device = torch.device("cuda")

model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

dataset_id = "Topics"

## Dataset setup

In [2]:
## load_dataset causes an error, load directly from cached snapshot files
hub_basepath = Path(r"C:\Users\samba\.cache\huggingface\hub")

paths = {
    "FPB": hub_basepath / r"datasets--AdaptLLM--FPB\snapshots\7f203bd82f0b2b01ce391b9451c642dd732cf381",
    "Headline": hub_basepath / r"datasets--AdaptLLM--Headline\snapshots\68cf1056f3ed51d39b945d004259473759555559",
    "Topics": hub_basepath / r"datasets--Sujet--TopicClassification"
}

In [3]:
names_mapping = {
    "FPB": ["text", "label"],
    "Headline": ["idx", "text", "question", "label", "subidx"],
    "Topics": ["label", "text"]
}

del_mapping = {
    "FPB": "\t",
    "Headline": "\t",
    "Topics": None
}

dataset_path = paths[dataset_id]
train_dataset = pd.read_csv(dataset_path / "train.csv",
                            delimiter=del_mapping[dataset_id],
                            names=names_mapping[dataset_id])
test_dataset  = pd.read_csv(dataset_path / "test.csv",
                            delimiter=del_mapping[dataset_id],
                            names=names_mapping[dataset_id])

In [None]:
dataset_cols = {
    "FPB": ["text"],
    "Headline": ["text", "question"],
    "Topics": ["text"],
}

topics = ['Analyst Update', 'Fed | Central Banks', 'Company | Product News', 'Treasuries | Corporate Debt', 'Dividend', 'Earnings', 'Energy | Oil', 'Financials', 'Currencies', 'General News | Opinion', 'Gold | Metals | Materials', 'IPO', 'Legal | Regulation', 'M&A | Investments', 'Macro', 'Markets', 'Politics', 'Personnel Change', 'Stock Commentary', 'Stock Movement']
topic_options = "\n".join([f"{i} - {t}" for i, t in enumerate(topics)])
prompt_templates = {
    "FPB": "{0}\nQuestion: what is the sentiment?\nOptions:\n- Positive\n- Negative\n- Neutral",
    "Headline": "Headline: \"{0}\" Now answer this question: {1}",
    "Topics": "{0}\nNow classify the topic\nOptions 0-19:\n" + f"{topic_options} ",
}

id2labels = {
    "FPB": {"neutral": " Neutral", "positive": " Positive", "negative": " Negative"},
    "Headline": {0: " No", 1: " Yes"},
    "Topics": {i: str(i) for i in range(20)},
}

def train_preprocess(example: dict, max_length=512):
    # Create prompt and target text
    args = [example[key] for key in dataset_cols[dataset_id]]
    prompt = prompt_templates[dataset_id].format(*args)

    target = id2labels[dataset_id][example["label"]]
    full_text = prompt + target

    # tokenize text
    tokenized = tokenizer(full_text,
                          truncation=True,
                          padding="max_length",
                          max_length=max_length)
    
    # add padding tokens
    prompt_tokenized = tokenizer(prompt,
                              truncation=True,
                              max_length=max_length)
    prompt_length = len(prompt_tokenized["input_ids"])

    labels = tokenized["input_ids"].copy()
    labels[:prompt_length] = [-100] * prompt_length
    tokenized["labels"] = labels
    return tokenized

In [5]:
train_dataset = Dataset.from_pandas(train_dataset).map(train_preprocess, batched=False).remove_columns(names_mapping[dataset_id])
val_dataset   = Dataset.from_pandas(test_dataset).map(train_preprocess, batched=False).remove_columns(names_mapping[dataset_id])

Map:   0%|          | 0/16140 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

## LoRA Setup

In [23]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    target_modules=["q_proj", "v_proj"]
)

peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


## Trainer setup

In [24]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

lr = 1e-3
num_epochs = 3
batch_size = 2

out_dir = Path(rf"D:/models/basic-Llama-3_2-LoRA-{dataset_id}")
training_args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=lr,
    weight_decay=0.01,
    warmup_steps=1000,
    save_strategy="epoch",
    do_train=True,
    do_eval=True,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [25]:
trainer.train()

  0%|          | 0/24210 [00:00<?, ?it/s]

{'loss': 0.922, 'grad_norm': 0.6902908682823181, 'learning_rate': 0.0005, 'epoch': 0.06}
{'loss': 0.5725, 'grad_norm': 1.516623854637146, 'learning_rate': 0.001, 'epoch': 0.12}
{'loss': 0.5981, 'grad_norm': 0.6170763969421387, 'learning_rate': 0.00097845756139595, 'epoch': 0.19}
{'loss': 0.5922, 'grad_norm': 0.8993262648582458, 'learning_rate': 0.0009569151227919, 'epoch': 0.25}
{'loss': 0.5796, 'grad_norm': 0.833195149898529, 'learning_rate': 0.00093537268418785, 'epoch': 0.31}
{'loss': 0.5942, 'grad_norm': 0.7825578451156616, 'learning_rate': 0.0009138302455838002, 'epoch': 0.37}
{'loss': 0.5878, 'grad_norm': 1.6565760374069214, 'learning_rate': 0.0008922878069797501, 'epoch': 0.43}
{'loss': 0.5877, 'grad_norm': 0.9568967819213867, 'learning_rate': 0.0008707453683757002, 'epoch': 0.5}
{'loss': 0.5852, 'grad_norm': 0.9171866178512573, 'learning_rate': 0.0008492029297716502, 'epoch': 0.56}
{'loss': 0.5605, 'grad_norm': 1.5932117700576782, 'learning_rate': 0.0008276604911676002, 'epoch'

TrainOutput(global_step=24210, training_loss=0.5299051214672721, metrics={'train_runtime': 9743.2223, 'train_samples_per_second': 4.97, 'train_steps_per_second': 2.485, 'total_flos': 1.448785521672192e+17, 'train_loss': 0.5299051214672721, 'epoch': 3.0})

# Eval

In [6]:
def evaluate(model: PreTrainedModel,
             testset: Dataset,
             guidance = True,
             tok_opts: Optional[list[int]] = None) -> dict[str, float]:

    if guidance and tok_opts is None:
        raise ValueError("Guidance requires token options")

    correct = 0
    prog_bar = tqdm(testset)
    for i, example in enumerate(prog_bar):
        input_ids = torch.tensor(example["input_ids"], device=device)
        attn_mask = torch.tensor(example["attention_mask"])
        gen_idx = attn_mask.sum(dim=1).long() - 1

        out = model.forward(input_ids=input_ids, attention_mask=attn_mask.to(device))
        logits = out.logits.cpu()
        
        gen_logits = logits[torch.arange(logits.size(0)), gen_idx, :] # (B, C)
        if guidance:
            subset = gen_logits[0, tok_opts]
            local_argmax = torch.argmax(subset).item()
            gen_tokens = tok_opts[local_argmax]
        else:
            gen_tokens = torch.argmax(gen_logits, dim=-1)

        gen_raw = tokenizer.decode(gen_tokens).strip(" ")
        if example["options"][example["gold_index"]] == gen_raw:
            correct += 1

        prog_bar.set_description(f"{100 * correct / (i+1):.2f}")
    
    return {
        "accuracy": correct / len(testset)
    }

## For FPB and Headline

In [None]:
def eval_preprocess_a(example, max_length=512):
    zeroshot = example['input'].rsplit("\n\n", maxsplit=1)[-1]
    return tokenizer(zeroshot,
                     truncation=True,
                     padding="max_length",
                     max_length=max_length,
                     return_tensors="pt")

In [None]:
testset_adaptllm = load_dataset("AdaptLLM/finance-tasks", dataset_id, split="test").map(eval_preprocess_a, batched=False)

Map:   0%|          | 0/20547 [00:00<?, ? examples/s]

In [None]:
tok_options = {
    "FPB": [59794, 45003, 51957],    # " Neutral", " Positive", " Negative"
    "Headline": [7566, 2360],        # " Yes", " No"
}

ckpt_path = Path(rf"D:/models/basic-Llama-3_2-LoRA-{dataset_id}") / "checkpoint-best"

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")
expert_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16").eval().to(device)

results = evaluate(expert_model,
                   testset_adaptllm,
                   guidance=True,
                   tok_opts=tok_options[dataset_id])
print(results)

## Topics

In [11]:
def eval_preprocess_b(example, max_length=512):
    zeroshot = prompt_templates["Topics"].format(example["text"])
    return tokenizer(zeroshot,
                     truncation=True,
                     padding="max_length",
                     max_length=max_length,
                     return_tensors="pt")

topic_options = [str(i) for i in range(len(topics))]
def add_options(example):
    example["options"] = topic_options
    return example

In [12]:
testset_df = pd.read_csv(dataset_path / "test.csv",
                            delimiter=del_mapping[dataset_id],
                            names=names_mapping[dataset_id])
testset_topics = (Dataset
           .from_pandas(testset_df)
           .map(eval_preprocess_b, batched=False)
           .map(add_options, batched=False)
           .rename_column("label", "gold_index"))

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

In [18]:
tok_opts_ids = tokenizer(topic_options)["input_ids"]
tok_opts = [arr[1] for arr in tok_opts_ids]

In [None]:
ckpt_path = Path(rf"D:/models/basic-Llama-3_2-LoRA-Topics") / "checkpoint-8070"

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16") #.to(device)
expert_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16").eval().to(device)

results = evaluate(expert_model,
                   testset_topics,
                   guidance=True,
                   tok_opts=tok_opts)
print(results)

85.18: 100%|██████████| 850/850 [03:35<00:00,  3.95it/s]

{'accuracy': 0.851764705882353}



