In [1]:
import torch
import evaluate
import pandas as pd
import numpy as np
from functools import partial
from pathlib import Path
from datasets import Dataset, load_dataset, interleave_datasets
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from transformers import Trainer, TrainingArguments, AutoTokenizer, DataCollatorForLanguageModeling, PreTrainedModel
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from tqdm import tqdm
from typing import Optional

assert torch.cuda.is_available(), "CUDA not available"
device = torch.device("cuda")

seed = 42
model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

## Dataset setup

In [22]:
## load_dataset causes an error, load directly from cached snapshot files
hub_basepath = Path(r"C:\Users\samba\.cache\huggingface\hub")

paths = {
    "FPB": hub_basepath / r"datasets--AdaptLLM--FPB\snapshots\7f203bd82f0b2b01ce391b9451c642dd732cf381",
    "Headline": hub_basepath / r"datasets--AdaptLLM--Headline\snapshots\68cf1056f3ed51d39b945d004259473759555559",
    "Topics": hub_basepath / r"datasets--Sujet--TopicClassification"
}

names_mapping = {
    "FPB": None,
    "Headline": ["idx", "text", "question", "label", "subidx"],
    "Topics": ["label", "text"]
}

columns = {
    "FPB": ["text", "label"],
    "Headline": ["idx", "text", "question", "label", "subidx"],
    "Topics": ["label", "text"]
}

del_mapping = {
    "FPB": "\t",
    "Headline": "\t",
    "Topics": None ## regular comma-delimiter'd csv
}


topics = ['Analyst Update', 'Fed | Central Banks', 'Company | Product News', 'Treasuries | Corporate Debt', 'Dividend', 'Earnings', 'Energy | Oil', 'Financials', 'Currencies', 'General News | Opinion', 'Gold | Metals | Materials', 'IPO', 'Legal | Regulation', 'M&A | Investments', 'Macro', 'Markets', 'Politics', 'Personnel Change', 'Stock Commentary', 'Stock Movement']
topic_options = "\n".join([f"{i} - {t}" for i, t in enumerate(topics)])
prompt_templates = {
    "FPB": "{0}\nQuestion: what is the sentiment?\nOptions:\n- Positive\n- Negative\n- Neutral",
    "Headline": "Headline: \"{0}\" Now answer this question: {1}",
    "Topics": "{0}\nNow classify the topic\nOptions 0-19:\n" + f"{topic_options} ",
}

prompt_args = {
    "FPB": ["text"],
    "Headline": ["text", "question"],
    "Topics": ["text"],
}

id2labels = {
    "FPB": {"neutral": " Neutral", "positive": " Positive", "negative": " Negative"},
    "Headline": {0: " No", 1: " Yes"},
    "Topics": {i: str(i) for i in range(20)},
}

In [23]:
def train_preprocess(dataset_id: str, example: dict, max_length=512):
    # Create prompt and target text
    args = [example[key] for key in prompt_args[dataset_id]]
    prompt = prompt_templates[dataset_id].format(*args)

    target = id2labels[dataset_id][example["label"]]
    full_text = prompt + target

    # tokenize text
    tokenized = tokenizer(full_text,
                          truncation=True,
                          padding="max_length",
                          max_length=max_length)
    
    # add padding tokens
    prompt_tokenized = tokenizer(prompt,
                              truncation=True,
                              max_length=max_length)
    prompt_length = len(prompt_tokenized["input_ids"])

    labels = tokenized["input_ids"].copy()
    labels[:prompt_length] = [-100] * prompt_length
    tokenized["labels"] = labels
    return tokenized

In [25]:
dataset_list = []
for dataset_id, dataset_path in paths.items():
    train_subset = pd.read_csv(dataset_path / "train.csv",
                                delimiter=del_mapping[dataset_id],
                                names=names_mapping[dataset_id])

    preprocess_func = partial(train_preprocess, dataset_id)
    dataset_list.append(Dataset
                        .from_pandas(train_subset)
                        .map(preprocess_func, batched=False)
                        .remove_columns(columns[dataset_id]))

n_datasets = len(dataset_list)
train_dataset = interleave_datasets(dataset_list, 
                                    probabilities=[1/n_datasets]*n_datasets,
                                    seed=seed)

Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/82161 [00:00<?, ? examples/s]

Map:   0%|          | 0/16140 [00:00<?, ? examples/s]

## LoRA Setup

In [38]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    target_modules=["q_proj", "v_proj"]
)

peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


## Trainer setup

In [39]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

lr = 1e-3
num_epochs = 9
batch_size = 2

out_dir = Path(rf"D:/models/general-Llama-3_2-LoRA")
training_args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=lr,
    weight_decay=0.01,
    warmup_steps=5000,
    save_strategy="epoch",
    do_train=True,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [40]:
trainer.train()

  0%|          | 0/51705 [00:00<?, ?it/s]

{'loss': 2.1674, 'grad_norm': 10.780961036682129, 'learning_rate': 0.0001, 'epoch': 0.09}
{'loss': 1.2148, 'grad_norm': 2.0696489810943604, 'learning_rate': 0.0002, 'epoch': 0.17}
{'loss': 1.1985, 'grad_norm': 5.538705825805664, 'learning_rate': 0.0003, 'epoch': 0.26}
{'loss': 1.2013, 'grad_norm': 2.454968214035034, 'learning_rate': 0.0004, 'epoch': 0.35}
{'loss': 1.1825, 'grad_norm': 2.95001482963562, 'learning_rate': 0.0005, 'epoch': 0.44}
{'loss': 1.0664, 'grad_norm': 1.8794838190078735, 'learning_rate': 0.0006, 'epoch': 0.52}
{'loss': 1.1549, 'grad_norm': 1.8944607973098755, 'learning_rate': 0.0007, 'epoch': 0.61}
{'loss': 1.1203, 'grad_norm': 1.0156264305114746, 'learning_rate': 0.0008, 'epoch': 0.7}
{'loss': 1.1481, 'grad_norm': 3.977154016494751, 'learning_rate': 0.0009000000000000001, 'epoch': 0.78}
{'loss': 1.1451, 'grad_norm': 1.2982301712036133, 'learning_rate': 0.001, 'epoch': 0.87}
{'loss': 1.2226, 'grad_norm': 2.102498769760132, 'learning_rate': 0.0009892945080826464, 'ep

TrainOutput(global_step=51705, training_loss=0.9419316286552392, metrics={'train_runtime': 18099.2647, 'train_samples_per_second': 5.713, 'train_steps_per_second': 2.857, 'total_flos': 3.0938841744408576e+17, 'train_loss': 0.9419316286552392, 'epoch': 9.0})

# Eval

In [41]:
def evaluate(model: PreTrainedModel,
             testset: Dataset,
             guidance = True,
             tok_opts: Optional[list[int]] = None) -> dict[str, float]:

    if guidance and tok_opts is None:
        raise ValueError("Guidance requires token options")

    correct = 0
    prog_bar = tqdm(testset)
    for i, example in enumerate(prog_bar):
        input_ids = torch.tensor(example["input_ids"], device=device)
        attn_mask = torch.tensor(example["attention_mask"])
        gen_idx = attn_mask.sum(dim=1).long() - 1

        out = model.forward(input_ids=input_ids, attention_mask=attn_mask.to(device))
        logits = out.logits.cpu()
        
        gen_logits = logits[torch.arange(logits.size(0)), gen_idx, :] # (B, C)
        if guidance:
            subset = gen_logits[0, tok_opts]
            local_argmax = torch.argmax(subset).item()
            gen_tokens = tok_opts[local_argmax]
        else:
            gen_tokens = torch.argmax(gen_logits, dim=-1)

        gen_raw = tokenizer.decode(gen_tokens).strip(" ")
        if example["options"][example["gold_index"]] == gen_raw:
            correct += 1

        prog_bar.set_description(f"{100 * correct / (i+1):.2f}")
    
    return {
        "accuracy": correct / len(testset)
    }

## For FPB and Headline

In [42]:
def eval_preprocess_a(example, max_length=512):
    zeroshot = example['input'].rsplit("\n\n", maxsplit=1)[-1]
    return tokenizer(zeroshot,
                     truncation=True,
                     padding="max_length",
                     max_length=max_length,
                     return_tensors="pt")

In [45]:
dataset_id = "Headline"
testset_adaptllm = load_dataset("AdaptLLM/finance-tasks", dataset_id, split="test").map(eval_preprocess_a, batched=False)

Map:   0%|          | 0/20547 [00:00<?, ? examples/s]

In [46]:
tok_options = {
    "FPB": [59794, 45003, 51957],    # " Neutral", " Positive", " Negative"
    "Headline": [7566, 2360],        # " Yes", " No"
}

ckpt_path = Path(rf"D:/models/general-Llama-3_2-LoRA") / "checkpoint-best"

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")
expert_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16").eval().to(device)

results = evaluate(expert_model,
                   testset_adaptllm,
                   guidance=True,
                   tok_opts=tok_options[dataset_id])
print(results)

83.07: 100%|██████████| 20547/20547 [1:18:29<00:00,  4.36it/s]

{'accuracy': 0.8307295468924903}





## Topics

In [47]:
def eval_preprocess_b(example, max_length=512):
    zeroshot = prompt_templates["Topics"].format(example["text"])
    return tokenizer(zeroshot,
                     truncation=True,
                     padding="max_length",
                     max_length=max_length,
                     return_tensors="pt")

topic_options = [str(i) for i in range(len(topics))]
def add_options(example):
    example["options"] = topic_options
    return example

In [59]:
dataset_id = "Topics"

testset_df = pd.read_csv(dataset_path / "test.csv",
                            delimiter=del_mapping[dataset_id],
                            names=names_mapping[dataset_id])
testset_topics = (Dataset
           .from_pandas(testset_df)
           .map(eval_preprocess_b, batched=False)
           .map(add_options, batched=False)
           .rename_column("label", "gold_index"))

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

In [61]:
tok_opts_ids = tokenizer(topic_options)["input_ids"]
tok_opts = [arr[1] for arr in tok_opts_ids]

In [62]:
ckpt_path = Path(rf"D:/models/general-Llama-3_2-LoRA") / "checkpoint-best"

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16") #.to(device)
expert_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16").eval().to(device)

results = evaluate(expert_model,
                   testset_topics,
                   guidance=True,
                   tok_opts=tok_opts)
print(results)

85.53: 100%|██████████| 850/850 [04:39<00:00,  3.04it/s]

{'accuracy': 0.8552941176470589}



