In [None]:
import torch
import pandas as pd
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from datasets import Dataset, load_dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, DataCollatorForLanguageModeling, PreTrainedModel
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from pathlib import Path

from evals import evaluate

assert torch.cuda.is_available(), "CUDA not available"
device = torch.device("cuda")

model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")

dataset_id = "FPB"

## Dataset setup

In [2]:
## load_dataset causes an error, load directly from cached snapshot files
hub_basepath = Path(r"C:\Users\samba\.cache\huggingface\hub")

paths = {
    "FPB": hub_basepath / r"datasets--AdaptLLM--FPB\snapshots\7f203bd82f0b2b01ce391b9451c642dd732cf381",
    "Headline": hub_basepath / r"datasets--AdaptLLM--Headline\snapshots\68cf1056f3ed51d39b945d004259473759555559",
    "Topics": hub_basepath / r"datasets--Sujet--TopicClassification"
}

names_mapping = {
    "FPB": None,
    "Headline": ["idx", "text", "question", "label", "subidx"],
    "Topics": ["label", "text"]
}

columns = {
    "FPB": ["text", "label"],
    "Headline": ["idx", "text", "question", "label", "subidx"],
    "Topics": ["label", "text"]
}

del_mapping = {
    "FPB": "\t",
    "Headline": "\t",
    "Topics": None
}

dataset_cols = {
    "FPB": ["text"],
    "Headline": ["text", "question"],
    "Topics": ["text"],
}

topics = ['Analyst Update', 'Fed | Central Banks', 'Company | Product News', 'Treasuries | Corporate Debt', 'Dividend', 'Earnings', 'Energy | Oil', 'Financials', 'Currencies', 'General News | Opinion', 'Gold | Metals | Materials', 'IPO', 'Legal | Regulation', 'M&A | Investments', 'Macro', 'Markets', 'Politics', 'Personnel Change', 'Stock Commentary', 'Stock Movement']
topic_options = "\n".join([f"{i} - {t}" for i, t in enumerate(topics)])
prompt_templates = {
    "FPB": "{0}\nQuestion: what is the sentiment?\nOptions:\n- Positive\n- Negative\n- Neutral",
    "Headline": "Headline: \"{0}\" Now answer this question: {1}",
    "Topics": "{0}\nNow classify the topic\nOptions 0-19:\n" + f"{topic_options} ",
}

id2labels = {
    "FPB": {"neutral": " Neutral", "positive": " Positive", "negative": " Negative"},
    "Headline": {0: " No", 1: " Yes"},
    "Topics": {i: str(i) for i in range(20)},
}

In [3]:
def train_preprocess(example: dict, max_length=512):
    # Create prompt and target text
    args = [example[key] for key in dataset_cols[dataset_id]]
    prompt = prompt_templates[dataset_id].format(*args)

    target = id2labels[dataset_id][example["label"]]
    full_text = prompt + target

    # tokenize text
    tokenized = tokenizer(full_text,
                          truncation=True,
                          padding="max_length",
                          max_length=max_length)
    
    # add padding tokens
    prompt_tokenized = tokenizer(prompt,
                              truncation=True,
                              max_length=max_length)
    prompt_length = len(prompt_tokenized["input_ids"])

    labels = tokenized["input_ids"].copy()
    labels[:prompt_length] = [-100] * prompt_length
    tokenized["labels"] = labels
    return tokenized

In [4]:
dataset_path = paths[dataset_id]
train_dataset = pd.read_csv(dataset_path / "train.csv",
                            delimiter=del_mapping[dataset_id],
                            names=names_mapping[dataset_id])
test_dataset  = pd.read_csv(dataset_path / "test.csv",
                            delimiter=del_mapping[dataset_id],
                            names=names_mapping[dataset_id])

train_dataset = Dataset.from_pandas(train_dataset).map(train_preprocess, batched=False).remove_columns(columns[dataset_id])
val_dataset   = Dataset.from_pandas(test_dataset).map(train_preprocess, batched=False).remove_columns(columns[dataset_id])

Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

## LoRA Setup

In [7]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    target_modules=["q_proj", "v_proj"]
)

peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


## LoRA Continue Training

In [None]:
lora_ckpt = Path(rf"D:/models/expert-Llama-3_2-1B-{dataset_id}") / "checkpoint-best"

peft_model = PeftModel.from_pretrained(
    base_model,
    lora_ckpt
).to(device).train()

## Trainer setup

In [12]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

lr = 1e-3
num_epochs = 15
batch_size = 2

out_dir = Path(rf"D:/models/expert-Llama-3_2-1B-{dataset_id}-v2")
training_args = TrainingArguments(
    output_dir=str(out_dir),
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=lr,
    weight_decay=0.01,
    warmup_steps=5000,
    save_strategy="epoch",
    do_train=True,
    # do_eval=True,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [13]:
trainer.train()

  0%|          | 0/29070 [00:00<?, ?it/s]

{'loss': 2.6319, 'grad_norm': 6.024759292602539, 'learning_rate': 0.0001, 'epoch': 0.26}
{'loss': 1.8756, 'grad_norm': 2.8309383392333984, 'learning_rate': 0.0002, 'epoch': 0.52}
{'loss': 1.8123, 'grad_norm': 4.807130813598633, 'learning_rate': 0.0003, 'epoch': 0.77}
{'loss': 1.78, 'grad_norm': 2.224982500076294, 'learning_rate': 0.0004, 'epoch': 1.03}
{'loss': 1.7515, 'grad_norm': 2.7508039474487305, 'learning_rate': 0.0005, 'epoch': 1.29}
{'loss': 1.7874, 'grad_norm': 3.5491952896118164, 'learning_rate': 0.0006, 'epoch': 1.55}
{'loss': 1.769, 'grad_norm': 2.632551908493042, 'learning_rate': 0.0007, 'epoch': 1.81}
{'loss': 1.7626, 'grad_norm': 4.6233391761779785, 'learning_rate': 0.0008, 'epoch': 2.06}
{'loss': 1.7299, 'grad_norm': 2.7689712047576904, 'learning_rate': 0.0009000000000000001, 'epoch': 2.32}
{'loss': 1.7742, 'grad_norm': 2.868759870529175, 'learning_rate': 0.001, 'epoch': 2.58}
{'loss': 1.7814, 'grad_norm': 2.8815135955810547, 'learning_rate': 0.000979227253842958, 'epoc

TrainOutput(global_step=29070, training_loss=1.382214455330622, metrics={'train_runtime': 10077.6843, 'train_samples_per_second': 5.769, 'train_steps_per_second': 2.885, 'total_flos': 1.739619789963264e+17, 'train_loss': 1.382214455330622, 'epoch': 15.0})

# Eval

## For FPB and Headline

In [3]:
def eval_preprocess_a(example, max_length=512):
    zeroshot = example['input'].rsplit("\n\n", maxsplit=1)[-1]
    return tokenizer(zeroshot,
                     truncation=True,
                     padding="max_length",
                     max_length=max_length,
                     return_tensors="pt")

In [4]:
testset_adaptllm = load_dataset("AdaptLLM/finance-tasks", dataset_id, split="test").map(eval_preprocess_a, batched=False)

In [None]:
tok_options = {
    "FPB": [59794, 45003, 51957],    # " Neutral", " Positive", " Negative"
    "Headline": [7566, 2360],        # " Yes", " No"
}

ckpt_path = Path(rf"D:/models/expert-Llama-3_2-1B-{dataset_id}-v2") / "checkpoint-best"

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16").eval()
expert_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16").eval().to(device)

results = evaluate(base_model, tokenizer,
                   testset_adaptllm,
                   guidance=True,
                   tok_opts=tok_options[dataset_id])
print(results)

58.45: 100%|██████████| 970/970 [01:48<00:00,  8.94it/s]

{'accuracy': 0.5845360824742268}





## Topics

In [11]:
def eval_preprocess_b(example, max_length=512):
    zeroshot = prompt_templates["Topics"].format(example["text"])
    return tokenizer(zeroshot,
                     truncation=True,
                     padding="max_length",
                     max_length=max_length,
                     return_tensors="pt")

topic_options = [str(i) for i in range(len(topics))]
def add_options(example):
    example["options"] = topic_options
    return example

In [12]:
testset_df = pd.read_csv(dataset_path / "test.csv",
                            delimiter=del_mapping[dataset_id],
                            names=names_mapping[dataset_id])
testset_topics = (Dataset
           .from_pandas(testset_df)
           .map(eval_preprocess_b, batched=False)
           .map(add_options, batched=False)
           .rename_column("label", "gold_index"))

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

In [18]:
tok_opts_ids = tokenizer(topic_options)["input_ids"]
tok_opts = [arr[1] for arr in tok_opts_ids]

In [None]:
ckpt_path = Path(rf"D:/models/expert-Llama-3_2-1B-Topics") / "checkpoint-best"

base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16") #.to(device)
expert_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16").eval().to(device)

results = evaluate(expert_model, tokenizer,
                   testset_topics,
                   guidance=True,
                   tok_opts=tok_opts)
print(results)

85.18: 100%|██████████| 850/850 [03:35<00:00,  3.95it/s]

{'accuracy': 0.851764705882353}



