In [1]:
import torch
import pandas as pd
from functools import partial
from pathlib import Path
from datasets import Dataset, load_dataset, interleave_datasets
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from transformers import Trainer, TrainingArguments, AutoTokenizer, DataCollatorForLanguageModeling
from transformers.models.llama.modeling_llama import LlamaForCausalLM

from evals import evaluate

# Models are moved to the GPU further down, so stop early when none is visible.
assert torch.cuda.is_available(), "CUDA not available"
device = torch.device("cuda")

# Seed torch's RNG; the same value is reused later as the dataset interleaving seed.
seed = 42
torch.manual_seed(seed)

# Token length that the training preprocessing functions truncate / pad every example to.
MAX_LENGTH = 512

model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Padding to a fixed length needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Dataset setup

In [2]:
def get_adaptllm_path(base_path: Path) -> Path:
    """Resolve the snapshot directory that a cached repo's ``main`` ref points to.

    Args:
        base_path: Root of one cached repository, i.e. the directory that
            contains ``refs/`` and ``snapshots/``.

    Returns:
        ``base_path / "snapshots" / <ref>``, where ``<ref>`` is the first line
        of ``base_path / "refs" / "main"`` with surrounding whitespace removed.

    Raises:
        OSError: If ``refs/main`` cannot be opened.
    """
    with open(base_path / "refs" / "main", "r") as f_in:
        # readline() keeps the line terminator; without strip() a trailing
        # "\n" would become part of the directory name and the path would not exist.
        snapshot_ref = f_in.readline().strip()
    return base_path / "snapshots" / snapshot_ref

In [3]:
## load_dataset causes an error, so the CSV files are read directly from the
## locally cached snapshot files instead.
## The hub cache is located under the current user's home directory rather than
## a hard-coded per-user path, so the notebook is not tied to one account.
## NOTE(review): this is the default cache location; if the cache was relocated
## (e.g. through an environment variable) point hub_basepath there instead.
hub_basepath = Path.home() / ".cache" / "huggingface" / "hub"
## Base path whose final component is swapped (via with_stem) to name saved datasets.
dataset_cache_path = Path(r"D:/datasets/general-3-tasks")

## Directory holding the CSV files of each dataset, keyed by dataset id.
paths = {
    "FPB": get_adaptllm_path(hub_basepath / "datasets--AdaptLLM--FPB"),
    "Headline": get_adaptllm_path(hub_basepath / "datasets--AdaptLLM--Headline"),
    "Topics": hub_basepath / r"datasets--Sujet--TopicClassification"
}

# Column names passed to pd.read_csv(names=...); None leaves the header to pandas.
names_mapping = dict(
    FPB=None,
    Headline=["idx", "text", "question", "label", "subidx"],
    Topics=["label", "text"],
)

# Raw CSV columns that are removed from each dataset once it has been tokenized.
columns = dict(
    FPB=["text", "label"],
    Headline=["idx", "text", "question", "label", "subidx"],
    Topics=["label", "text"],
)

# Field delimiter passed to pd.read_csv(delimiter=...).
del_mapping = dict(
    FPB="\t",
    Headline="\t",
    Topics=None,  # regular comma-delimited csv
)


# Topic names; each one is listed in the Topics prompt next to its index in this list.
topics = [
    'Analyst Update',
    'Fed | Central Banks',
    'Company | Product News',
    'Treasuries | Corporate Debt',
    'Dividend',
    'Earnings',
    'Energy | Oil',
    'Financials',
    'Currencies',
    'General News | Opinion',
    'Gold | Metals | Materials',
    'IPO',
    'Legal | Regulation',
    'M&A | Investments',
    'Macro',
    'Markets',
    'Politics',
    'Personnel Change',
    'Stock Commentary',
    'Stock Movement',
]
# One "<index> - <name>" line per topic.
topic_options = "\n".join(f"{idx} - {name}" for idx, name in enumerate(topics))

# str.format templates; the positional slots are filled from the example
# fields listed under the same key in `prompt_args`.
prompt_templates = {
    "FPB": "{0}\nQuestion: what is the sentiment?\nOptions:\n- Positive\n- Negative\n- Neutral",
    "Headline": "Headline: \"{0}\" Now answer this question: {1}",
    "Topics": "{0}\nNow classify the topic\nOptions 0-19:\n" + topic_options + " ",
}

# Example fields, in order, that fill each template's positional slots.
prompt_args = {
    "FPB": ["text"],
    "Headline": ["text", "question"],
    "Topics": ["text"],
}

# Raw label value -> answer text for every dataset.
id2labels = {
    "FPB": {
        "neutral": " Neutral",
        "positive": " Positive",
        "negative": " Negative",
    },
    "Headline": {0: " No", 1: " Yes"},
    "Topics": {class_id: str(class_id) for class_id in range(20)},
}

# Every answer text across all datasets, in id2labels order.
token_list_raw = [
    answer_text
    for label_map in id2labels.values()
    for answer_text in label_map.values()
]
# Only the first token id of each answer text is kept (no special tokens added).
token_list = [
    tokenizer.encode(answer_text, add_special_tokens=False)[0]
    for answer_text in token_list_raw
]

In [4]:
def train_preprocess_causal(dataset_id: str, example: dict):
    """Tokenize one example for causal-LM training with loss on the answer only.

    The prompt is rendered from ``prompt_templates[dataset_id]`` using the
    example fields listed in ``prompt_args[dataset_id]``; the answer text is
    ``id2labels[dataset_id][example["label"]]`` and is appended to the prompt.

    Args:
        dataset_id: Key into ``prompt_args``, ``prompt_templates`` and ``id2labels``.
        example: One dataset row holding the prompt fields and ``"label"``.

    Returns:
        The tokenizer output for ``prompt + answer`` (truncated / padded to
        ``MAX_LENGTH``) with an added ``"labels"`` list of the same length:
        the token id at answer positions and ``-100`` at prompt and padding
        positions.
    """
    # Create prompt and target text
    args = [example[key] for key in prompt_args[dataset_id]]
    prompt = prompt_templates[dataset_id].format(*args)

    target = id2labels[dataset_id][example["label"]]
    full_text = prompt + target

    # Tokenize prompt + answer, padded to a fixed length.
    tokenized = tokenizer(full_text,
                          truncation=True,
                          padding="max_length",
                          max_length=MAX_LENGTH)

    # Tokenize the prompt alone (no padding) to learn how many leading real
    # tokens belong to it.
    prompt_tokenized = tokenizer(prompt,
                                 truncation=True,
                                 max_length=MAX_LENGTH)
    prompt_length = len(prompt_tokenized["input_ids"])

    # Keep a label only for real (attention_mask == 1) tokens that come after
    # the prompt. Padding is masked explicitly: the pad token may be the EOS
    # token, so unmasked padding would otherwise dominate the loss. Counting
    # real tokens instead of slicing keeps the prompt mask correct for either
    # padding side.
    # NOTE: if the prompt alone reaches MAX_LENGTH the answer is truncated away
    # and every label ends up as -100.
    labels = []
    real_seen = 0
    for token_id, is_real in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        if not is_real:
            labels.append(-100)
            continue
        labels.append(token_id if real_seen >= prompt_length else -100)
        real_seen += 1

    tokenized["labels"] = labels
    return tokenized

def train_preprocess_tokenclass(dataset_id: str, token_list: list[int], example: dict):
    """Tokenize one example's prompt and attach a single class index as its label.

    Args:
        dataset_id: Key into ``prompt_args``, ``prompt_templates`` and ``id2labels``.
        token_list: Candidate answer token ids; the label is a position in this list.
        example: One dataset row holding the prompt fields and ``"label"``.

    Returns:
        The tokenizer output for the prompt (truncated / padded to
        ``MAX_LENGTH``) with ``"labels"`` set to the index in ``token_list`` of
        the first token of the answer text.

    Raises:
        ValueError: If the answer's first token id is not in ``token_list``.
    """
    # Render the prompt from the fields this dataset's template expects.
    field_values = [example[field] for field in prompt_args[dataset_id]]
    rendered_prompt = prompt_templates[dataset_id].format(*field_values)

    encoded = tokenizer(
        rendered_prompt,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    # The class index is where the answer's first token sits in token_list.
    answer_text = id2labels[dataset_id][example["label"]]
    first_answer_token = tokenizer.encode(answer_text, add_special_tokens=False)[0]
    encoded["labels"] = token_list.index(first_answer_token)

    return encoded

In [5]:
# Maximum number of CSV rows read per dataset, in the iteration order of `paths`.
nrows_list = [3876, 5000, 5000]
dataset_list = []
for i, (dataset_id, dataset_path) in enumerate(paths.items()):
    train_subset = pd.read_csv(
        dataset_path / "train.csv",
        delimiter=del_mapping[dataset_id],
        names=names_mapping[dataset_id],
        nrows=nrows_list[i],
    )

    # Bind the dataset id and the answer-token list so map() only supplies the example.
    preprocess_func = partial(train_preprocess_tokenclass, dataset_id, token_list)

    # Tokenize row by row, then drop the raw CSV columns.
    hf_subset = Dataset.from_pandas(train_subset)
    hf_subset = hf_subset.map(preprocess_func, batched=False)
    hf_subset = hf_subset.remove_columns(columns[dataset_id])
    dataset_list.append(hf_subset)

# Sample from the datasets with equal probability when interleaving.
n_datasets = len(dataset_list)
train_dataset = interleave_datasets(
    dataset_list,
    probabilities=[1 / n_datasets for _ in range(n_datasets)],
    seed=seed,
)

Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [6]:
# with_stem swaps the final path component of dataset_cache_path, so the
# dataset is written next to it under the name "finmoe-tokenclass_medium-len512".
save_path = dataset_cache_path.with_stem("finmoe-tokenclass_medium-len512")
train_dataset.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/11489 [00:00<?, ? examples/s]

In [4]:
# Reload the dataset written by the save cell above. The stem has to match the
# one used there ("finmoe-tokenclass_medium-len512"); the earlier value
# ("finmoe-tokenclass-len512") named a different on-disk dataset, so a
# top-to-bottom run would replace the freshly built training set with other data.
load_path = dataset_cache_path.with_stem("finmoe-tokenclass_medium-len512")
train_dataset = Dataset.load_from_disk(load_path)

## Training

### PEFT model trainer

In [None]:
# Base checkpoint that receives the LoRA adapter.
model_id = "meta-llama/Llama-3.2-1B"
base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")

# LoRA settings: rank-8 adapters on the q_proj and v_proj modules.
peft_config = LoraConfig(
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    task_type=TaskType.CAUSAL_LM,
)

peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

In [None]:
# NOTE(review): the directory name says "3B" while the PEFT cell above loads
# Llama-3.2-1B - confirm which model size is intended.
out_dir = Path("D:/models/general-Llama-3_2-3B")
training_args = TrainingArguments(
    output_dir=str(out_dir),
    do_train=True,
    num_train_epochs=6,
    per_device_train_batch_size=2,
    learning_rate=1e-3,
    weight_decay=0.01,
    warmup_steps=1000,
    save_strategy="epoch",
)

# NOTE(review): with mlm=False this collator is understood to rebuild "labels"
# from input_ids, which would discard labels precomputed during preprocessing
# (e.g. prompt masking) - TODO confirm against the transformers documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

### FinMoE trainer

In [7]:
from FinMoE import FinMoE, FinMoEConfig

# Root directory that holds one fine-tuned expert per task.
ckpt_base = Path(r"D:\models")
# Checkpoint sub-folder to load for each expert.
expert_ids = {
    "FPB": "checkpoint-best",
    "Headline": "checkpoint-best",
    "Topics": "checkpoint-best",
}

## note: paths go through str() because Path objects are not json serializable
expert_ckpts = [
    str(ckpt_base / f"expert-Llama-3_2-1B-{task}" / folder)
    for task, folder in expert_ids.items()
]

finMoE_config = FinMoEConfig(
    loss_type="ForTokenClassification",  # alternative: "ForCausalLM"
    num_labels=len(token_list),
    expert_ckpts=expert_ckpts,
    token_list=token_list,
)

finMoE_model = FinMoE(finMoE_config).to(device)
print("Memory allocated:", torch.cuda.memory_allocated())

# List every parameter that will receive gradients.
print("Trainable params:")
for name, params in finMoE_model.named_parameters():
    if not params.requires_grad:
        continue
    print(name, params.shape)

out_dir = Path("D:/models/FinMoE-v2")
training_args = TrainingArguments(
    output_dir=str(out_dir),
    do_train=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-3,
    weight_decay=0.01,
    warmup_steps=256,
    logging_steps=128,
    save_strategy="epoch",
)

# No data_collator is passed, so the Trainer falls back to its default collation.
trainer = Trainer(
    model=finMoE_model,
    args=training_args,
    train_dataset=train_dataset,
)



Memory allocated: 4953515008
Trainable params:
gate.w_gate.weight torch.Size([3, 2048])
gate.w_gate.bias torch.Size([3])


### Train

In [8]:
# Runs whichever `trainer` was defined most recently; the returned TrainOutput is shown as the cell result.
trainer.train()

  0%|          | 0/7180 [00:00<?, ?it/s]

{'loss': 20.8188, 'grad_norm': 211.09530639648438, 'learning_rate': 0.0005, 'epoch': 0.09}
{'loss': 21.1283, 'grad_norm': 400.69146728515625, 'learning_rate': 0.001, 'epoch': 0.18}
{'loss': 21.0356, 'grad_norm': 310.02655029296875, 'learning_rate': 0.0009815135759676488, 'epoch': 0.27}
{'loss': 22.4787, 'grad_norm': 371.5384216308594, 'learning_rate': 0.0009630271519352975, 'epoch': 0.36}
{'loss': 20.1923, 'grad_norm': 132.94309997558594, 'learning_rate': 0.0009445407279029464, 'epoch': 0.45}
{'loss': 19.7472, 'grad_norm': 404.6505432128906, 'learning_rate': 0.0009260543038705951, 'epoch': 0.53}
{'loss': 21.4861, 'grad_norm': 584.789794921875, 'learning_rate': 0.0009075678798382439, 'epoch': 0.62}
{'loss': 20.2967, 'grad_norm': 271.59283447265625, 'learning_rate': 0.0008890814558058926, 'epoch': 0.71}
{'loss': 20.8726, 'grad_norm': 264.4077453613281, 'learning_rate': 0.0008705950317735413, 'epoch': 0.8}
{'loss': 20.376, 'grad_norm': 36.91596221923828, 'learning_rate': 0.000852108607741

TrainOutput(global_step=7180, training_loss=18.29993851005533, metrics={'train_runtime': 30382.7221, 'train_samples_per_second': 1.891, 'train_steps_per_second': 0.236, 'total_flos': 1.7216924738371584e+17, 'train_loss': 18.29993851005533, 'epoch': 4.99956480111411})

# Eval

In [None]:
# Adapter checkpoint to evaluate.
ckpt_path = Path("D:/models/general-Llama-3_2-3B") / "checkpoint-best"

# NOTE(review): this loads the 3B base model, while the PEFT training cell in
# this notebook loads Llama-3.2-1B - confirm the adapter matches this base.
model_id = "meta-llama/Llama-3.2-3B"
base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")

# Attach the adapter, switch to eval mode, then move the model to the GPU.
expert_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16")
expert_model = expert_model.eval()
expert_model = expert_model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from FinMoE import FinMoE

# FinMoE checkpoint to evaluate.
ckpt_path = Path("D:/models/FinMoE-v1") / "checkpoint-34464"

# Load, move to the GPU, then switch to eval mode.
finMoE_model = FinMoE.load_pretrained(ckpt_path)
finMoE_model = finMoE_model.to(device)
finMoE_model = finMoE_model.eval()

## For FPB and Headline

In [3]:
def eval_preprocess_a(example, max_length=512):
    zeroshot = example['input'].rsplit("\n\n", maxsplit=1)[-1]
    return tokenizer(zeroshot,
                     truncation=True,
                     padding="max_length",
                     max_length=max_length,
                     return_tensors="pt")

In [4]:
# Which AdaptLLM finance task to evaluate.
dataset_id = "FPB"
testset_adaptllm = load_dataset("AdaptLLM/finance-tasks", dataset_id, split="test")
# Tokenize example by example.
testset_adaptllm = testset_adaptllm.map(eval_preprocess_a, batched=False)

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

In [5]:
# Hard-coded token ids of the answer options for each task; the trailing
# comments give the text each id stands for.
# NOTE(review): the order of the ids presumably has to line up with the gold
# indices used by `evaluate` - verify against evals.py.
tok_options = {
    "FPB": [59794, 45003, 51957],    # " Neutral", " Positive", " Negative"
    "Headline": [7566, 2360],        # " Yes", " No"
}

# Evaluate the FinMoE model on the test set selected by `dataset_id`.
results = evaluate(finMoE_model, tokenizer,
                   testset_adaptllm,
                   guidance=True,
                   tok_opts=tok_options[dataset_id])
print(results)

28.03:  16%|█▌        | 157/970 [00:54<04:44,  2.86it/s]


KeyboardInterrupt: 

## Topics

In [10]:
def eval_preprocess_b(example, max_length=512):
    """Render the Topics prompt for one example and tokenize it.

    Args:
        example: One dataset row with a ``"text"`` field.
        max_length: Length the prompt is truncated / padded to.

    Returns:
        The tokenizer output, requested as PyTorch tensors.
    """
    encode_kwargs = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    topic_prompt = prompt_templates["Topics"].format(example["text"])
    return tokenizer(topic_prompt, **encode_kwargs)

# One option string per topic index ("0", "1", ...).
# NOTE: this rebinds `topic_options`, which the dataset-setup cell defines as a
# newline-joined string.
topic_options = list(map(str, range(len(topics))))


def add_options(example):
    """Store the shared list of option strings under ``example["options"]`` and return the example."""
    example["options"] = topic_options
    return example

In [11]:
dataset_id = "Topics"

dataset_path = paths[dataset_id]
testset_df = pd.read_csv(
    dataset_path / "test.csv",
    delimiter=del_mapping[dataset_id],
    names=names_mapping[dataset_id],
)

# Tokenize, attach the option strings, then rename the label column to "gold_index".
testset_topics = Dataset.from_pandas(testset_df)
testset_topics = testset_topics.map(eval_preprocess_b, batched=False)
testset_topics = testset_topics.map(add_options, batched=False)
testset_topics = testset_topics.rename_column("label", "gold_index")

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

In [12]:
# First content token id of every option string. Special tokens are disabled so
# that index 0 is always the option's own first token; the previous `arr[1]`
# only worked when the tokenizer prepends exactly one special token. This also
# matches how `token_list` is derived in the dataset-setup cell.
tok_opts_ids = tokenizer(topic_options, add_special_tokens=False)["input_ids"]
tok_opts = [option_ids[0] for option_ids in tok_opts_ids]

In [None]:
# Evaluate the adapter-based expert model on the Topics test set, with the
# candidate answers restricted to the option token ids in `tok_opts`.
results = evaluate(expert_model, tokenizer,
                   testset_topics,
                   guidance=True,
                   tok_opts=tok_opts)
print(results)

82.59: 100%|██████████| 850/850 [02:47<00:00,  5.07it/s]

{'accuracy': 0.8258823529411765}



