In [1]:
import torch
import pandas as pd
from functools import partial
from pathlib import Path
from datasets import Dataset, interleave_datasets
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from transformers import Trainer, TrainingArguments, AutoTokenizer, DataCollatorForLanguageModeling
from transformers.models.llama.modeling_llama import LlamaForCausalLM

from FinMoE import FinMoE, FinMoEConfig
from utils import DatasetArgs, get_dataset_args

# --- Notebook-wide constants -------------------------------------------------
seed = 42                                   # shared by torch and dataset interleaving
MAX_LENGTH = 512                            # every example is padded/truncated to this
dataset_cache_path = Path(r"D:/datasets/general-3-tasks")
model_id = "meta-llama/Llama-3.2-1B"

# --- Device / RNG ------------------------------------------------------------
# Models below are moved to `device`, so stop immediately when no GPU is present.
assert torch.cuda.is_available(), "CUDA not available"
device = torch.device("cuda")
torch.manual_seed(seed)

# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Padding to MAX_LENGTH needs a pad token; reuse EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Dataset setup

In [2]:
# Local Hugging Face hub cache, resolved from the current user's home directory
# instead of a hard-coded per-user absolute path so the notebook runs on other
# machines/accounts. For the original author this resolves to the same folder
# (C:\Users\samba\.cache\huggingface\hub).
hub_basepath = Path.home() / ".cache" / "huggingface" / "hub"

# `args` carries the per-dataset settings used throughout the notebook
# (paths, prompt templates, id2labels, token_list, ...).
args = get_dataset_args(tokenizer, hub_basepath)

## Load and Preprocess Dataset

In [3]:
def train_preprocess_causal(args: DatasetArgs, dataset_id: str, example: dict):
    """Tokenize one example as ``prompt + target`` for causal-LM fine-tuning.

    The prompt is ``args.prompt_templates[dataset_id]`` formatted with the example
    fields listed in ``args.prompt_args[dataset_id]``; the target is the label text
    ``args.id2labels[dataset_id][example["label"]]``.

    Args:
        args: Dataset settings providing ``prompt_args``, ``prompt_templates`` and
            ``id2labels``, each keyed by dataset id.
        dataset_id: Key selecting the dataset-specific settings in ``args``.
        example: One dataset row; must contain ``"label"`` and every prompt field.

    Returns:
        The tokenizer output for the full text (padded/truncated to ``MAX_LENGTH``)
        with an added ``"labels"`` list. Prompt positions and padding positions
        are set to -100 (the index PyTorch's cross-entropy loss ignores by
        default), so only the target tokens keep their ids.
    """
    # Create prompt and target text
    prompt_args = [example[key] for key in args.prompt_args[dataset_id]]
    prompt = args.prompt_templates[dataset_id].format(*prompt_args)

    target = args.id2labels[dataset_id][example["label"]]
    full_text = prompt + target

    # Tokenize the full text, padded to a fixed length
    tokenized = tokenizer(full_text,
                          truncation=True,
                          padding="max_length",
                          max_length=MAX_LENGTH)

    # Tokenize the prompt alone (no padding) to learn how many leading tokens
    # belong to it.
    # NOTE(review): assumes the prompt's tokens are a prefix of the full text's
    # tokens (no token merge across the prompt/target boundary) - confirm.
    prompt_tokenized = tokenizer(prompt,
                                 truncation=True,
                                 max_length=MAX_LENGTH)
    prompt_length = len(prompt_tokenized["input_ids"])

    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    # Mask padding by attention mask rather than by token id: the pad token is
    # the EOS token here, so id-based masking could not tell them apart, and
    # leaving padding unmasked would make the loss dominated by pad positions.
    labels = [token_id if is_real else -100
              for token_id, is_real in zip(input_ids, attention_mask)]

    # The prompt starts at the first real token; this is index 0 with right
    # padding and the end of the pad run with left padding.
    prompt_start = attention_mask.index(1) if 1 in attention_mask else 0
    prompt_end = min(prompt_start + prompt_length, len(labels))
    labels[prompt_start:prompt_end] = [-100] * (prompt_end - prompt_start)

    tokenized["labels"] = labels
    return tokenized

def train_preprocess_tokenclass(args: DatasetArgs, dataset_id: str, example: dict):
    """Tokenize one example's prompt and attach a single class-index label.

    The prompt is tokenized alone (padded/truncated to ``MAX_LENGTH``). The label
    is the position, within ``args.token_list``, of the first token id obtained
    by encoding the example's label text without special tokens.

    Args:
        args: Dataset settings providing ``prompt_args``, ``prompt_templates``,
            ``id2labels`` (keyed by dataset id) and ``token_list``.
        dataset_id: Key selecting the dataset-specific settings in ``args``.
        example: One dataset row; must contain ``"label"`` and every prompt field.

    Returns:
        The tokenizer output for the prompt with an integer ``"labels"`` entry.

    Raises:
        ValueError: If the label's first token id is not in ``args.token_list``.
    """
    # Fill the dataset's prompt template with the example's fields.
    template = args.prompt_templates[dataset_id]
    field_values = [example[field] for field in args.prompt_args[dataset_id]]
    prompt_text = template.format(*field_values)

    encoded = tokenizer(
        prompt_text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    # Only the first token of the label text identifies the class.
    label_text = args.id2labels[dataset_id][example["label"]]
    label_token_ids = tokenizer.encode(label_text, add_special_tokens=False)
    first_label_token = label_token_ids[0]
    encoded["labels"] = args.token_list.index(first_label_token)

    return encoded

In [None]:
# Rows to read from each dataset's train.csv, in the iteration order of args.paths.
nrows_list = [3876] * 3
dataset_list = []
for i, (dataset_id, dataset_path) in enumerate(args.paths.items()):
    # Each CSV is read with its dataset-specific delimiter and column names.
    train_subset = pd.read_csv(
        dataset_path / "train.csv",
        delimiter=args.del_mapping[dataset_id],
        names=args.names_mapping[dataset_id],
        nrows=nrows_list[i],
    )

    # Bind args/dataset_id so `map` can call the preprocessor with just the row.
    preprocess_func = partial(train_preprocess_tokenclass, args, dataset_id)
    tokenized_subset = Dataset.from_pandas(train_subset).map(preprocess_func, batched=False)
    # Drop the columns listed for this dataset once the tokenized fields exist.
    dataset_list.append(tokenized_subset.remove_columns(args.columns[dataset_id]))

# Sample from every dataset with equal probability, seeded for reproducibility.
n_datasets = len(dataset_list)
uniform_probabilities = [1/n_datasets]*n_datasets
train_dataset = interleave_datasets(
    dataset_list,
    probabilities=uniform_probabilities,
    seed=seed,
)

In [None]:
# Persist the preprocessed training set next to the dataset cache directory:
# with_stem swaps the last path component of dataset_cache_path for the new name.
# The load cell below rebuilds the same path, so keep the two names in sync.
save_path = dataset_cache_path.with_stem("finmoe-tokenclass_balanced-len512")
train_dataset.save_to_disk(save_path)

In [3]:
# Reload the preprocessed training set from disk instead of rebuilding it.
# The path must match the one used by the save cell above.
load_path = dataset_cache_path.with_stem("finmoe-tokenclass_balanced-len512")
train_dataset = Dataset.load_from_disk(load_path)

## Training

### Peft Model trainer

In [None]:
# Base model: Llama 3.2 1B loaded with float16 weights.
model_id = "meta-llama/Llama-3.2-1B"
base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")

# LoRA adapter on the attention query/value projections only.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
)

# Wrap the base model with the adapter and report how many parameters will train.
peft_model = get_peft_model(base_model, peft_config)
peft_model.print_trainable_parameters()

In [None]:
# Checkpoints are written under this directory, once per epoch.
out_dir = Path(r"D:/models/general-Llama-3_2-1B")
training_args = TrainingArguments(
    output_dir=str(out_dir),
    do_train=True,
    num_train_epochs=6,
    per_device_train_batch_size=2,
    learning_rate=1e-3,
    warmup_steps=1000,
    weight_decay=0.01,
    save_strategy="epoch",
)

# Causal-LM collation (mlm=False), i.e. no masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

### FinMoE trainer

In [None]:
ckpt_base = Path(r"D:/models")

# Checkpoint sub-directory to load for each expert adapter.
expert_ckpt_names = dict.fromkeys(("FPB", "Headline", "Topics"), "checkpoint-best")

# One checkpoint path per expert, in args.expert_order.
# Paths are converted to str because Path objects are not JSON serializable.
expert_ckpts = [
    str(ckpt_base.joinpath(f"expert-Llama-3_2-1B-{expert_name}",
                           expert_ckpt_names[expert_name]))
    for expert_name in args.expert_order
]

# Alternatives kept for reference:
#   loss_type="ForCausalLM"   (instead of "ForTokenClassification")
#   gating_guassian=0.2       (currently not passed)
finMoE_config = FinMoEConfig(
    loss_type="ForTokenClassification",
    num_labels=len(args.token_list),
    expert_ckpts=expert_ckpts,
    token_list=args.token_list,
)

finMoE_model = FinMoE(finMoE_config).to(device)
print("Memory allocated:", torch.cuda.memory_allocated())

# List every parameter that will receive gradients.
print("Trainable params:")
for name, params in finMoE_model.named_parameters():
    if not params.requires_grad:
        continue
    print(name, params.shape)


## --- use data_collator when training with "ForCausalLM" loss
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer, mlm=False
# )

# Short run: warm up for the first half of the steps and save a single
# checkpoint at the final step.
max_steps = 64
out_dir = Path(r"D:/models/FinMoE-top3-64steps")
training_args = TrainingArguments(
    output_dir=str(out_dir),
    do_train=True,
    max_steps=max_steps,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-3,
    warmup_steps=max_steps // 2,
    weight_decay=0.01,
    logging_steps=4,
    save_strategy="steps",
    save_steps=max_steps,
)

trainer = Trainer(
    model=finMoE_model,
    args=training_args,
    train_dataset=train_dataset,
    # data_collator=data_collator,
)

Memory allocated: 4953514496
Trainable params:
gate.w_gate.weight torch.Size([3, 2048])


max_steps is given, it will override any value given in num_train_epochs


### Train

In [5]:
# Runs whichever `trainer` was built last (PEFT or FinMoE cell above).
# Left as the bare last expression so the returned TrainOutput is displayed.
trainer.train()

  0%|          | 0/64 [00:00<?, ?it/s]

{'loss': 19.1301, 'grad_norm': 1840.8421630859375, 'learning_rate': 0.000125, 'epoch': 0.01}
{'loss': 3.9299, 'grad_norm': 107.18506622314453, 'learning_rate': 0.00025, 'epoch': 0.01}
{'loss': 2.1476, 'grad_norm': 16.01112174987793, 'learning_rate': 0.000375, 'epoch': 0.02}
{'loss': 0.8053, 'grad_norm': 24.19095230102539, 'learning_rate': 0.0005, 'epoch': 0.02}
{'loss': 2.3128, 'grad_norm': 55.93996810913086, 'learning_rate': 0.000625, 'epoch': 0.03}
{'loss': 2.9351, 'grad_norm': 21.010822296142578, 'learning_rate': 0.00075, 'epoch': 0.03}
{'loss': 0.6664, 'grad_norm': 16.120859146118164, 'learning_rate': 0.000875, 'epoch': 0.04}
{'loss': 2.4318, 'grad_norm': 0.29755476117134094, 'learning_rate': 0.001, 'epoch': 0.04}
{'loss': 1.6959, 'grad_norm': 46.28948974609375, 'learning_rate': 0.000875, 'epoch': 0.05}
{'loss': 2.8238, 'grad_norm': 111.68728637695312, 'learning_rate': 0.00075, 'epoch': 0.06}
{'loss': 0.6024, 'grad_norm': 0.11406492441892624, 'learning_rate': 0.000625, 'epoch': 0.0

TrainOutput(global_step=64, training_loss=3.00837929174304, metrics={'train_runtime': 494.4294, 'train_samples_per_second': 2.071, 'train_steps_per_second': 0.129, 'total_flos': 3069312478740480.0, 'train_loss': 3.00837929174304, 'epoch': 0.08912873183044652})

## Eval

In [3]:
from evals import evaluate, evaluate_FinMoE, evaluate_FinMoE_weighted, load_eval_dataset

In [12]:
# Pick the dataset to evaluate on; `dataset_id` is reused by the evaluation
# cells below to look up per-dataset settings in `args`.
dataset_id = "FPB"
testset = load_eval_dataset(tokenizer, dataset_id, args)

Loading FPB dataset from AdaptLLM/finance-tasks


### Load an expert checkpoint

In [4]:
# LoRA adapter checkpoint produced by the PEFT trainer above.
ckpt_path = Path(r"D:/models/general-Llama-3_2-1B") / "checkpoint-best"

# Load the float16 base model (kept off the GPU at this point), then attach
# the adapter weights.
model_id = "meta-llama/Llama-3.2-1B"
base_model = LlamaForCausalLM.from_pretrained(model_id, torch_dtype="float16")
expert_model = PeftModel.from_pretrained(base_model, ckpt_path, torch_dtype="float16")

# Switch to inference mode and move the combined model to the GPU.
expert_model = expert_model.eval().to(device)

In [None]:
# Evaluate the single expert (base model + LoRA adapter) on the loaded test set,
# using the token options configured for the current `dataset_id`.
# NOTE(review): the effect of guidance=True is defined in evals.evaluate - confirm there.
results = evaluate(expert_model, tokenizer,
                   testset,
                   guidance=True,
                   token_opts=args.token_opts[dataset_id])
print(results)

### Load a FinMoE checkpoint

In [5]:
# Checkpoint of a FinMoE run to evaluate; the commented line selects the
# 64-step run trained in this notebook instead.
ckpt_path = Path(r"D:/models/FinMoE-top3") / "checkpoint-3590"
# ckpt_path = Path(r"D:/models/FinMoE-top3-64steps") / "checkpoint-64"
# Load the model, move it to the GPU and switch it to inference mode.
finMoE_model = FinMoE.load_pretrained(ckpt_path).to(device).eval()

In [6]:
# Evaluate the FinMoE checkpoint on the loaded test set with the token options
# for the current `dataset_id`; the result dict reports accuracy, n_correct
# and n_total.
results = evaluate_FinMoE(finMoE_model, tokenizer,
                          testset,
                          args.token_opts[dataset_id])
print(results)

Routing experts: 100%|██████████| 970/970 [00:50<00:00, 19.17it/s]
Evaluating | 54.95: 100%|██████████| 970/970 [01:01<00:00, 15.76it/s]

{'accuracy': 0.5494845360824743, 'n_correct': 533, 'n_total': 970}





In [13]:
# Evaluate the same FinMoE checkpoint with the weighted evaluation routine.
# In the recorded output, accuracy and n_correct come back as tensors here,
# unlike the plain numbers returned by evaluate_FinMoE above.
results = evaluate_FinMoE_weighted(finMoE_model,
                                   testset,
                                   dataset_id,
                                   args)
print(results)

Routing experts: 100%|██████████| 970/970 [00:49<00:00, 19.60it/s]
Evaluating: 100%|██████████| 2910/2910 [02:58<00:00, 16.28it/s]

{'accuracy': tensor(0.4948), 'n_correct': tensor(480), 'n_total': 970}





In [None]:
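# Parameter-count sanity check: FinMoE = base Llama + 3 LoRA adapters + gate.
# FinMoE    = 1238376448  (total parameters)
# 1 adapter = 851968      (one LoRA expert: r=8 on q_proj and v_proj)
# llama     = 1235814400  (base Llama-3.2-1B)
# w_gate    = 6144        (gate.w_gate.weight, 3 x 2048)
# 1235814400 + 851968 * 3 + 6144 == 1238376448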