In [None]:
import pandas as pd
from pathlib import Path
from functools import partial
from huggingface_hub import constants as hub_c
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Destination directory for the processed dataset.
out_path = Path("D:/datasets/fsa")

# Tokenizer of the checkpoint the data is prepared for.
model_id = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the end-of-sequence token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [2]:
def get_hf_path(dir_: str | Path) -> Path:
    path = Path(hub_c.HF_HUB_CACHE) / dir_
    with open(path / "refs" / "main", "r") as f_in:
        snapshot_ref = f_in.readline()
    return path / "snapshots" / snapshot_ref

In [3]:
# Snapshot directories of the two cached source datasets (FPB first, FiQA-SA second).
fpb_path, fiqa_path = map(
    get_hf_path,
    ("datasets--AdaptLLM--FPB", "datasets--AdaptLLM--FiQA_SA"),
)

# Integer ids for the three sentiment classes, and the inverse lookup derived from them.
label2id = {"negative": 0, "positive": 1, "neutral": 2}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

In [4]:
def tokenize_func(tokenizer, examples):
    """Call ``tokenizer`` on the ``"text"`` entry of ``examples`` and return its result.

    ``tokenizer`` is the first parameter so it can be pre-bound with
    ``functools.partial``, leaving a one-argument callable over ``examples``.
    """
    texts = examples["text"]
    return tokenizer(texts)

In [None]:
# Build the combined FPB + FiQA-SA dataset per split, tokenize it, and save it.

# The tokenizer binding is the same for every split, so create the callable once.
tok_fn = partial(tokenize_func, tokenizer)

out_dataset = DatasetDict()
for split in ["train", "test"]:
    # FPB: tab-separated, first column used as the index; string labels -> ids.
    fpb_df = pd.read_csv(fpb_path / f"{split}.csv", delimiter="\t", index_col=0)
    fpb_df["label"] = fpb_df["label"].map(label2id).astype("Int16")

    # FiQA-SA: read with explicit column names (so no header row is consumed);
    # the ticker column is not used and is dropped.
    fiqa_df = pd.read_csv(
        fiqa_path / f"{split}.csv",
        delimiter="\t",
        names=["text", "ticker", "label"],
    ).drop("ticker", axis="columns")
    # Encode FiQA labels the same way as FPB when they are stored as strings;
    # already-numeric labels are passed through untouched.
    # NOTE(review): confirm which encoding the FiQA csv actually uses.
    if not pd.api.types.is_numeric_dtype(fiqa_df["label"]):
        fiqa_df["label"] = fiqa_df["label"].map(label2id).astype("Int16")

    # ignore_index / preserve_index=False: FPB carries a non-default index
    # (index_col=0) which would otherwise be stored in the dataset as an extra
    # `__index_level_0__` column.
    final_df = pd.concat([fpb_df, fiqa_df], ignore_index=True)
    out_dataset[split] = (
        Dataset.from_pandas(final_df, preserve_index=False)
        .map(tok_fn, batched=True, num_proc=8, remove_columns=["text"])
        .rename_column("label", "labels")
        .with_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    )

out_dataset.save_to_disk(str(out_path))