In [None]:
from pathlib import Path

import torch
import numpy as np
import pandas as pd

In [None]:
import logging

# Raising the root level alone is not enough when the root logger has no
# handler: records from library loggers then fall through to Python's
# last-resort handler, which only emits WARNING and above, so INFO is dropped.
# `basicConfig` attaches a stderr handler only if the root logger has none,
# so it is a no-op when the environment already configured logging.
logging.basicConfig(level=logging.INFO)

# Still set the level explicitly: when handlers already existed, the call
# above did nothing and the root level would otherwise stay unchanged.
logging.getLogger().setLevel(logging.INFO)

## Set important local paths

Set your root directory:

In [None]:
# Root under which models, data and results are looked up.
# LOCAL: the current user's home directory, resolved to an absolute path.
ROOT_DIR = Path.home().resolve()
# CLUSTER: uncomment to use the shared cluster location instead.
# ROOT_DIR = Path("/fast/groups/sf")
ROOT_DIR

Directory where LLMs are saved:

In [None]:
# Folder under ROOT_DIR that holds the saved LLMs.
MODELS_DIR = ROOT_DIR.joinpath("huggingface-models")

Directory where data is saved (or will be saved to):

In [None]:
# Folder under ROOT_DIR where data is saved (or will be saved to).
DATA_DIR = ROOT_DIR.joinpath("data")

Other configuration (model, task, and results location):

In [None]:
# --- Model to evaluate (keep exactly one line uncommented) ---
# MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
# MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
# NOTE: this is among the smallest models
MODEL_NAME = "google/gemma-2b"

# --- Prediction task (keep exactly one line uncommented) ---
# TASK_NAME = "ACSIncome"
TASK_NAME = "ACSHealthInsurance-test"

# --- Parent folder for all benchmark results ---
RESULTS_ROOT_DIR = ROOT_DIR.joinpath("folktexts-results")

In [None]:
from folktexts.llm_utils import (
    get_model_folder_path,
    load_model_tokenizer,
)

# Build the model folder path for MODEL_NAME under MODELS_DIR, then load the
# model and tokenizer from that path.
model_folder_path = get_model_folder_path(
    model_name=MODEL_NAME,
    root_dir=MODELS_DIR,
)
model, tokenizer = load_model_tokenizer(model_folder_path)

In [None]:
# Each model gets its own results sub-folder, named after the last component
# of the model folder path.
results_dir = RESULTS_ROOT_DIR.joinpath(Path(model_folder_path).name)

# Create the folder together with any missing parents; an already existing
# folder is not an error.
results_dir.mkdir(parents=True, exist_ok=True)
results_dir

### Construct LLM Classifier

Load prediction task (which maps tabular data to text):

In [None]:
from folktexts.acs import ACSTaskMetadata
# Fetch the task object for TASK_NAME; it is passed to both the classifier
# and the dataset below.
task = ACSTaskMetadata.get_task(TASK_NAME)

In [None]:
from folktexts.classifier import LLMClassifier
# Wrap the loaded model and tokenizer, together with the task, in a classifier.
llm_clf = LLMClassifier(
    model=model,
    tokenizer=tokenizer,
    task=task,
    # presumably the number of rows per inference batch; lower it if memory
    # is tight — TODO confirm against the LLMClassifier documentation
    batch_size=32,
)

### Load Dataset

In [None]:
%%time
from folktexts.acs import ACSDataset
# Build the dataset for `task`; DATA_DIR is passed as the cache directory.
dataset = ACSDataset(task=task, cache_dir=DATA_DIR)

Optionally, subsample to quickly get approximate results:

In [None]:
# Subsample with 0.1 (presumably a fraction, i.e. 10% of the data — TODO
# confirm) and rebind `dataset` to the returned object.
# NOTE(review): because the name is rebound, re-running this cell calls
# `subsample` on the previous result — confirm whether that compounds.
dataset = dataset.subsample(0.1)
# Echo the subsampling value stored on the dataset as a sanity check.
print(f"{dataset.subsampling=}")

### Load and run ACS Benchmark

**_Note:_** Helper constructors exist at `CalibrationBenchmark.make_acs_benchmark` and `CalibrationBenchmark.make_benchmark` that avoid the above boilerplate code.

In [None]:
from folktexts.benchmark import CalibrationBenchmark, BenchmarkConfig

# Wire the classifier and the dataset into a benchmark, using the config
# returned by `BenchmarkConfig.default_config()`.
bench = CalibrationBenchmark(
    llm_clf=llm_clf,
    dataset=dataset,
    config=BenchmarkConfig.default_config(),
)

Optionally, you can fit the model's threshold on a few data samples.

This is generally quite fast because it is _not fine-tuning_: it only changes a single parameter, `llm_clf.threshold`.

In [None]:
%%time
# Draw 100 training examples and fit the classifier on them.
# `X_sample` / `y_sample` stay in the notebook namespace after this cell.
X_sample, y_sample = dataset.sample_n_train_examples(n=100)
llm_clf.fit(X_sample, y_sample)

Run benchmark...

In [None]:
%%time
# Run the benchmark, passing this model's results folder as `results_root_dir`.
bench.run(results_root_dir=results_dir)

In [None]:
# The trailing semicolon keeps the notebook from echoing the call's return value.
bench.plot_results();

In [None]:
from pprint import pprint
# depth=1 prints only the top level of `bench.results`; nested containers
# are elided as `...`.
pprint(bench.results, depth=1)

---