In [1]:
from pathlib import Path

import torch
import numpy as np
import pandas as pd

In [2]:
import logging

# Lower the root logger's threshold to INFO so that INFO-level records are
# no longer filtered out by the root logger.
logging.getLogger(name=None).setLevel(level=logging.INFO)

In [3]:
# Root directory under which models, data, and results are stored.
# Switch to one of the commented alternatives when running on another machine.
ROOT_DIR = Path("/fast/groups/sf")
# ROOT_DIR = Path("/fast/acruz/")
# ROOT_DIR = Path("/Users/acruz/")

# Directory that holds the locally stored Hugging Face model folders.
MODELS_DIR = ROOT_DIR / "huggingface-models"
# MODELS_DIR = ROOT_DIR / "data" / "huggingface-models"

# Directory passed as `cache_dir` when the dataset is constructed.
DATA_DIR = ROOT_DIR / "data"

# Model identifier; exactly one of the lines below should be active.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
# MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
# MODEL_NAME = "google/gemma-2b"

# Name of the prediction task used for both the classifier and the dataset.
TASK_NAME = "ACSIncome"

# Parent directory of the per-model results folders.
RESULTS_ROOT_DIR = ROOT_DIR / "folktexts-results"

# Select the compute device. Previously any machine without CUDA was assumed
# to have Apple's MPS backend; now MPS is only chosen when it is actually
# available, and plain CPU is the final fallback.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

In [4]:
from folktexts.llm_utils import load_model_tokenizer, get_model_folder_path

# Resolve the on-disk folder for MODEL_NAME under MODELS_DIR, then load the
# model and its tokenizer from that folder.
model_folder_path = get_model_folder_path(
    root_dir=MODELS_DIR,
    model_name=MODEL_NAME,
)
model, tokenizer = load_model_tokenizer(model_folder_path)

INFO:root:Loading model '/lustre/fast/fast/groups/sf/huggingface-models/meta-llama--Meta-Llama-3-8B'
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

INFO:root:Moving model to device: cuda


In [5]:
# Results for this model live in a sub-folder of RESULTS_ROOT_DIR that is
# named after the model's folder.
results_dir = RESULTS_ROOT_DIR.joinpath(Path(model_folder_path).name)

# parents=False: RESULTS_ROOT_DIR itself must already exist.
results_dir.mkdir(parents=False, exist_ok=True)
results_dir

PosixPath('/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B')

### Construct LLM Classifier

In [6]:
from folktexts.classifier import LLMClassifier

# Wrap the loaded model and tokenizer in a classifier for TASK_NAME.
clf = LLMClassifier(
    task=TASK_NAME,
    model=model,
    tokenizer=tokenizer,
    batch_size=30,
)

### Load Dataset

In [7]:
%%time
from folktexts.acs import ACSDataset
# Build the dataset for TASK_NAME, using DATA_DIR as its cache directory.
# The cell is timed with %%time (about one minute in the recorded run).
dataset = ACSDataset(task_name=TASK_NAME, cache_dir=DATA_DIR)

CPU times: user 41.9 s, sys: 24.7 s, total: 1min 6s
Wall time: 1min 6s


In [8]:
# Optional: uncomment the next line to work on a subsample of the dataset
# (presumably 0.02 is the fraction of rows to keep -- TODO confirm).
#dataset = dataset.subsample(0.02)
# Report the dataset's current subsampling setting.
print(f"{dataset.subsampling=}")

dataset.subsampling=None


### Run ACS Benchmark

In [9]:
import importlib
import folktexts
import folktexts.classifier
from folktexts import classifier

# Re-execute the package and its classifier module so that edits made on disk
# are picked up without restarting the kernel. `classifier` and
# `folktexts.classifier` are the same module object, so it is reloaded once.
# NOTE: objects created before the reload (e.g. an existing `clf` instance and
# the previously imported `LLMClassifier` name) keep referencing the
# pre-reload class definitions; re-create them to use the reloaded code.
importlib.reload(folktexts)
importlib.reload(folktexts.classifier)

<module 'folktexts.classifier' from '/lustre/home/acruz/folktexts/folktexts/classifier.py'>

In [11]:
import folktexts.cli.run_benchmark

# Reload the benchmark module so its current on-disk version is the one used.
importlib.reload(folktexts.cli.run_benchmark)

# Evaluate the classifier's risk scores on the dataset; per the recorded log
# output, results are saved under `results_dir`.
results = folktexts.cli.run_benchmark.run_llm_risk_scores_evaluation(
    results_dir=results_dir,
    dataset=dataset,
    llm_clf=clf,
)

Test data features shape: (166450, 10)


INFO:root:Loaded predictions from /fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/ACSIncome_full_seed-42.test_predictions.csv.
INFO:root:Saving JSON file to '/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/results.json'



** Test results **
Model balanced accuracy:  51.6%;
Model accuracy:           41.3%;
Model ROC AUC :           79.9%;

Saved experiment results to '/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B'


In [12]:
# Draw 100 training examples from the dataset and fit the classifier on them.
# NOTE(review): in the recorded outputs, accuracy changes after this fit while
# ROC AUC stays the same, so `fit` presumably tunes the decision threshold
# rather than the risk scores -- TODO confirm.
X_sample, y_sample = dataset.sample_n_train_examples(n=100)
clf.fit(X_sample, y_sample)

Computing risk estimates:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
# Run the same evaluation again, now with the fitted classifier, and keep the
# outcome under a separate name so it can be compared with `results`.
results_new = folktexts.cli.run_benchmark.run_llm_risk_scores_evaluation(
    results_dir=results_dir,
    dataset=dataset,
    llm_clf=clf,
)

INFO:root:Loaded predictions from /fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/ACSIncome_full_seed-42.test_predictions.csv.


Test data features shape: (166450, 10)


INFO:root:Saving JSON file to '/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/results.json'



** Test results **
Model balanced accuracy:  73.1%;
Model accuracy:           72.8%;
Model ROC AUC :           79.9%;

Saved experiment results to '/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B'


---
### Render evaluation plots from the saved test predictions

In [22]:
# Path of the test-set predictions CSV inside this model's results folder.
test_scores_path = results_dir.joinpath(f"{dataset.get_name()}.test_predictions.csv")
assert test_scores_path.exists()

# The first CSV column is used as the DataFrame index.
y_test_scores_df = pd.read_csv(filepath_or_buffer=test_scores_path, index_col=0)
y_test_scores_df.head(n=5)

Unnamed: 0,risk_score,label
2204045,0.592628,0
620893,0.562312,1
2824323,0.562196,1
1600833,0.562072,0
2331470,0.622403,1


In [18]:
# Pull the label and risk-score columns out of the predictions table as
# NumPy arrays.
y_test, y_test_scores = (
    y_test_scores_df[column].to_numpy() for column in ("label", "risk_score")
)

In [23]:
# Reload the plotting module so its current on-disk version is used, then
# render the evaluation plots into `results_dir`. In the recorded run the call
# returned a dict mapping plot names to the paths of the saved PDF files.
# NOTE(review): `sensitive_attribute` is not defined in any cell visible in
# this notebook, so this cell presumably relies on leftover kernel state and
# would raise NameError on a fresh "Restart & Run All" -- TODO define it
# explicitly before this call.
import folktexts.plotting
import importlib
importlib.reload(folktexts.plotting)

folktexts.plotting.render_evaluation_plots(
    y_true=y_test,
    y_pred_scores=y_test_scores,
    sensitive_attribute=sensitive_attribute,
    imgs_dir=results_dir,
    # Switch to `results` to use the evaluation run before `clf.fit`.
    # eval_results=results,
    eval_results=results_new,
    # Path(...).name drops the organisation prefix from MODEL_NAME.
    model_name=Path(MODEL_NAME).name,
)



{'roc_curve_path': '/lustre/fast/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/roc_curve.pdf',
 'calibration_curve_path': '/lustre/fast/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/calibration_curve.pdf',
 'score_distribution_path': '/lustre/fast/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/score_distribution.pdf',
 'score_distribution_per_label_path': '/lustre/fast/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/score_distribution_per_label.pdf',
 'roc_curve_per_subgroup_path': '/lustre/fast/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/roc_curve_per_subgroup.pdf',
 'calibration_curve_per_subgroup_path': '/lustre/fast/fast/groups/sf/folktexts-results/meta-llama--Meta-Llama-3-8B/calibration_curve_per_subgroup.pdf'}