In [1]:
import sys

# Add the directory two levels above the working directory to the import path.
# NOTE(review): presumably this is what lets the `src.hyperdas` import in the
# next cell resolve; the path is relative to the kernel's working directory,
# so confirm the notebook is launched from its own folder.
sys.path.append("../..")
# Enable IPython's autoreload extension; mode 2 reloads all modules before
# each cell execution so edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os

import datasets
import torch
from transformers import AutoTokenizer, LlamaForCausalLM

from src.hyperdas.data_utils import (
    filter_dataset,
    generate_ravel_dataset,
)

In [3]:
# Inspect which attributes have prompt templates for the nobel_prize_winner domain.
prompts_path = "/workspace/HyperDAS/assets/data/ravel/ravel_nobel_prize_winner_attribute_to_prompts.json"
with open(prompts_path, "r") as fh:
    prompts = json.load(fh)
# The file handle is no longer needed once the JSON is parsed.
print(prompts.keys())

dict_keys(['Field', 'Award Year', 'Birth Year', 'Country of Birth', 'Gender'])


In [4]:
# Inspect which attributes have prompt templates for the occupation domain.
prompts_path = "/workspace/HyperDAS/assets/data/ravel/ravel_occupation_attribute_to_prompts.json"
with open(prompts_path, "r") as fh:
    prompts = json.load(fh)
# The file handle is no longer needed once the JSON is parsed.
print(prompts.keys())

dict_keys(['Duty', 'Gender Bias', 'Industry', 'Work Location'])


In [5]:
# Model identifier handed to both `AutoTokenizer.from_pretrained` and
# `LlamaForCausalLM.from_pretrained` in the following cell.
model_name_or_path = "meta-llama/Meta-Llama-3-8B"

In [6]:
# Tokenizer: pad on the left and reuse the end-of-sequence token as the
# padding token (both the string and its id are set explicitly).
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Model: load weights as bfloat16; the device map assigns the root module
# (key "") to device index 0.
model = LlamaForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# Build filtered RAVEL datasets for three domains (city, nobel_prize_winner,
# occupation), for both the train and test splits, and collect them per split
# in `all_datasets` as (dataset, metadata) pairs.
#
# NOTE(review): presumably the edit-instruction format in use is
# CHANGE: {base_entity} → {source_entity} | ATTR: {target_attribute}
from collections import defaultdict

# Attributes considered for each domain.
# NOTE(review): the occupation prompts file inspected earlier in this notebook
# also lists "Gender Bias"; it is not included here — confirm that is intended.
all_attributes = {
    "city": ["Country", "Continent", "Language", "Latitude", "Longitude", "Timezone"],
    "nobel_prize_winner": [
        "Field",
        "Award Year",
        "Birth Year",
        "Country of Birth",
        "Gender",
    ],
    "occupation": ["Duty", "Industry", "Work Location"],
}

# Attribute(s) passed as `target_attributes` for each domain; every other
# attribute of the domain is passed as `isolate_attributes`.
target_attributes = {
    "city": ["Country"],
    "nobel_prize_winner": ["Field"],
    "occupation": ["Duty"],
}


domains = ["city", "nobel_prize_winner", "occupation"]
# Number of consecutive `filter_dataset` passes. The recorded outputs show each
# extra pass still removing a few examples, so a single pass is not a fixpoint.
n_filter_passes = 3
all_datasets = defaultdict(list)

for domain in domains:
    for split in ["train", "test"]:
        args = {
            "n_samples": 10000,
            "root_path": "/workspace/HyperDAS/assets/data/ravel",
            "target_attributes": target_attributes[domain],
            # Preserve declaration order rather than using a set difference:
            # iteration order of a set of strings changes between interpreter
            # runs (hash randomisation), which made this list — and the
            # metadata saved from it — non-reproducible.
            "isolate_attributes": [
                attribute
                for attribute in all_attributes[domain]
                if attribute not in target_attributes[domain]
            ],
            "template_split": split,
            "entity_split": split,
            "domain": domain,
            # "edit_instruction_template": "CHANGE: {base_entity} -> {source_entity} | ATTR: {random_target_attribute}",
        }

        dataset = generate_ravel_dataset(**args)

        for _ in range(n_filter_passes):
            dataset = filter_dataset(model, tokenizer, dataset, batch_size=16)

        # Record the generation arguments alongside the dataset; the two
        # attribute lists are stored as tuples.
        metadata = {
            **args,
            "target_attributes": tuple(args["target_attributes"]),
            "isolate_attributes": tuple(args["isolate_attributes"]),
        }

        all_datasets[split].append((dataset, metadata))


0it [00:00, ?it/s]

625it [00:42, 14.86it/s]


Accuracy: 0.6333; filtered out 3667 examples


396it [00:26, 15.02it/s]


Accuracy: 0.9958945207642508; filtered out 26 examples


395it [00:26, 15.04it/s]


Accuracy: 0.9995243380371016; filtered out 3 examples


625it [00:41, 15.00it/s]


Accuracy: 0.6186; filtered out 3814 examples


387it [00:25, 15.15it/s]


Accuracy: 0.9964435822825736; filtered out 22 examples


386it [00:25, 15.18it/s]


Accuracy: 0.9990266060999351; filtered out 6 examples


625it [00:43, 14.45it/s]


Accuracy: 0.7362; filtered out 2638 examples


461it [00:32, 14.22it/s]


Accuracy: 0.9980983428416191; filtered out 14 examples


460it [00:32, 14.21it/s]


Accuracy: 0.9986390854654328; filtered out 10 examples


625it [00:43, 14.53it/s]


Accuracy: 0.715; filtered out 2850 examples


447it [00:31, 14.26it/s]


Accuracy: 0.9952447552447552; filtered out 34 examples


445it [00:31, 14.27it/s]


Accuracy: 0.9977515458122541; filtered out 16 examples


625it [00:45, 13.60it/s]


Accuracy: 0.2814; filtered out 7186 examples


176it [00:12, 13.62it/s]


Accuracy: 0.9832977967306326; filtered out 47 examples


173it [00:12, 13.63it/s]


Accuracy: 0.9916877484640405; filtered out 23 examples


625it [00:45, 13.66it/s]


Accuracy: 0.249; filtered out 7510 examples


156it [00:11, 13.48it/s]


Accuracy: 0.9863453815261044; filtered out 34 examples


154it [00:11, 13.44it/s]

Accuracy: 0.995114006514658; filtered out 12 examples





In [9]:
# For each split, merge the per-domain datasets into one dataset, write it to
# disk, and store the generation metadata next to it as metadata.json.
for split, entries in all_datasets.items():
    # Each entry is a (dataset, metadata) pair; unzip into parallel tuples.
    split_datasets, split_metadata = zip(*entries)
    out_dir = f"/workspace/HyperDAS/experiments/RAVEL/data/city_nobel_prize_winner_occupation_{split}"

    merged = datasets.concatenate_datasets(split_datasets)
    merged.save_to_disk(out_dir)

    metadata_file = os.path.join(out_dir, "metadata.json")
    with open(metadata_file, "w") as fh:
        json.dump({"metadata": split_metadata}, fh)

Saving the dataset (0/1 shards):   0%|          | 0/16386 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15702 [00:00<?, ? examples/s]

In [None]:
# Build filtered RAVEL datasets for two domains (city, nobel_prize_winner),
# for both the train and test splits, and collect them per split in
# `all_datasets` as (dataset, metadata) pairs. Running this cell replaces any
# `all_datasets` built by an earlier cell.
#
# NOTE(review): presumably the edit-instruction format in use is
# CHANGE: {base_entity} → {source_entity} | ATTR: {target_attribute}
from collections import defaultdict

# Attributes considered for each domain.
all_attributes = {
    "city": ["Country", "Continent", "Language", "Latitude", "Longitude", "Timezone"],
    "nobel_prize_winner": [
        "Field",
        "Award Year",
        "Birth Year",
        "Country of Birth",
        "Gender",
    ],
}

# Attribute(s) passed as `target_attributes` for each domain; every other
# attribute of the domain is passed as `isolate_attributes`.
target_attributes = {
    "city": ["Country"],
    "nobel_prize_winner": ["Field"],
}


domains = ["city", "nobel_prize_winner"]
# Number of consecutive `filter_dataset` passes applied to each dataset.
n_filter_passes = 3
all_datasets = defaultdict(list)

for domain in domains:
    for split in ["train", "test"]:
        args = {
            "n_samples": 10000,
            "root_path": "/workspace/HyperDAS/assets/data/ravel",
            "target_attributes": target_attributes[domain],
            # Preserve declaration order rather than using a set difference:
            # iteration order of a set of strings changes between interpreter
            # runs (hash randomisation), which made this list — and the
            # metadata saved from it — non-reproducible.
            "isolate_attributes": [
                attribute
                for attribute in all_attributes[domain]
                if attribute not in target_attributes[domain]
            ],
            "template_split": split,
            "entity_split": split,
            "domain": domain,
            # "edit_instruction_template": "CHANGE: {base_entity} -> {source_entity} | ATTR: {random_target_attribute}",
        }

        dataset = generate_ravel_dataset(**args)

        for _ in range(n_filter_passes):
            dataset = filter_dataset(model, tokenizer, dataset, batch_size=16)

        # Record the generation arguments alongside the dataset; the two
        # attribute lists are stored as tuples.
        metadata = {
            **args,
            "target_attributes": tuple(args["target_attributes"]),
            "isolate_attributes": tuple(args["isolate_attributes"]),
        }

        all_datasets[split].append((dataset, metadata))


In [None]:
# For each split, merge the per-domain datasets into one dataset, write it to
# disk, and store the generation metadata next to it as metadata.json.
for split, entries in all_datasets.items():
    # Each entry is a (dataset, metadata) pair; unzip into parallel tuples.
    split_datasets, split_metadata = zip(*entries)
    out_dir = f"/workspace/HyperDAS/experiments/RAVEL/data/city_nobel_prize_winner_{split}"

    merged = datasets.concatenate_datasets(split_datasets)
    merged.save_to_disk(out_dir)

    metadata_file = os.path.join(out_dir, "metadata.json")
    with open(metadata_file, "w") as fh:
        json.dump({"metadata": split_metadata}, fh)