In [141]:
from transformer_lens import HookedTransformer
import transformer_lens.utils as utils

In [142]:
import pandas as pd
import numpy as np
import re
import random
import json
from datasets import Dataset, Features, Value, ClassLabel, Sequence

In [143]:
TOKEN_NAME_SIZE = 1
THRESHOLD = 0.8
DATASET_SIZE = 75
TEMPLATE_TYPE = "subject_with_name"
MODEL_NAME =  "gpt2-small" # "facebook/opt-125m", 
PROMPT_TYPE_SIZE = 50

NAMES_FILEPATH = "../../datasets/names.csv"
DATASET_PATH = "../../datasets/names_dataset.csv"

In [144]:
model = HookedTransformer.from_pretrained(
    MODEL_NAME,
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=True
)

she_token = model.to_tokens(f" she", prepend_bos=False)[0].tolist()[0]
he_token  = model.to_tokens(f" he", prepend_bos=False)[0].tolist()[0]

she_token, he_token

Loaded pretrained model gpt2-small into HookedTransformer


(673, 339)

In [158]:
model.reset_hooks(including_permanent=True)
# example_prompt = "The nation heard that Laura had announced how"
# example_prompt = "If I'm not mistaken, nobody was talking about Lisa because"
example_prompt = "If I'm not mistaken, Lisa acknowledged Christopher because"
example_answer = " she"
utils.test_prompt(example_prompt, example_answer, model, prepend_bos=True, top_k=10)

Tokenized prompt: ['<|endoftext|>', 'If', ' I', "'m", ' not', ' mistaken', ',', ' Lisa', ' acknowledged', ' Christopher', ' because']
Tokenized answer: [' she']


Top 0th token. Logit: 16.53 Prob: 31.34% Token: | he|
Top 1th token. Logit: 16.44 Prob: 28.54% Token: | she|
Top 2th token. Logit: 15.56 Prob: 11.92% Token: | of|
Top 3th token. Logit: 14.52 Prob:  4.21% Token: | they|
Top 4th token. Logit: 13.79 Prob:  2.02% Token: | "|
Top 5th token. Logit: 13.65 Prob:  1.75% Token: | the|
Top 6th token. Logit: 13.57 Prob:  1.62% Token: |,|
Top 7th token. Logit: 13.55 Prob:  1.60% Token: | Christopher|
Top 8th token. Logit: 13.48 Prob:  1.49% Token: | it|
Top 9th token. Logit: 13.34 Prob:  1.29% Token: | his|


In [146]:
# model = HookedTransformer.from_pretrained(
#     "facebook/opt-125m",
#     center_unembed=True,
#     center_writing_weights=True,
#     fold_ln=True,
#     refactor_factored_attn_matrices=True
# )
# 
# she_token = model.to_tokens(f" she", prepend_bos=False)[0].tolist()[0]
# he_token  = model.to_tokens(f" he", prepend_bos=False)[0].tolist()[0]
# 
# she_token, he_token

In [147]:
df = pd.read_csv(NAMES_FILEPATH)
df.head()

Unnamed: 0,name,F,M,name_weight,F_prop,M_prop,F_weighted,M_weighted,F_weighted_norm,M_weighted_norm,gpt2-small,gpt2-small-size,gpt2-medium,gpt2-medium-size,gpt2-large,gpt2-large-size,facebook/opt-125m,facebook/opt-125m-size,EleutherAI/gpt-neo-125M,EleutherAI/gpt-neo-125M-size
0,Aaban,0,87,2.550345e-07,0.0,1.0,0.0,2.550345e-07,0.0,0.000505,31745094,2,31745094,2,31745094,2,8326528,2,31745094,2
1,Aabha,28,0,8.208008e-08,1.0,0.0,8.208008e-08,0.0,0.000166,0.0,3173973099,3,3173973099,3,3173973099,3,838731999,3,3173973099,3
2,Aabid,0,5,1.465716e-08,0.0,1.0,0.0,1.465716e-08,0.0,2.9e-05,317397312,3,317397312,3,317397312,3,83873808,3,317397312,3
3,Aabriella,15,0,4.397147e-08,1.0,0.0,4.397147e-08,0.0,8.9e-05,0.0,31739738012627,4,31739738012627,4,31739738012627,4,8387310698461,4,31739738012627,4
4,Aada,5,0,1.465716e-08,1.0,0.0,1.465716e-08,0.0,3e-05,0.0,3174763,2,3174763,2,3174763,2,832095,2,3174763,2


In [148]:
def fill_prompt(prompt: str, 
                fillers: dict) -> str:
    """
    Fills a prompt template by replacing placeholders with actual values.

    Args:
        prompt (str): The prompt template containing placeholders in square brackets.
        name (str): The name to insert into the prompt.
        fillers (dict): Dictionary with possible replacements for placeholders.

    Returns:
        str: The filled prompt with all placeholders replaced.
    """
    elems = re.findall(r"\[(.*?)\]", prompt)
    for elem in elems:
        if elem in fillers:
            prompt = re.sub(rf"\[{elem}\]", random.choice(fillers[elem]), prompt)

    return prompt

In [149]:
def get_prompts(df: pd.DataFrame, prompt: str) -> pd.DataFrame:
    return pd.DataFrame({
        "original_prompt": df.apply(lambda row: re.sub(r"\[name\]", row["name_1"], prompt), axis=1),
        "corrupted_prompt": df.apply(lambda row: re.sub(r"\[name\]", row["name_2"], prompt), axis=1),
        "ablation_prompt": df.apply(lambda row: re.sub(r"\[name\]", "someone", prompt), axis=1)
    })

def get_name_token_info(model: HookedTransformer, 
                        df: pd.DataFrame, 
                        name_col: str = "name_1", 
                        prompt_col: str = "original_prompt") -> pd.DataFrame:
    """
    Returns token ids, token strings, and token positions for just the name (not the whole prompt) using the provided model.

    Args:
        model: The language model used for tokenization.
        df: DataFrame containing at least the columns with names and prompts.
        name_col: Name of the column containing the name. Default is "name".
        prompt_col: Name of the column containing the prompt. Default is "prompt".

    Returns:
        pd.DataFrame: DataFrame with columns 'name', 'token_ids', 'token_strs', 'token_positions'.
    """
    results = []
    for _, row in df.iterrows():
        name = row[name_col]
        prompt = row[prompt_col]
        
        # Tokenize prompt
        prompt_tokens = model.to_tokens(prompt, prepend_bos=True)[0].tolist()

        # Tokenize the name
        name_tokens = model.to_tokens(f" {name}", prepend_bos=False)[0].tolist()
        name_token_strs = model.to_str_tokens(f" {name}", prepend_bos=False)
        
        # Find positions of name tokens in the prompt
        positions = [ model.get_token_position(single_token=int(token), input=prompt, prepend_bos=True) for token in name_tokens ]

        results.append({
            "subject_name": name,
            "subject_token_ids": ",".join([str(t) for t in name_tokens]),
            "subject_token_strs": ",".join([str(t) for t in name_token_strs]),
            "subject_token_positions": ",".join([str(p) for p in positions]),
            "last_token_position": len(prompt_tokens) - 1
        })
        
    return pd.DataFrame(results)

In [150]:
def create_dataset(names: pd.DataFrame,
                   model: HookedTransformer,
                   prompt: str,
                   prompt_type: int,
                   dataset_size: int = DATASET_SIZE,
                   gdr_prop_thres: float = THRESHOLD,
                   he_token: int = 339,
                   she_token: int = 673
                   ) -> pd.DataFrame:
    """
    Creates a dataset by generating prompts and collecting model responses.

    Args:
        names: DataFrame containing names for prompt generation.
        model: The model used for generating responses.
        prompt: The prompt template to use for generation.
        dataset_size: The desired size of the dataset.
        template_type: The type of template to use.
        gdr_prop_thres: The threshold for gender proportion in the dataset.

    Returns:
        Dataset: The generated dataset.
    """
    males   = names[ names["M_weighted_norm"] > names["F_weighted_norm"] ]
    females = names[ names["F_weighted_norm"] > names["M_weighted_norm"] ]

    m_quantile = males["M_weighted_norm"].quantile(gdr_prop_thres)
    f_quantile = females["F_weighted_norm"].quantile(gdr_prop_thres)
    
    males   = males[ males["M_weighted_norm"] >= m_quantile ]
    females = females[ females["F_weighted_norm"] >= f_quantile ]

    sampled_males   = males.sample(n=dataset_size // 2, weights="M_weighted_norm", replace=False)
    sampled_females = females.sample(n=dataset_size // 2, weights="F_weighted_norm", replace=False)

    data_1 = pd.concat([sampled_males, sampled_females], axis=0).reset_index(drop=True)
    data_2 = pd.concat([sampled_females, sampled_males], axis=0).reset_index(drop=True)

    data_1["expected_token_id"] = he_token
    data_2["expected_token_id"] = she_token    

    data_1 = data_1.add_suffix("_1")
    data_2 = data_2.add_suffix("_2")

    data = pd.concat([data_1, data_2], axis=1).reset_index(drop=True)

    data = data.assign(**get_prompts(data, prompt))
    data = data.assign(**get_name_token_info(model, data))
    data["id"] = range(1, len(data) + 1)
    data["prompt_type"] = prompt_type
    data["expected_token_id"] = data.apply(lambda row: he_token if row["M_weighted_norm_1"] > row["F_weighted_norm_1"] else she_token, axis=1)

    return data

In [151]:
dataset_df = pd.DataFrame()

with open("../../src/json/templates_he_she.json", "r", encoding="utf-8") as f:
    jdata = json.load(f)

    df_filtered = df[ df[f"{MODEL_NAME}-size"] == TOKEN_NAME_SIZE ]
    df_filtered = df_filtered[ ["name", "F_weighted_norm", "M_weighted_norm", f"{MODEL_NAME}", f"{MODEL_NAME}-size"] ]

    for _ in range(PROMPT_TYPE_SIZE):
        prompt      = random.choice(jdata["templates"][TEMPLATE_TYPE]["prompt_templates"])
        complements = jdata["complements"]
        prompt      = fill_prompt(prompt, complements)

        data = create_dataset(names=df_filtered, 
                              model=model, 
                              prompt=prompt, 
                              prompt_type=_,
                              he_token=339 if "gpt" in MODEL_NAME else 37,
                              she_token=673 if "gpt" in MODEL_NAME else 79
                              )

        if len(dataset_df) == 0:
            dataset_df = data
        else:
            dataset_df = pd.concat([dataset_df, data], axis=0).reset_index(drop=True)

In [152]:
features = Features({
    "id": Value("int32"),
    "prompt_type": ClassLabel(names=[f"type_{i}" for i in range(PROMPT_TYPE_SIZE)]),
    "prompts":{
        "org_prompt": Value("string"),
        "corr_prompt": Value("string"),
        "ablated_prompt": Value("string")
    },
    "subject":{
        "token_idxs": Sequence(Value("int32")),
        "tokens": Sequence(Value("string")),
        "pos": Sequence(Value("int32"))
    },
    "end":{
        "pos": Value("int32")
    },
    "expected_token_id": Value("int32")
})

shared_config = {
    "F": {"token": " she",
          "token_id": she_token
        },
    "M": {"token": " he",
          "token_id": he_token
        }
}

In [153]:
dataset_dict = {
    "id": dataset_df["id"].tolist(),
    "prompt_type": dataset_df["prompt_type"].tolist(),
    "prompts":[
        {
            "org_prompt": org,
            "corr_prompt": corr,
            "ablated_prompt": ablated
        } for org, corr, ablated in zip(dataset_df["original_prompt"].tolist(),
                                       dataset_df["corrupted_prompt"].tolist(),
                                       dataset_df["ablation_prompt"].tolist())
    ],
    "subject":[
        {
            "token_idxs": [int(t) for t in token_ids.split(",")],
            "tokens": [str(t) for t in token_strs.split(",")],
            "pos": [int(p) for p in positions.split(",")]
        } for token_ids, token_strs, positions in zip(dataset_df["subject_token_ids"].tolist(),
                                                     dataset_df["subject_token_strs"].tolist(),
                                                     dataset_df["subject_token_positions"].tolist())
    ],
    "end":[
        {"pos": pos} for pos in dataset_df["last_token_position"].tolist()
    ],
    "expected_token_id": dataset_df["expected_token_id"].tolist()
}

In [154]:
final_dataset = Dataset.from_dict(dataset_dict, features=features)

In [155]:
final_dataset["prompts"]["org_prompt"][[ idx for idx in range(1, 3000, 150)]] # type: ignore

['In a moment of clarity, George mentioned how',
 'As people say, Sam confessed when',
 "If I'm not mistaken, Eric (with a smile) decided that",
 'In a moment of clarity, Jose (with excitement) decided how',
 'As people say, Jason acknowledged that',
 'Back then, Cameron confessed in a playful voice how',
 'At that moment, Richard (in a clear voice) admitted why',
 'In a moment of clarity, Jose (with a smile) mentioned that',
 "From what I've heard, Ryan said with hesitation when",
 'As I recall, Jose (to the masses) mentioned that',
 'During the event, Matthew revealed what',
 'At that moment, Jonathan mentioned to the assembly how',
 'Before anyone noticed, Justin announced how',
 'At the party, Julian declared in a loud voice that',
 'During the event, Robert admitted despite the noise what',
 'A few days back, Adrian announced with excitement when',
 'A few days back, Phillip (in a hesitant voice) announced that',
 'Last time Terry confessed in a nervous voice that',
 'In a flash o

In [156]:
final_dataset[158]

{'id': 11,
 'prompt_type': 2,
 'prompts': {'org_prompt': 'As people say, Kevin confessed when',
  'corr_prompt': 'As people say, Helen confessed when',
  'ablated_prompt': 'As people say, someone confessed when'},
 'subject': {'token_idxs': [7939], 'tokens': [' Kevin'], 'pos': [5]},
 'end': {'pos': 7},
 'expected_token_id': 339}

In [157]:
final_dataset.save_to_disk(f"../../datasets/{TEMPLATE_TYPE}_{TOKEN_NAME_SIZE}_tokens{'_opt' if 'opt' in MODEL_NAME else ''}")

Saving the dataset (1/1 shards): 100%|██████████| 3700/3700 [00:00<00:00, 192690.71 examples/s]
