In [1]:
from transformer_lens import HookedTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import sys
import os

# Make the project's ``src`` directory (a sibling of the directory the notebook
# runs from) importable, adding it to sys.path only if it is not already listed.
notebook_dir = os.getcwd()
src_path = os.path.normpath(os.path.join(notebook_dir, '..', 'src'))
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

from utils.dataset import to_token_ids

In [3]:
# Input CSV read by the notebook and output CSV written by its last cell.
source_path = "../datasets/raw_names.csv"
target_path = "../datasets/names.csv"

# Model identifiers handed to HookedTransformer.from_pretrained; each one yields
# a token-id column and a "<model>-size" column in the output table.
model_names = [
    "gpt2-small",
    "gpt2-medium",
    "gpt2-large",
    "facebook/opt-125m",
    "EleutherAI/gpt-neo-125M",
]

In [4]:
# The first column of the raw CSV is an unnamed, serialized row index (it shows up
# as "Unnamed: 0" when read naively). Use it as the frame's index so that only the
# real data columns (name, gender, number) are loaded as data.
df = pd.read_csv(source_path, index_col=0)
df.head()

Unnamed: 0,name,gender,number
0,James,M,5122407
1,John,M,5096818
2,Robert,M,4803587
3,Michael,M,4326215
4,Mary,F,4118147


In [5]:
# Reshape to one row per name with one column per gender value; each cell is the
# summed "number" for that (name, gender) pair, or 0 when the pair never occurs.
# reset_index() turns "name" back into an ordinary column.
df_pivoted = pd.pivot_table(
    df,
    values="number",
    index="name",
    columns="gender",
    aggfunc="sum",
    fill_value=0,
).reset_index()

In [6]:
# Grand total of all counts across both gender columns.
total = df_pivoted["F"].sum() + df_pivoted["M"].sum()

# Each lambda receives the frame as built so far, so later columns can read the
# ones defined above them; columns are added in the order listed.
df_pivoted = df_pivoted.assign(
    # Share of the grand total accounted for by this name (both genders).
    name_weight=lambda t: (t["F"] + t["M"]) / total,
    # Gender proportions within each name.
    F_prop=lambda t: t["F"] / (t["F"] + t["M"]),
    M_prop=lambda t: 1 - t["F_prop"],
    # Global weights: within-name proportion scaled by the name's overall share.
    F_weighted=lambda t: t["F_prop"] * t["name_weight"],
    M_weighted=lambda t: t["M_prop"] * t["name_weight"],
    # Normalized weights: each column is rescaled so that it sums to 1000.
    F_weighted_norm=lambda t: (t["F_weighted"] / t["F_weighted"].sum()) * 1000,
    M_weighted_norm=lambda t: (t["M_weighted"] / t["M_weighted"].sum()) * 1000,
)


In [8]:
# Options passed unchanged to every HookedTransformer.from_pretrained call.
load_options = dict(
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=True,
)

# For each model, add two columns to df_pivoted:
#   "<model_name>"       - the value to_token_ids(model, name) returns for each name
#   "<model_name>-size"  - the number of comma-separated pieces in that value
# NOTE(review): to_token_ids is a project helper; the split(",") below implies it
# returns a comma-separated string, presumably of token ids - confirm in utils.dataset.
for model_name in model_names:
    model = HookedTransformer.from_pretrained(model_name, **load_options)

    size_column = f"{model_name}-size"
    token_strings = df_pivoted["name"].apply(lambda name: to_token_ids(model, name))
    df_pivoted[model_name] = token_strings
    df_pivoted[size_column] = token_strings.apply(lambda ids: len(ids.split(",")))

Loaded pretrained model gpt2-small into HookedTransformer


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Loaded pretrained model gpt2-medium into HookedTransformer


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Loaded pretrained model gpt2-large into HookedTransformer


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Loaded pretrained model facebook/opt-125m into HookedTransformer


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Loaded pretrained model EleutherAI/gpt-neo-125M into HookedTransformer


In [9]:
# Display the enriched table; pandas abbreviates a frame this long to its first
# and last rows, with an elided "..." row in between.
df_pivoted

gender,name,F,M,name_weight,F_prop,M_prop,F_weighted,M_weighted,F_weighted_norm,M_weighted_norm,gpt2-small,gpt2-small-size,gpt2-medium,gpt2-medium-size,gpt2-large,gpt2-large-size,facebook/opt-125m,facebook/opt-125m-size,EleutherAI/gpt-neo-125M,EleutherAI/gpt-neo-125M-size
0,Aaban,0,87,2.550345e-07,0.0,1.0,0.000000e+00,2.550345e-07,0.000000,0.000505,31745094,2,31745094,2,31745094,2,8326528,2,31745094,2
1,Aabha,28,0,8.208008e-08,1.0,0.0,8.208008e-08,0.000000e+00,0.000166,0.000000,3173973099,3,3173973099,3,3173973099,3,838731999,3,3173973099,3
2,Aabid,0,5,1.465716e-08,0.0,1.0,0.000000e+00,1.465716e-08,0.000000,0.000029,317397312,3,317397312,3,317397312,3,83873808,3,317397312,3
3,Aabriella,15,0,4.397147e-08,1.0,0.0,4.397147e-08,0.000000e+00,0.000089,0.000000,31739738012627,4,31739738012627,4,31739738012627,4,8387310698461,4,31739738012627,4
4,Aada,5,0,1.465716e-08,1.0,0.0,1.465716e-08,0.000000e+00,0.000030,0.000000,3174763,2,3174763,2,3174763,2,832095,2,3174763,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95021,Zyvion,0,5,1.465716e-08,0.0,1.0,0.000000e+00,1.465716e-08,0.000000,0.000029,4090585295,3,4090585295,3,4090585295,3,300647051499,3,4090585295,3
95022,Zyvon,0,6,1.758859e-08,0.0,1.0,0.000000e+00,1.758859e-08,0.000000,0.000035,4090526982,2,4090526982,2,4090526982,2,3006420482,2,4090526982,2
95023,Zyyanna,6,0,1.758859e-08,1.0,0.0,1.758859e-08,0.000000e+00,0.000036,0.000000,1168225567697,3,1168225567697,3,1168225567697,3,525331754057,3,1168225567697,3
95024,Zyyon,0,6,1.758859e-08,0.0,1.0,0.000000e+00,1.758859e-08,0.000000,0.000035,4090519181,2,4090519181,2,4090519181,2,3006421743,2,4090519181,2


In [10]:
# Spot-check one name: select the row(s) whose "name" is exactly "Madison".
df_pivoted.loc[df_pivoted["name"].eq("Madison")]

gender,name,F,M,name_weight,F_prop,M_prop,F_weighted,M_weighted,F_weighted_norm,M_weighted_norm,gpt2-small,gpt2-small-size,gpt2-medium,gpt2-medium-size,gpt2-large,gpt2-large-size,facebook/opt-125m,facebook/opt-125m-size,EleutherAI/gpt-neo-125M,EleutherAI/gpt-neo-125M-size
56809,Madison,358035,7543,0.001072,0.979367,0.020633,0.00105,2.2e-05,2.118761,0.043817,14909,1,14909,1,14909,1,6370,1,14909,1


In [11]:
# Names with a very small normalized male weight (< 0.01), ranked by normalized
# female weight; show the 20 highest.
(
    df_pivoted
    .loc[df_pivoted["M_weighted_norm"].lt(0.01)]
    .sort_values(by="F_weighted_norm", ascending=False)
    .head(20)
)

gender,name,F,M,name_weight,F_prop,M_prop,F_weighted,M_weighted,F_weighted_norm,M_weighted_norm,gpt2-small,gpt2-small-size,gpt2-medium,gpt2-medium-size,gpt2-large,gpt2-large-size,facebook/opt-125m,facebook/opt-125m-size,EleutherAI/gpt-neo-125M,EleutherAI/gpt-neo-125M-size
21538,Deborah,739654,1658,0.002173,0.997763,0.002237,0.002168,5e-06,4.377086,0.009631,36976,1,36976,1,36976,1,18878,1,36976,1
45737,Kathleen,710947,1693,0.002089,0.997624,0.002376,0.002084,5e-06,4.207205,0.009835,31275,1,31275,1,31275,1,15931,1,31275,1
27398,Emma,616404,1585,0.001812,0.997435,0.002565,0.001807,5e-06,3.647723,0.009207,18966,1,18966,1,18966,1,7957,1,18966,1
68522,Pamela,593850,1387,0.001745,0.99767,0.00233,0.001741,4e-06,3.514255,0.008057,43341,1,43341,1,43341,1,22998,1,43341,1
75429,Samantha,562160,1194,0.001651,0.997881,0.002119,0.001648,4e-06,3.326721,0.006936,34778,1,34778,1,34778,1,14155,1,34778,1
38159,Janet,555585,1518,0.001633,0.997275,0.002725,0.001629,4e-06,3.287812,0.008818,28111,1,28111,1,28111,1,11239,1,28111,1
14453,Carolyn,554145,1619,0.001629,0.997087,0.002913,0.001624,5e-06,3.27929,0.009405,43450,1,43450,1,43450,1,19949,1,43450,1
21552,Debra,550226,1146,0.001616,0.997922,0.002078,0.001613,3e-06,3.256099,0.006657,8965430,2,8965430,2,8965430,2,10532763,2,8965430,2
33547,Heather,524018,1482,0.00154,0.99718,0.00282,0.001536,4e-06,3.101006,0.008609,24253,1,24253,1,24253,1,10588,1,24253,1
23723,Diane,517797,1386,0.001522,0.99733,0.00267,0.001518,4e-06,3.064192,0.008051,26542,1,26542,1,26542,1,13649,1,26542,1


In [12]:
# Persist the enriched table to target_path; the positional row index is omitted.
df_pivoted.to_csv(path_or_buf=target_path, index=False)