#### This notebook studies the impact of changing the seed in the undersampling of CK classe on the classification metrics. The data in `pythia_mlp_l1_relation_query_seed_impact_data` was generated on `Pythia`'s relation query token on the `MLP`'s first layer using different seeds (0, 1, 2, 3, and 4).

In [2]:
import pandas as pd
import numpy as np

In [None]:
def load_and_process_data_with_ci_for_seed_experiments(
    model_name: str, module_type: str, specific_relation: str, seed: int
) -> pd.DataFrame:
    classification_metrics = pd.read_csv(
        f"pythia_mlp_l1_relation_query_seed_impact_data/metrics_{seed}.csv"
    )

    if specific_relation is not None:
        classification_metrics = classification_metrics[classification_metrics.relation_group_id == specific_relation]

    classification_metrics = classification_metrics.rename(columns={"success_rate": "success_rate_layer"})

    classification_metrics["se_k"] = np.sqrt(
        (classification_metrics["success_rate_layer"] * (1 - classification_metrics["success_rate_layer"]))
        / classification_metrics["nb_test_examples"]
    )
    classification_metrics["nb_success_trials_layer"] = (
        classification_metrics["success_rate_layer"] * classification_metrics["nb_test_examples"]
    )

    grouped_classification_metrics = (
        classification_metrics.groupby(["layer"])
        .agg(
            acc_mean=("success_rate_layer", "mean"),
            total_trials=("nb_success_trials_layer", "sum"),
        )
        .reset_index()
    )

    all_data_nb_trials = pd.merge(classification_metrics, grouped_classification_metrics, on="layer")

    all_data_nb_trials["weighted_se_k"] = (
        all_data_nb_trials["nb_test_examples"] / all_data_nb_trials["total_trials"]
    ) ** 2 * all_data_nb_trials["se_k"] ** 2

    global_se = (
        all_data_nb_trials.groupby(["layer"])
        .agg(
            se=("weighted_se_k", "sum"),
            nb_success_total=("nb_success_trials_layer", "sum"),
            nb_trials_total=("nb_test_examples", "sum"),
        )
        .reset_index()
    )

    global_se["se"] = np.sqrt(global_se["se"])
    global_se["P"] = global_se["nb_success_total"] / global_se["nb_trials_total"]

    # Calculate the confidence intervals using the Z-score for 95% CI
    z = 1.96
    global_se["P_ci_lower"] = global_se["P"] - z * global_se["se"]
    global_se["P_ci_upper"] = global_se["P"] + z * global_se["se"]

    return global_se


In [None]:
mlp_l1_data_seeds = []

for seed in [0, 1, 2, 3, 4]:

    mlp_l1_data_seed = load_and_process_data_with_ci_for_seed_experiments(
        model_name="EleutherAI_pythia-1.4b",
        module_type="mlps_l1",
        specific_relation=None,
        seed=seed
    )

    mlp_l1_data_seed["seed"] = seed

    mlp_l1_data_seeds.append(mlp_l1_data_seed)

In [5]:
df = pd.concat(mlp_l1_data_seeds)
df.groupby("layer").std()

Unnamed: 0_level_0,se,nb_success_total,nb_trials_total,P,P_ci_lower,P_ci_upper,seed
layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,0.000176,5.94138,0.0,0.00345,0.003796,0.003105,1.581139
1.0,0.0054,208.215513,0.0,0.120915,0.131465,0.110371,1.581139
2.0,0.006352,193.978092,0.0,0.112647,0.125035,0.100275,1.581139
3.0,0.003108,114.733169,0.0,0.066628,0.072715,0.060542,1.581139
4.0,0.002952,117.703866,0.0,0.068353,0.074138,0.062568,1.581139
5.0,0.002364,100.471389,0.0,0.058346,0.062978,0.053714,1.581139
6.0,0.00464,179.943602,0.0,0.104497,0.113583,0.095412,1.581139
7.0,0.002716,119.028148,0.0,0.069122,0.074444,0.0638,1.581139
8.0,0.003656,155.42458,0.0,0.090258,0.097421,0.083096,1.581139
9.0,0.003716,163.98567,0.0,0.09523,0.102512,0.087947,1.581139
