In [None]:
import pandas as pd

# Columns that together identify one training configuration; used as the
# group-by key throughout the notebook.
FINGERPRINT_COLS = ['optimizer', 'optimizer_other', 'subspace', 'opset']
# Name of the column holding the hyperparameter-set index of a run.
HPSET = "hyperparameter_set"
# Name of the column holding the tag a run was launched with.
TAG = "tag"
# Tag value whose runs are kept when a hyperparameter set was run more than once.
TAG_MIXED_HPS = "models-train-mixed-hps"


In [None]:
# Load the exported table of model-training runs.
df = pd.read_csv("complete_model_runs.csv")
# Quick look at the available columns and the number of rows.
df.columns, df.shape[0]

In [None]:
# Distinct tags present in the table.
df[TAG].unique()

In [None]:
# Bucket the runs by configuration fingerprint and count the rows in each bucket.
grouped_df = df.groupby(FINGERPRINT_COLS)
group_counts = grouped_df.size()
group_counts

In [None]:
# Number of distinct hyperparameter sets seen per configuration; configurations
# that do not have exactly 9 are flagged as incomplete.
unique_hyperparameter_sets = grouped_df[HPSET].nunique()
incomplete_runs_groups = unique_hyperparameter_sets.loc[unique_hyperparameter_sets.ne(9)]
incomplete_runs_groups

In [None]:
# For every configuration flagged as incomplete, record which of the
# hyperparameter-set indices 0..8 have no run.
missing_runs = []
for name, group in grouped_df:
    if name not in incomplete_runs_groups.index:
        continue
    missing = set(range(9)) - set(group["hyperparameter_set"].unique())
    print(f"Group: {name}", missing)
    missing_runs.append((name, missing))

# Total number of missing (configuration, hyperparameter set) pairs.
n_missing = sum(len(missing) for _, missing in missing_runs)
print(n_missing, missing_runs)

# Print one launch command per missing run.
for (optimizer, optimizer_other, subspace, opset), hpsets in missing_runs:
    for hp in hpsets:
        print(
            f"python launch_model_train.py --optimizer {optimizer} --subspace {subspace} --opset {opset} "
            f"--dataset cifar10_model --hpsets {hp} --seed 0 --epochs 300 --other {optimizer_other} "
            f"--tag models-train-batch7 --genotypes_folder exp/genotypes & sleep 5"
        )

n_missing

In [None]:
# Rows belonging to configurations that have more than 9 runs, regrouped by
# configuration for the inspection below.
duplicate_runs_dfs = grouped_df.filter(lambda runs: len(runs) > 9)
grouped_duplicate_dfs = duplicate_runs_dfs.groupby(FINGERPRINT_COLS)
grouped_duplicate_dfs

In [None]:
# (configuration, hyperparameter set) pairs whose duplicated rows all share one tag.
single_duplicate_tag_groups = []

for name, group in grouped_duplicate_dfs:
    print(name, len(group))
    # Rows whose hyperparameter set occurs more than once within this configuration.
    is_repeated = group.duplicated(subset=[HPSET], keep=False)
    duplicates = group[is_repeated]
    hpsets = set(duplicates[HPSET])

    for hp in hpsets:
        duplicates_hp = duplicates[duplicates[HPSET] == hp]
        duplicate_tags = sorted(set(duplicates_hp[TAG]))
        has_single_tag = len(duplicate_tags) == 1
        if has_single_tag:
            single_duplicate_tag_groups.append((name, hp))
        print(hp, duplicate_tags, "ONLY ONE!!!" if has_single_tag else "")

single_duplicate_tag_groups

In [None]:
# Bucket rows by configuration, hyperparameter set and seed; the tag is
# deliberately left out of the key.
grouped = df.groupby(['optimizer', 'optimizer_other', 'opset', 'subspace', HPSET, 'seed'])

# Keep the buckets whose rows carry more than one distinct tag ...
filtered_groups = grouped.filter(lambda runs: len(pd.unique(runs[TAG])) > 1)
# ... and, within those, only the rows tagged TAG_MIXED_HPS.
mixed_hp_df = filtered_groups.loc[filtered_groups[TAG] == TAG_MIXED_HPS]
mixed_hp_df

In [None]:
# Rows of the configurations that have exactly 9 runs.
complete_runs_df = grouped_df.filter(lambda runs: len(runs) == 9)
complete_runs_df

In [None]:
# Display the raw runs table.
df

In [None]:
# Rows of the configurations with exactly 9 runs, followed by the number of
# such configurations (9 rows each).
complete_runs_df = df.groupby(FINGERPRINT_COLS).filter(lambda runs: len(runs) == 9)
complete_runs_df.shape[0] // 9

In [None]:
# Rows of the configurations that have fewer than 9 runs.
incomplete_runs_df = df.groupby(FINGERPRINT_COLS).filter(lambda runs: len(runs) < 9)
incomplete_runs_df

In [None]:
# Rows of the configurations that have more than 9 runs, plus the same rows
# regrouped by configuration for the de-duplication passes below.
duplicate_runs_df = df.groupby(FINGERPRINT_COLS).filter(lambda runs: len(runs) > 9)
duplicate_runs_groups = duplicate_runs_df.groupby(FINGERPRINT_COLS)
# Number of configurations with surplus runs.
print(len(duplicate_runs_groups))
duplicate_runs_df

In [None]:
# First de-duplication pass: wherever a hyperparameter set occurs more than
# once inside a configuration and at least one of those rows is tagged
# TAG_MIXED_HPS, keep the TAG_MIXED_HPS row(s) and queue every row with a
# different tag for removal.
rows_to_drop = []

for _, data in duplicate_runs_groups:
    hp_counts = data[HPSET].value_counts()
    for hp in hp_counts[hp_counts > 1].index:
        rows = data[data[HPSET] == hp]
        if TAG_MIXED_HPS not in rows[TAG].values:
            # No TAG_MIXED_HPS run to prefer; leave these duplicates untouched.
            continue
        rows_to_delete = rows[rows[TAG] != TAG_MIXED_HPS]
        # Collect every matching label. `Index.item()` only works for exactly
        # one element and raised ValueError when all duplicates were tagged
        # TAG_MIXED_HPS (nothing to delete) or when several rows had other tags.
        rows_to_drop.extend(rows_to_delete.index.tolist())

In [None]:
# Second de-duplication pass: after removing the rows queued in rows_to_drop,
# look at the configurations that still have more than 9 runs.
deduplicated_runs_groups = (
    df.drop(rows_to_drop)
    .groupby(FINGERPRINT_COLS)
    .filter(lambda runs: len(runs) > 9)
    .groupby(FINGERPRINT_COLS)
)
deduplicated_runs_groups.size()

# The remaining duplicates are printed so they can be checked by hand; the
# list below is populated for convenience.
rows_to_delete_ = []
for _, data in deduplicated_runs_groups:
    hp_counts = data[HPSET].value_counts()
    for hp in hp_counts[hp_counts > 1].index:
        rows = data[data[HPSET] == hp]
        print(rows[[HPSET, TAG]])
        # Queue all but the last row so exactly one run per hyperparameter set
        # survives. Queuing only the first row left sets with three or more
        # copies duplicated, which keeps the configuration above 9 rows and
        # excludes it from any later `len(...) == 9` filter.
        rows_to_delete_.extend(rows.index[:-1].tolist())

rows_to_delete_

In [None]:
# Remove every row queued by the two de-duplication passes and keep only the
# configurations that end up with exactly 9 runs.
final_df = (
    df.drop(index=rows_to_drop + rows_to_delete_)
    .groupby(FINGERPRINT_COLS)
    .filter(lambda runs: len(runs) == 9)
)
final_df

In [None]:
# Number of rows per configuration in the final table.
final_df.groupby(FINGERPRINT_COLS).size()

In [None]:
# Configurations that still have fewer than 9 runs once the queued rows are
# removed, with their row counts.
(
    df.drop(index=rows_to_drop + rows_to_delete_)
    .groupby(FINGERPRINT_COLS)
    .filter(lambda runs: len(runs) < 9)
    .groupby(FINGERPRINT_COLS)
    .size()
)

In [None]:
# Columns available in the final table.
final_df.columns

In [None]:
# Build the export table: one row per run with the columns Optimizer,
# Subspace, Opset, HP_Idx, TestAcc1 and TestAcc5, written to model_trains_full.csv.
ADDITIONAL_COLS = ["hyperparameter_set", "discrete/test/acc_top1", "discrete/test/acc_top5"]

refined_df = (
    final_df[FINGERPRINT_COLS + ADDITIONAL_COLS]
    .assign(
        # Report `optimizer_other` as the optimizer name unless it is the
        # "baseline" marker, in which case fall back to `optimizer`.
        # Vectorised so it also works on an empty frame, where a row-wise
        # `apply(axis=1)` returns a DataFrame instead of a Series.
        Optimizer=lambda runs: runs["optimizer_other"].where(
            runs["optimizer_other"] != "baseline", runs["optimizer"]
        )
    )
    .rename(
        columns={
            "subspace": "Subspace",
            "opset": "Opset",
            "hyperparameter_set": "HP_Idx",
            "discrete/test/acc_top1": "TestAcc1",
            "discrete/test/acc_top5": "TestAcc5",
        }
    )
    # Fix the output column order and drop the source-only columns.
    .loc[:, ["Optimizer", "Subspace", "Opset", "HP_Idx", "TestAcc1", "TestAcc5"]]
)

refined_df.to_csv("model_trains_full.csv", index=False)

In [None]:
# Display the exported table.
refined_df