In [31]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import pearsonr, spearmanr

In [32]:
def get_data(json_file):
    """Load per-subset aesthetic scores from a JSON-lines results file.

    Every record must carry an ``exp_name`` string containing a
    ``seed_<int>`` token and an ``aesthetic_score_0.9`` value.

    Args:
        json_file: Path or file-like object accepted by ``pd.read_json``
            (read with ``lines=True``).

    Returns:
        A DataFrame with exactly the columns ``aesthetic_score_0.9`` and
        ``subset_seed`` (integer), sorted by ascending ``subset_seed``.

    Raises:
        ValueError: If any ``exp_name`` has no ``seed_<int>`` token.
    """
    df = pd.read_json(json_file, lines=True)

    # Pull out only the integer that follows "seed_", so that trailing text in
    # the experiment name (e.g. "..._seed_12_final") does not break the cast.
    seed_text = df["exp_name"].str.extract(r"seed_([-+]?\d+)", expand=False)

    unparsed = seed_text.isna()
    if unparsed.any():
        examples = df.loc[unparsed, "exp_name"].head(5).tolist()
        raise ValueError(
            f"No 'seed_<int>' token in exp_name for {int(unparsed.sum())} "
            f"row(s), e.g. {examples}"
        )

    df["subset_seed"] = seed_text.astype(int)
    df = df.sort_values(by="subset_seed")
    return df[["aesthetic_score_0.9", "subset_seed"]]

In [33]:
# Directory that holds the JSONL result files read in this cell.
outdir = "/gscratch/aims/diffusion-attr/seed42/artbench_post_impressionism"

# Scores from the retrained runs.
retrain_path = os.path.join(outdir, "retrain_artist_shapley.jsonl")
retrain_df = get_data(retrain_path)

# Scores from the sparse-GD runs, one DataFrame per step count.
sgd_df_dict = dict()
for steps in (100, 200, 400, 800):
    sgd_path = os.path.join(outdir, f"sparse_gd_artist_shapley_{steps}steps.jsonl")
    sgd_df_dict[steps] = get_data(sgd_path)

In [34]:
# Keep only the first `num_subsets` rows of the retrain results and remember
# which subset seeds they cover.
num_subsets = 100
retrain_df = retrain_df.iloc[:num_subsets]
subset_seeds = retrain_df["subset_seed"].tolist()

# Restrict every sparse-GD frame to rows whose seed is among the kept seeds.
for key, df in sgd_df_dict.items():
    seed_is_kept = df["subset_seed"].isin(subset_seeds)
    sgd_df_dict[key] = df[seed_is_kept]

In [35]:
# Compare sparse-GD scores against retrain scores for each step count.
#
# Rows are paired by `subset_seed` rather than by position: positional pairing
# is only correct when both frames hold exactly the same seeds in the same
# order, and otherwise either fails on a length mismatch or silently compares
# scores that belong to different subsets.
score_col = "aesthetic_score_0.9"
for key, df in sgd_df_dict.items():
    print(f"Num steps = {key}")

    # The inner merge keeps the row order of `retrain_df`; `validate` raises
    # if a seed occurs more than once on either side.
    paired = retrain_df.merge(
        df,
        on="subset_seed",
        suffixes=("_retrain", "_sgd"),
        validate="one_to_one",
    )
    num_unmatched = len(retrain_df) - len(paired)
    if num_unmatched:
        # Make incomplete runs visible instead of hiding them in the metrics.
        print(
            f"Note: {num_unmatched} retrain subset(s) have no matching "
            f"row for this run; metrics use {len(paired)} paired subset(s)."
        )

    retrain_scores = paired[f"{score_col}_retrain"].to_numpy()
    sgd_scores = paired[f"{score_col}_sgd"].to_numpy()

    # Only the correlation statistic is needed; the p-value is discarded.
    pearson_val = pearsonr(retrain_scores, sgd_scores)[0]
    spearman_val = spearmanr(retrain_scores, sgd_scores)[0]
    mse_val = np.mean((retrain_scores - sgd_scores) ** 2)

    print(f"Pearson: {pearson_val}")
    print(f"Spearman: {spearman_val}")
    print(f"MSE: {mse_val}")
    print("")

Num steps = 100
Pearson: 0.4273268299437773
Spearman: 0.2981938193819382
MSE: 0.12316345537066944

Num steps = 200
Pearson: 0.6812814860355162
Spearman: 0.49298529852985296
MSE: 0.07597882444507498

Num steps = 400
Pearson: 0.7078885332425474
Spearman: 0.5665526552655265
MSE: 0.07332177400383642

Num steps = 800
Pearson: 0.7302704707960246
Spearman: 0.604080408040804
MSE: 0.14274842940409582

