In [105]:
import pathlib
import ast
import pandas as pd

from inverse_cai.app.loader import load_json_file


def create_votes_df(results_dir: pathlib.Path) -> list[dict]:

    # load relevant data from experiment logs
    votes_per_comparison = pd.read_csv(
        results_dir / "040_votes_per_comparison.csv", index_col="index"
    )
    votes_per_comparison["votes_dicts"] = votes_per_comparison["votes"].apply(ast.literal_eval)
    # add single votes_dicts per comparison_id
    votes_per_comparison["original_index"] = votes_per_comparison.index

    # create a single list of dicts per comparison_id
    votes_per_comparison = votes_per_comparison.groupby("original_index")["votes_dicts"].apply(list).reset_index(name="votes_dicts")

    # compile list of dicts into a single dict
    votes_per_comparison["votes_dicts"] = votes_per_comparison["votes_dicts"].apply(lambda x: {k: v for d in list(x) for k, v in d.items()})

    principles_by_id: dict = load_json_file(
        results_dir / "030_distilled_principles_per_cluster.json",
    )
    comparison_df = pd.read_csv(results_dir / "000_train_data.csv", index_col="index")

    # merge original comparison data with votes per comparison
    full_df = comparison_df.merge(
        votes_per_comparison, left_index=True, right_on="original_index"
    )
    full_df["comparison_id"] = full_df.index


    new_cols = {}

    def make_vote_numeric(vote: bool | None) -> int:
        if vote is True:
            return 1
        elif vote is False:
            return -1
        elif vote is None:
            return 0
        else:
            raise ValueError(f"Invalid vote value: {vote}")

    # add one column for each principle vote
    for principle_id in principles_by_id.keys():
        new_cols[f"vote_principle_{principle_id}"] = full_df["votes_dicts"].apply(
            lambda x: make_vote_numeric(x.get(int(principle_id), None))
        )

    full_df = pd.concat([full_df, pd.DataFrame(new_cols)], axis=1)

    return full_df

In [None]:
# Build the combined comparison/vote table from the prism_v2 experiment results.
df = create_votes_df(
    results_dir=pathlib.Path("../exp/outputs/prism_v2/results"),
)

In [104]:

# Split the vote table by row position: the first 4000 rows form the shared
# subset, every remaining row forms the non-shared subset.
shared_subset = df.iloc[:4000]
non_shared_subset = df.iloc[4000:]

# Write both subsets to disk; the DataFrame index is not written to the files.
shared_subset.to_csv(
    "../data/experimental/prism_v2_4k_shared_subset.csv",
    index=False,
)
non_shared_subset.to_csv(
    "../data/experimental/prism_v2_non_shared_subset.csv",
    index=False,
)
