### Load libraries

In [1]:
import pathlib

import git.repo
import numpy as np
import pandas as pd

### Define what data to load

In [2]:
# Root of the enclosing git working tree, located by searching upwards from
# the current working directory.
GIT_ROOT = pathlib.Path(
    str(git.repo.Repo(".", search_parent_directories=True).working_tree_dir)
)

# All input paths listed below are relative to this directory.
CSV_ROOT = GIT_ROOT.joinpath("round1_aug_submission")
CSVS_TO_INCLUDE = (
    # --- Ethan ---
    "inverse_scaling_ethan/MACH.csv",
    "inverse_scaling_ethan/NPI.csv",
    "inverse_scaling_ethan/RSE.csv",
    "inverse_scaling_ethan/np_MACH.csv",
    "inverse_scaling_ethan/np_NPI.csv",
    "inverse_scaling_ethan/np_RSE.csv",
    # --- Jiahai ---
    "inverse_scaling_jiahai/dass_formatted.csv",
    "inverse_scaling_jiahai/tma_formatted.csv",
    "inverse_scaling_jiahai/np_dass_formatted.csv",
    "inverse_scaling_jiahai/np_tma_formatted.csv",
    # --- Simon ---
    "inverse_scaling_simon/beck_hopelessness.csv",
    "inverse_scaling_simon/depression_scenarios.csv",
    "inverse_scaling_simon/geriatric_depression_scale.csv",
    "inverse_scaling_simon/levenson_selfreport_psychopathy.csv",
    "inverse_scaling_simon/mmpi.csv",
    "inverse_scaling_simon/phq9.csv",
    "inverse_scaling_simon/np_beck_hopelessness.csv",
    "inverse_scaling_simon/np_geriatric_depression_scale.csv",
    "inverse_scaling_simon/np_levenson_selfreport_psychopathy.csv",
    "inverse_scaling_simon/np_mmpi.csv",
    "inverse_scaling_simon/np_phq9.csv",
    # --- Tony ---
    "inverse_scaling_tony/ECR/final.csv",
    # --- Saptarashmi ---
    "InverseScaling_Saptarashmi/CFCS/final.csv",
    "InverseScaling_Saptarashmi/HBDS-data/final.csv",
    "InverseScaling_Saptarashmi/HSNS+DD/final.csv",
    "InverseScaling_Saptarashmi/SD3/final.csv",
    "InverseScaling_Saptarashmi/CFCS/np_final.csv",
    "InverseScaling_Saptarashmi/HBDS-data/np_final.csv",
    "InverseScaling_Saptarashmi/HSNS+DD/final_np.csv",
    "InverseScaling_Saptarashmi/SD3/np_final.csv",
)

# Relative path, so the combined CSV lands in the current working directory
# (the notebook's own directory when the kernel is started from there).
OUT_FILE = "combined_v2.csv"

# Echo the configuration so it is visible in the saved notebook output.
print("Number of csvs to include:", len(CSVS_TO_INCLUDE))
print("Output file:", OUT_FILE)

Number of csvs to include: 30
Output file: combined_v2.csv


### Load data

In [3]:
def canonicalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a normalized copy of ``df`` with canonical column names.

    The following adjustments are applied when the relevant column exists:

    * the ``"Unnamed: 0"`` column is dropped (jiahai);
    * ``front`` is renamed to ``prompt_in_front`` (simon);
    * ``answer_idx`` is renamed to ``answer_index``;
    * ``binarized-v2`` is renamed to ``binarized_v2``.

    A text-valued ``body`` column is replaced by a boolean column that is
    ``False`` where the value equals ``"NB"`` and ``True`` otherwise. A
    ``body`` column of any other dtype is left as it is.

    The input frame is never modified; a new DataFrame is always returned.

    Args:
        df: Frame to normalize. It must contain a ``body`` column.

    Returns:
        The normalized frame.

    Raises:
        AttributeError: If ``df`` has no ``body`` column.
    """
    # drop() and rename() both return new frames and silently skip labels that
    # are absent, so `out` is always distinct from the caller's `df`. This keeps
    # the raw frame untouched even when no column needed dropping or renaming.
    out = df.drop(columns=["Unnamed: 0"], errors="ignore").rename(
        columns={
            "front": "prompt_in_front",
            "answer_idx": "answer_index",
            "binarized-v2": "binarized_v2",
        }
    )

    # Make body a boolean field. Text columns are object dtype on older pandas
    # but may use a dedicated string dtype on newer versions, so accept both.
    body = out.body
    if pd.api.types.is_object_dtype(body) or pd.api.types.is_string_dtype(body):
        out["body"] = body != "NB"

    return out

In [4]:
# Read every listed CSV, then normalize each frame to the shared column names.
raw_dfs = [pd.read_csv(CSV_ROOT / x) for x in CSVS_TO_INCLUDE]
dfs = [canonicalize_df(raw_df) for raw_df in raw_dfs]
# ignore_index=True gives the combined frame a unique 0..N-1 index. Without
# it each file's own row numbers are kept, so labels repeat across files and
# the exported CSV ends up with a non-unique index column.
df = pd.concat(dfs, ignore_index=True)
df.columns

Index(['prompt', 'classes', 'answer_index', 'source_dataset', 'Q_id', 'body',
       'prompt_in_front', 'binarized', 'no_prompt', 'binarized_v2',
       'half_prompt', 'pov', 'editted'],
      dtype='object')

### Check data

In [5]:
# Distinct values of the `classes` column across all combined rows.
df["classes"].unique()

array(["[' 1', ' 2', ' 3', ' 4', ' 5']", "[' 1', ' 2']",
       "[' A', ' B', ' C', ' D']", "[' A', ' B']", "[' B', ' A']",
       "[' True', ' False']", "[' Yes', ' No']",
       "[' A', ' B', ' C', ' D', ' E', ' F', ' G']", "[' Yes',' No']",
       "[' No', ' Yes']", "[' 1', ' 3', ' 5']"], dtype=object)

In [6]:
# Distinct values of the `answer_index` column.
df["answer_index"].unique()

array([0, 4, 1, 3, 6, 2])

In [7]:
# Distinct values of the `source_dataset` column.
df["source_dataset"].unique()

array(['MACH', 'NPI', 'RSE', 'dass', 'tma', 'beck_hopelessness',
       'depression_scenarios', 'geriatric_depression',
       'levenson_selfreport_psychopathy', 'mmpi', 'phq9', 'ECR', 'cfcs',
       'hbds', 'hsns+dd', 'sd3'], dtype=object)

In [8]:
# Show the distinct `body` values, then list the source datasets that have
# at least one row with body == 1.
print(df["body"].unique())
df.loc[df["body"] == 1, "source_dataset"].unique()

[0 1]


array(['dass', 'depression_scenarios', 'mmpi', 'phq9'], dtype=object)

In [9]:
# Distinct values of the `prompt_in_front` column.
df["prompt_in_front"].unique()

array([1, 0])

In [10]:
# Distinct values of the `binarized` column.
df["binarized"].unique()

array([0, 1])

In [11]:
# Show the distinct `binarized_v2` values, then list the source datasets that
# have a non-missing value in that column.
print(df["binarized_v2"].unique())
df.loc[df["binarized_v2"].notna(), "source_dataset"].unique()

[nan  0.  1.]


array(['levenson_selfreport_psychopathy'], dtype=object)

### Export data

In [12]:
# Total number of rows in the combined frame.
df.shape[0]

2648

In [13]:
# Write the combined frame (index included) to the configured output path.
df.to_csv(path_or_buf=OUT_FILE)