In [2]:
import os
import pandas as pd

In [3]:
# @title Helper Functions


def convert_yes_df_to_wide(df, save_dir, save_fname, verified = True, matrix_completion = False):
    '''Pivot a long (Concept, Feature) table to a wide Concept x Feature matrix and save it as CSV.

    Parameters
    ----------
    df : DataFrame with 'Concept' and 'Feature' columns. When ``verified`` is
        False it must also contain a 'Yes/No' column holding the cell values.
    save_dir, save_fname : the CSV is written to ``os.path.join(save_dir, save_fname)``.
    verified : if True, every (Concept, Feature) row present in ``df`` becomes 1.
        If False, the cell value is taken from the 'Yes/No' column.
    matrix_completion : not supported; must be False.

    Returns
    -------
    The wide DataFrame (index 'Concept', columns 'Feature'), with combinations
    that do not occur in ``df`` filled with 0.

    Raises
    ------
    NotImplementedError
        If ``matrix_completion`` is True. The original code had no branch for
        this case and crashed on the return statement with an UnboundLocalError.
    '''
    if matrix_completion:
        raise NotImplementedError(
            'convert_yes_df_to_wide does not implement matrix_completion=True'
        )

    if verified:
        # Mark every listed pair as 1 on a copy, so the caller's dataframe
        # does not silently gain a 'Response' column.
        long_df = df.assign(Response=1)
        value_col = 'Response'
    else:
        print('unverified')
        long_df = df
        value_col = 'Yes/No'

    # aggfunc='first' keeps the first value when a (Concept, Feature) pair repeats.
    df_wide = long_df.pivot_table(
        index='Concept', columns='Feature', values=value_col, aggfunc='first'
    ).fillna(0)
    df_wide.to_csv(os.path.join(save_dir, save_fname))
    return df_wide

def preprocess_flan_responses_on_leuven(df):
    '''Convert the textual 'response' column of the flan answers on the Leuven dataset to 0/1 integers.

    The substitutions are literal (not regular expressions) and are applied in
    this order: 'mask>' -> 'No', 'True' -> '1', 'Yes' -> '1', 'False' -> '0',
    'No' -> '0'. The column is then cast to int.

    Parameters
    ----------
    df : DataFrame with a string column 'response'.

    Returns
    -------
    A new DataFrame with 'response' as int. The input frame is left untouched,
    so the function can safely be re-run on the same raw frame.

    Raises
    ------
    ValueError
        From ``astype(int)`` if a response is not numeric after the substitutions.
    '''
    # Order matters: 'mask>' is first turned into 'No', which the last rule maps to '0'.
    substitutions = (
        ('mask>', 'No'),
        ('True', '1'),
        ('Yes', '1'),
        ('False', '0'),
        ('No', '0'),
    )
    out = df.copy()
    response = out['response']
    for old, new in substitutions:
        # regex=False pins literal matching regardless of the pandas version's default.
        response = response.str.replace(old, new, regex=False)
    out['response'] = response.astype(int)
    return out

def convert_flan_df_to_wide(df):
    '''Pivot long flan responses into a concept x feature matrix.

    Rows are taken from the 'concept' column, columns from the 'feature'
    column and cell values from the 'response' column. When a pair occurs
    more than once the first value is kept; pairs that never occur become 0.
    '''
    pivoted = df.pivot_table(
        values='response',
        index='concept',
        columns='feature',
        aggfunc='first',
    )
    return pivoted.fillna(0)


# ICLR

In [8]:
save_dir = '../data/leuven/flan'
# Overlap prompts: read the raw answers and turn the 'response' column into 0/1.
flan_responses = preprocess_flan_responses_on_leuven(
    pd.read_csv(os.path.join(save_dir, 'flan_leuven_prompts_answers_overlap.csv'))
)
# Pivot to concept x feature and write the matrix one directory above save_dir.
flan_responses_wide = convert_flan_df_to_wide(flan_responses)
flan_responses_wide.to_csv(os.path.join(save_dir, '../flan_leuven_norms_overlap.csv'))

In [9]:
save_dir = '../data/leuven/flan'
# Broad prompt: read the raw answers and turn the 'response' column into 0/1.
flan_responses = preprocess_flan_responses_on_leuven(
    pd.read_csv(os.path.join(save_dir, 'flan_leuven_broad_prompt.csv'))
)
# Pivot to concept x feature and write the matrix one directory above save_dir.
flan_responses_wide = convert_flan_df_to_wide(flan_responses)
flan_responses_wide.to_csv(os.path.join(save_dir, '../flan_leuven_norms_broad_prompt.csv'))

In [4]:
# Load the animal norms, the artifact norms and the flan norms, in that order.
leuven_animals, leuven_artifacts, flan_responses = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/leuven/animal_leuven_norms.csv',
        '../data/leuven/artifacts_leuven_norms.csv',
        '../data/leuven/flan_leuven_norms.csv',
    )
)

In [5]:
# Column labels that are repeated within each table: animals first, then artifacts.
print(
    leuven_animals.columns[leuven_animals.columns.duplicated()],
    leuven_artifacts.columns[leuven_artifacts.columns.duplicated()],
    sep='\n',
)

# Column labels shared by the two tables.
print(set(leuven_animals.columns) & set(leuven_artifacts.columns))





Index([], dtype='object')
Index([], dtype='object')
{'is_big', 'is_ugly', 'is_small', 'is_greasy', 'is_grey', 'is_rare', 'is_slippery', 'is_expensive', 'has_teeth', 'is_healthy', 'Unnamed: 0', 'is_green', 'has_a_flat_head', 'is_not_expensive', 'has_a_horn', 'has_wings', 'is_flat', 'has_a_head', 'is_red', 'is_blue', 'is_elongated', 'is_long', 'stinks', 'floats_on_water', 'is_beautiful', 'is_dangerous', 'is_strong', 'is_light', 'is_black', 'is_white', 'is_round', 'is_black_and_white', 'is_brown', 'has_a_tail'}


In [17]:
# Build a CSV with one row per feature and a 'dataset' label saying where it occurs:
#   'animal_only'    - column of leuven_animals only
#   'artifacts_only' - column of leuven_artifacts only
#   'both'           - column of both tables
# The 'Unnamed: 0' column is not a feature and is skipped.
features_dict = {}
for feature in leuven_animals.columns:
    if feature == 'Unnamed: 0':
        continue
    features_dict[feature] = 'animal_only'
for feature in leuven_artifacts.columns:
    if feature == 'Unnamed: 0':
        continue
    # Already registered means it was seen among the animal columns too.
    features_dict[feature] = 'both' if feature in features_dict else 'artifacts_only'

# Two columns, 'feature' then 'dataset', on a plain 0..n-1 integer index.
features_df = pd.DataFrame({
    'feature': list(features_dict.keys()),
    'dataset': list(features_dict.values()),
})

features_df.to_csv('../data/data_to_tim/leuven_features.csv')

In [232]:
# Feature columns present in both leuven_animals and leuven_artifacts, excluding
# the 'Unnamed: 0' merge key. The list follows leuven_animals' column order
# rather than list(set(...)): set iteration order for strings varies between
# interpreter runs, which made the column order of the saved CSV unstable.
common_cols = [
    col for col in leuven_animals.columns
    if col != 'Unnamed: 0' and col in leuven_artifacts.columns
]

# Outer merge keeps every row of either table; cells with no value become 0.
leuven_norms = pd.merge(leuven_animals, leuven_artifacts, on='Unnamed: 0', how='outer').fillna(0)

# pd.merge suffixes overlapping columns with '_x' (animals) and '_y' (artifacts);
# collapse each pair back into a single column holding their sum.
for col in common_cols:
    leuven_norms[col] = leuven_norms[col + '_x'] + leuven_norms[col + '_y']
leuven_norms = leuven_norms.drop(
    columns=[name + suffix for name in common_cols for suffix in ('_x', '_y')]
)

leuven_norms.to_csv('../data/leuven/leuven_norms.csv')


# Original counts

In [4]:
leuven_animals = pd.read_csv('../data/leuven/animal_leuven_norms_original_counts.csv')
leuven_artifacts = pd.read_csv('../data/leuven/artifacts_leuven_norms_original_counts.csv')


# Feature columns present in both leuven_animals and leuven_artifacts, excluding
# the 'Unnamed: 0' merge key. The list follows leuven_animals' column order
# rather than list(set(...)): set iteration order for strings varies between
# interpreter runs, which made the column order of the saved CSV unstable.
common_cols = [
    col for col in leuven_animals.columns
    if col != 'Unnamed: 0' and col in leuven_artifacts.columns
]

# Outer merge keeps every row of either table; cells with no value become 0.
leuven_norms = pd.merge(leuven_animals, leuven_artifacts, on='Unnamed: 0', how='outer').fillna(0)

# pd.merge suffixes overlapping columns with '_x' (animals) and '_y' (artifacts);
# collapse each pair back into a single column holding their sum.
for col in common_cols:
    leuven_norms[col] = leuven_norms[col + '_x'] + leuven_norms[col + '_y']
leuven_norms = leuven_norms.drop(
    columns=[name + suffix for name in common_cols for suffix in ('_x', '_y')]
)

leuven_norms.to_csv('../data/leuven/leuven_norms_original_counts.csv')
