In [1]:
import os
import pandas as pd

In [36]:
# @title Helper Functions

def preprocess_leuven_norms(leuven_dir, save_dir, animals, tools):
    '''This function reads the Leuven Norms, selecsts relevant concepts data and saves the dataframes as csv files.'''
    animal_leuven_norms = pd.read_csv(os.path.join(leuven_dir,'ANIMALSexemplarfeaturesbig.txt'), sep = "\t")
    artiacts_leuven_norms = pd.read_csv(os.path.join(leuven_dir,'ARTIFACTSexemplarfeaturesbig.txt'), sep = "\t", encoding='latin-1')


     # in animal norms, the first column is the word, the second column is the frequency, multiply all cells by the freqeuncy 
    animal_leuven_norms.iloc[:,2:] = animal_leuven_norms.iloc[:,2:].multiply(animal_leuven_norms.iloc[:,1], axis=0)
    animal_leuven_norms.iloc[:,2:] = animal_leuven_norms.iloc[:,2:].div(animal_leuven_norms.iloc[:,1].sum(), axis=0)

    artiacts_leuven_norms.iloc[:,2:] = artiacts_leuven_norms.iloc[:,2:].multiply(artiacts_leuven_norms.iloc[:,1], axis=0)
    artiacts_leuven_norms.iloc[:,2:] = artiacts_leuven_norms.iloc[:,2:].div(artiacts_leuven_norms.iloc[:,1].sum(), axis=0)

    # from the third column onwards, replace the values grater than 0 with 1 and 0 otherwise
    animal_leuven_norms.iloc[:,2:] = animal_leuven_norms.iloc[:,2:].applymap(lambda x: 1 if x > 0 else 0)
    artiacts_leuven_norms.iloc[:,2:] = artiacts_leuven_norms.iloc[:,2:].applymap(lambda x: 1 if x > 0 else 0)

    # select the first column and select columns that contain the animal names in the list animals and also the first column
    animal_leuven_norms = animal_leuven_norms[['feature/_exemplar_ENGLISH'] + [col for col in animal_leuven_norms.columns if any(animal in col for animal in animals)]]
    artiacts_leuven_norms = artiacts_leuven_norms[['Item'] + [col for col in artiacts_leuven_norms.columns if any(tool in col for tool in tools)]]

    # rename the first column to 'features'
    animal_leuven_norms.rename(columns={'feature/_exemplar_ENGLISH': 'features'}, inplace=True)
    artiacts_leuven_norms.rename(columns={'Item': 'features'}, inplace=True)

    # transpose the dataframe so that the features are in the rows and the animals are in the columns
    animal_leuven_norms = animal_leuven_norms.T
    artiacts_leuven_norms = artiacts_leuven_norms.T


    # let the first row be the column names and drop the first row
    animal_leuven_norms.columns = animal_leuven_norms.iloc[0]
    animal_leuven_norms = animal_leuven_norms.drop(animal_leuven_norms.index[0])
    # remove the row 'monitor_lizard' from the dataframe
    animal_leuven_norms = animal_leuven_norms.drop('monitor_lizard')

    artiacts_leuven_norms.columns = artiacts_leuven_norms.iloc[0]
    artiacts_leuven_norms = artiacts_leuven_norms.drop(artiacts_leuven_norms.index[0])

    # remove the row 'filing-knife' from the dataframe
    artiacts_leuven_norms = artiacts_leuven_norms.drop('filling-knife')
    artiacts_leuven_norms = artiacts_leuven_norms.drop('pickaxe')

    # save the dataframes as csv files
    animal_leuven_norms.to_csv(os.path.join(save_dir,'animal_leuven_norms.csv'))
    artiacts_leuven_norms.to_csv(os.path.join(save_dir,'artifacts_leuven_norms.csv'))

def load_leuven_norms(save_dir):
    '''This function loads the Leuven Norms dataframes.'''
    animal_leuven_norms = pd.read_csv(os.path.join(save_dir,'animal_leuven_norms.csv'), index_col=0)
    artiacts_leuven_norms = pd.read_csv(os.path.join(save_dir,'artifacts_leuven_norms.csv'), index_col=0)
    return animal_leuven_norms, artiacts_leuven_norms


def load_gpt_data_summer_verified(save_dir):
    '''This function loads the GPT dataframes.'''
    # load from one of the sheets in the excel file
    gpt_response_yes = pd.read_excel(os.path.join(save_dir,'davinci_features_all.xlsx'), sheet_name='all_yes', index_col=0)
    # for the column 'Concept', make the characters lowercase and replace the spaces with underscores
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.lower().str.replace(' ', '_')
    # replace 'boa_python' with 'boa' in the column "Concept"
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('boa_python', 'boa')
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('grinding_disk', 'grinding_disc') 
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('oil_can', 'oilcan')  
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('vacuum', 'vacuum_cleaner')
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('paint_brush', 'paintbrush')
    return gpt_response_yes

def load_gpt_data_summer_unverified(save_dir):
    '''This function loads the GPT dataframes.'''
    # load from one of the sheets in the excel file
    gpt_response_yes_1 = pd.read_excel(os.path.join(save_dir,'davinci_features_all.xlsx'), sheet_name='orig_list_raw')
    # chose columns 'Concept', 'Feature', 'Category' and only unique rows
    # add a column 'Yes/No' with the value 1
    gpt_response_yes_1 = gpt_response_yes_1[['Concept', 'Feature', 'Category']].drop_duplicates()
    gpt_response_yes_1['Yes/No'] = 1
    gpt_response_yes = gpt_response_yes_1
    # # chose columns 'Concept', 'Feature', 'Category' and 'Yes/No'
    # gpt_response_yes_1 = gpt_response_yes_1[['Concept', 'Feature', 'Category', 'Yes/No']]
    # len_1 = len(gpt_response_yes_1)
    # gpt_response_yes_2 = pd.read_excel(os.path.join(save_dir,'davinci_features_all.xlsx'), sheet_name='fv_raw')
    # # chose columns 'Concept', 'Feature', 'Category' and 'Yes/No'
    # gpt_response_yes_2 = gpt_response_yes_2[['Concept', 'Feature', 'Category', 'Yes/No']]
    # len_2 = len(gpt_response_yes_2)
    # gpt_response_yes = pd.concat([gpt_response_yes_1, gpt_response_yes_2], axis=0)
    # # chose only unique rows
    # gpt_response_yes = gpt_response_yes.drop_duplicates()    
    # len_3 = len(gpt_response_yes)
    # assert(len_1+len_2 == len_3)
    # for the column 'Concept', make the characters lowercase and replace the spaces with underscores
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.lower().str.replace(' ', '_')
    # replace 'boa_python' with 'boa' in the column "Concept"
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('boa_python', 'boa')
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('grinding_disk', 'grinding_disc') 
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('oil_can', 'oilcan')  
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('vacuum', 'vacuum_cleaner')
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('paint_brush', 'paintbrush')
    return gpt_response_yes

def load_flan(save_dir, fname):
    '''This function loads the GPT dataframes.'''
    gpt_response_yes_1 = pd.read_excel(os.path.join('../data/feature_listing','davinci_features_all.xlsx'), sheet_name='orig_list_raw')
    gpt_response_yes_1 = gpt_response_yes_1[['Concept', 'Feature', 'Category']].drop_duplicates()
    gpt_response_yes_1['Yes/No'] = 1
    gpt_response_verified = pd.read_csv(os.path.join(save_dir,fname))
    # chose rows where the column 'Yes/No' is 1
    gpt_response_verified = gpt_response_verified[gpt_response_verified['Yes/No'] == 1.0]
    # chose columns 'Concept', 'Feature', 'Category'
    gpt_response_verified = gpt_response_verified[['Concept', 'Feature', 'Category', 'Yes/No']]
    gpt_response_yes = pd.concat([gpt_response_yes_1, gpt_response_verified], axis=0)

    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.lower().str.replace(' ', '_')
    # replace 'boa_python' with 'boa' in the column "Concept"
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('boa_python', 'boa')
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('grinding_disk', 'grinding_disc') 
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('oil_can', 'oilcan')  
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('vacuum', 'vacuum_cleaner')
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('paint_brush', 'paintbrush')
    return gpt_response_yes
     


def load_gpt_data_summer_verified_temperature_0(save_dir):
    gpt_response_yes_1 = pd.read_excel(os.path.join(save_dir,'davinci_features_all.xlsx'), sheet_name='orig_list_raw')
    gpt_response_yes_1 = gpt_response_yes_1[['Concept', 'Feature', 'Category']].drop_duplicates()
    gpt_response_yes_1['Yes/No'] = 1
    gpt_response_verified = pd.read_csv(os.path.join(save_dir,'davinci_feature_listing_feature_list_temperature_0.csv'))
    # chose rows where the column 'Yes/No' is 1
    gpt_response_verified = gpt_response_verified[gpt_response_verified['Yes/No'] == 1.0]
    # chose columns 'Concept', 'Feature', 'Category'
    gpt_response_verified = gpt_response_verified[['Concept', 'Feature', 'Category', 'Yes/No']]
    gpt_response_yes = pd.concat([gpt_response_yes_1, gpt_response_verified], axis=0)

    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.lower().str.replace(' ', '_')
    # replace 'boa_python' with 'boa' in the column "Concept"
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('boa_python', 'boa')
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('grinding_disk', 'grinding_disc') 
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('oil_can', 'oilcan')  
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('vacuum', 'vacuum_cleaner')
    gpt_response_yes['Concept'] = gpt_response_yes['Concept'].str.replace('paint_brush', 'paintbrush')
    return gpt_response_yes


def convert_yes_df_to_wide(df, save_dir, save_fname, verified = True, matrix_completion = False):
    '''This function converts a dataframe to wide format. Fill the missing values with 0 and rest with 1'''
    if not matrix_completion:
        if verified:
            # add a column 'Response' with value 'Yes'
            df['Response'] = 1
             # the concept names are in the column 'Concept', the feature names are in the column 'Feature', the response is in the column 'Response'
            df_wide = df.pivot_table(index='Concept', columns='Feature', values='Response', aggfunc='first').fillna(0)
            df_wide.to_csv(os.path.join(save_dir,save_fname))
        else:
            print('unverified')
            # the concept names are in the column 'Concept', the feature names are in the column 'Feature', the response is in the column 'Response'
            df_wide = df.pivot_table(index='Concept', columns='Feature', values='Yes/No', aggfunc='first').fillna(0)
            df_wide.to_csv(os.path.join(save_dir,save_fname))
    return df_wide


def convert_summer_yes_df_to_wide(df, save_dir, verified = True, matrix_completion = False):
    '''This function converts a dataframe to wide format. Fill the missing values with 0 and rest with 1'''
    if not matrix_completion:
        if verified:
            # add a column 'Response' with value 'Yes'
            df['Response'] = 1
             # the concept names are in the column 'Concept', the feature names are in the column 'Feature', the response is in the column 'Response'
            df_wide = df.pivot_table(index='Concept', columns='Feature', values='Response', aggfunc='first').fillna(0)
            df_wide.to_csv(os.path.join(save_dir,'gpt_response_yes_wide_no_matrix_completion.csv'))
        else:
            print('unverified')
            # the concept names are in the column 'Concept', the feature names are in the column 'Feature', the response is in the column 'Response'
            df_wide = df.pivot_table(index='Concept', columns='Feature', values='Yes/No', aggfunc='first').fillna(0)
            df_wide.to_csv(os.path.join(save_dir,'gpt_response_unverified_wide_no_matrix_completion.csv'))
    return df_wide


# cogsci

In [30]:
leuven_dir = '../../data/LeuvenNorms'
save_dir = '../data/feature_listing'
animals = ['turtle', 'alligator', 'lizard', 'tortoise', 'cobra', 'snake', 'blindworm', 'gecko', 'boa', 'toad', 'crocodile', 'chameleon', 'caiman', 'salamander', 'dinosaur']
tools = ['hammer', 'screwdriver', 'grinding_disc', 'vacuum_cleaner', 'spanner', 'lawn_mower', 'axe', 'saw', 'knife', 'nail', 'chisel', 'shovel', 'anvil', 'oilcan', 'paintbrush']
preprocess_leuven_norms(leuven_dir, save_dir, animals, tools)
animal_leuven_norms, artiacts_leuven_norms = load_leuven_norms(save_dir)

  animal_leuven_norms.iloc[:,2:] = animal_leuven_norms.iloc[:,2:].applymap(lambda x: 1 if x > 0 else 0)
  artiacts_leuven_norms.iloc[:,2:] = artiacts_leuven_norms.iloc[:,2:].applymap(lambda x: 1 if x > 0 else 0)


In [32]:
summer_gpt_dir = '../data/feature_listing/summer_data'
gpt_response_yes = load_gpt_data_summer_verified_temperature_0(summer_gpt_dir)
convert_summer_yes_df_to_wide(gpt_response_yes, save_dir, verified=True, matrix_completion = False)

Feature,are_4.5_inches_in_diameter,are_Abrasive,are_Coated,are_Dense,are_Hard,are_Rigid,are_a_constrictor,are_a_good_swimmer,are_a_hand_tool,are_a_hard_plastic,...,lives_in_Central_America,lives_in_Europe,lives_in_South_America,lives_in_different_climates,lives_in_rainforests_of_South_America,lives_in_tropical_climate,lives_in_various_habitats,lives_in_warm_climate,lives_underground,made_of_aluminum_oxide
Concept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alligator,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
anvil,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
axe,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
blindworm,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
boa,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
caiman,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
chameleon,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
chisel,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
cobra,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
crocodile,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


In [33]:
gpt_response_yes

Unnamed: 0,Concept,Feature,Category,Yes/No,Response
0,turtle,have_shell,Animal,1,1
1,turtle,have_head,Animal,1,1
2,turtle,have_eyes,Animal,1,1
3,turtle,have_neck,Animal,1,1
4,turtle,have_mouth,Animal,1,1
...,...,...,...,...,...
16586,vacuum_cleaner,have_variety_of_sizes,tool,1,1
16588,vacuum_cleaner,have_wheels,tool,1,1
16595,vacuum_cleaner,have_wheels that move it forward,tool,1,1
16604,vacuum_cleaner,lives_in_Australia,tool,1,1


In [18]:
gpt_response_unverified_summer = load_gpt_data_summer_unverified(summer_gpt_dir)
convert_summer_yes_df_to_wide(gpt_response_unverified_summer, save_dir, verified=False, matrix_completion = False)

unverified


Feature,are_4.5_inches_in_diameter,are_Abrasive,are_Coated,are_Dense,are_Hard,are_Rigid,are_a_constrictor,are_a_good_swimmer,are_a_hand_tool,are_a_hard_plastic,...,lives_in_Central_America,lives_in_Europe,lives_in_South_America,lives_in_different_climates,lives_in_rainforests_of_South_America,lives_in_tropical_climate,lives_in_various_habitats,lives_in_warm_climate,lives_underground,made_of_aluminum_oxide
Concept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alligator,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
anvil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
axe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
blindworm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
boa,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
caiman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chameleon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
chisel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cobra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
crocodile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# iclr

In [38]:
data_dir = '../../results/reptile_tool'
save_dir = '../data/feature_listing'
flan = load_flan(data_dir, 'flan_feature_listing_feature_list_temperature_0.csv')
convert_yes_df_to_wide(flan, save_dir, save_fname = 'flan_wide.csv', verified = True, matrix_completion = False)

FileNotFoundError: [Errno 2] No such file or directory: '../data/feature_listing/davinci_features_all.xlsx'

In [27]:
flan

Unnamed: 0,Concept,Feature,Yes/No,Category,prompt,gpt_response,Response
0,alligator,are_4.5_inches_in_diameter,0,reptile,"In one word, Yes/No: Are alligators 4.5 inches...",No,1
1,alligator,are_Abrasive,0,reptile,"In one word, Yes/No: Are alligators Abrasive ?",No,1
2,alligator,are_Coated,0,reptile,"In one word, Yes/No: Are alligators Coated ?",No,1
3,alligator,are_Dense,0,reptile,"In one word, Yes/No: Are alligators Dense ?",No,1
4,alligator,are_Hard,1,reptile,"In one word, Yes/No: Are alligators Hard ?",yes,1
...,...,...,...,...,...,...,...
16610,vacuum_cleaner,lives_in_tropical_climate,0,tool,"In one word, Yes/No: Do vacuums live in tropic...",No,1
16611,vacuum_cleaner,lives_in_various_habitats,0,tool,"In one word, Yes/No: Do vacuums live in variou...",No,1
16612,vacuum_cleaner,lives_in_warm_climate,0,tool,"In one word, Yes/No: Do vacuums live in warm c...",No,1
16613,vacuum_cleaner,lives_underground,0,tool,"In one word, Yes/No: Do vacuums live undergrou...",No,1
