In [None]:
# Fetch the zipped experiment results from Google Drive (by file id) and
# unpack them; `-o` overwrites files left over from a previous run.
!gdown '1bATOtMuAKIrdrFfFyJf5HSZA1nV-Fpl5'
!unzip -o results.zip

Downloading...
From: https://drive.google.com/uc?id=1bATOtMuAKIrdrFfFyJf5HSZA1nV-Fpl5
To: /content/results.zip
100% 127M/127M [00:05<00:00, 22.6MB/s]
Archive:  results.zip
  inflating: results/CogALex2.0-en_I01_23-10-19_18-11-37.txt  
  inflating: results/CogALex2.0-en_I01_23-10-19_18-11-37.csv  
  inflating: results/CogALex2.0-en_I00_23-10-19_17-59-29.txt  
  inflating: results/CogALex2.0-en_I00_23-10-19_17-59-29.csv  
  inflating: results/CogALex2.0-all_I00_23-06-11_13-25-14.txt  
  inflating: results/CogALex2.0-all_I01_23-06-11_13-54-16.txt  
  inflating: results/CogALexVI-zh_I00_23-06-02_09-15-37.csv  
  inflating: results/CogALexVI-zh_I00_23-06-02_11-37-09.txt  
  inflating: results/CogALex2.0-all_I01_23-06-11_13-54-16.csv  
  inflating: results/CogALex2.0-all_I00_23-06-11_13-25-14.csv  
  inflating: results/CogALexVI-en_I01_23-06-02_18-17-00.txt  
  inflating: results/CogALexVI-zh_I00_23-06-02_09-15-37.txt  
  inflating: results/MUSCLE-r-zh_I00_23-06-02_16-25-30.txt  
  inflating

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import glob
import ast
import re
import copy
import math
from scipy import stats

# **PROCESS FINE-TUNING RESULTS**

In [None]:
# Directories scanned for result files: every `*.txt` file found here is
# paired with the `.csv` file of the same base name.
LIST_DIR_RES = ['/content/results/']

In [None]:
def get_dataset_name(param):
    '''
    Build the experiment identifier used as the first key of the results
    dictionary from the parameters of one fine-tuning run.

    param: dictionary of run arguments. 'train_file' is always read.
        For MUSCLE runs 'model', 'langs' and 'bidirectional' are read as well;
        for CogALex runs only 'langs' is read.

    Returns a name such as 'muscle-r', 'muscle-s-rob-en', 'muscle-s-u',
    'cogalexvi-zh' or 'cogalex2'.

    Raises Exception when 'train_file' matches none of the known datasets.
    '''
    train_file = param['train_file'].lower()
    dataset_name = ''
    if 'muscle' in train_file:
        # split type: random or semantic
        if 'random_split' in train_file:
            dataset_name = 'muscle-r'
        elif 'semantic_split' in train_file:
            dataset_name = 'muscle-s'
        if param['model'].lower() == 'roberta-large':
            dataset_name = dataset_name + "-rob"
        if param['langs'] is not None:
            if len(param['langs']) == 3:
                dataset_name = dataset_name + "-en-zh-de"
            elif len(param['langs']) == 1:
                dataset_name = dataset_name + "-" + param['langs'][0]
        # bidirectional == 0 marks the unidirectional experiments
        if param['bidirectional'] == 0:
            dataset_name = dataset_name + "-u"
    elif 'cogalexvi' in train_file:
        dataset_name = 'cogalexvi'
        if param['langs'] is not None and len(param['langs']) == 1:
            dataset_name = dataset_name + "-" + param['langs'][0]
    elif 'cogalex2' in train_file:
        dataset_name = 'cogalex2'
        if param['langs'] is not None and len(param['langs']) == 1:
            dataset_name = dataset_name + "-" + param['langs'][0]
    else:
        # Report the value that was actually inspected. The previous code
        # read the key 'train_dataset', which is not used anywhere else, so
        # this branch failed with a KeyError instead of this message.
        raise Exception("Error. Bad dataset name: " + param['train_file'])
    return dataset_name

def calculate_weighted_f1_no_random(dict_report):
    '''
    Calculate the f1-score averaged over all labels except the random label,
    weighting each label by its support.

    dict_report: report dictionary in the format produced by
        'classification_report' (sklearn) with output_dict=True.

    Returns the weighted f1-score, or -1 when the report has no 'random'
    label or when the remaining labels have no support at all.
    '''
    # Rows of the report that are not individual (non-random) labels.
    except_list = ['accuracy', 'micro avg', 'macro avg', 'weighted avg',
                   'samples avg', 'std_accuracy', 'random',
                   'macro avg not random', 'weighted avg not random']
    if 'random' not in dict_report:
        return -1

    # The sums start at 0: accumulating on top of the -1 sentinel (as the
    # previous code did) shifts every result by -1/total_support.
    weighted_f1_sum = 0
    total_support_no_random = 0
    for label, scores in dict_report.items():
        if label.lower() in except_list:
            continue
        weighted_f1_sum += scores['support'] * scores['f1-score']
        total_support_no_random += scores['support']

    if total_support_no_random == 0:
        # nothing to average over
        return -1
    return weighted_f1_sum / total_support_no_random

def add_no_random_measures(dict_report):
    '''
    Add to dict_report the macro and weighted averages computed without the
    random label.

    dict_report: report dictionary in the format produced by
        'classification_report' (sklearn) with output_dict=True. It is
        modified in place; nothing is returned.

    The keys 'macro avg not random' and 'weighted avg not random' are always
    created. They are filled with 'precision', 'recall' and 'f1-score' only
    when the report has a 'random' label; otherwise they stay empty.
    A value is NaN when it cannot be computed (random is the only label, or
    random holds all the support).
    '''
    # Rows that are summaries rather than labels. Counting the label rows
    # explicitly (instead of len(keys) - 3) keeps the result right when the
    # function is applied again to an already augmented report.
    aggregate_keys = ('accuracy', 'micro avg', 'macro avg', 'weighted avg',
                      'samples avg', 'macro avg not random',
                      'weighted avg not random')
    num_labels = sum(1 for key, value in dict_report.items()
                     if isinstance(value, dict) and key not in aggregate_keys)

    dict_report['macro avg not random'] = {}
    dict_report['weighted avg not random'] = {}
    if 'random' not in dict_report:
        return

    supp_random = dict_report['random']['support']
    supp = dict_report['macro avg']['support']
    for k in ['precision', 'recall', 'f1-score']:
        val_k_macro = dict_report['macro avg'][k]
        val_k_weigth = dict_report['weighted avg'][k]
        val_k_random = dict_report['random'][k]

        # macro avg = sum of label scores / num_labels, so take the random
        # score out of the sum and divide by one label fewer
        if num_labels > 1:
            macro_no_random = (val_k_macro*num_labels - val_k_random)/(num_labels-1)
        else:
            macro_no_random = float('nan')

        # weighted avg = sum of support*score / total support, so take the
        # random contribution out of both the sum and the total
        if supp != supp_random:
            weighted_no_random = (val_k_weigth*supp - supp_random*val_k_random)/(supp-supp_random)
        else:
            weighted_no_random = float('nan')

        dict_report['macro avg not random'][k] = macro_no_random
        dict_report['weighted avg not random'][k] = weighted_no_random


def get_reports_langs(res_xl):
    '''
    Compute one classification report per language.

    res_xl: DataFrame of predictions with the columns 'lang', 'real_rel'
        (gold relation) and 'pred_rel' (predicted relation).

    Returns a dictionary, filled in sorted language order, that maps each
    language to its report. Labels are lower-cased before scoring and each
    report is extended with the averages that leave out the random label.
    '''
    dict_report_langs = {}
    for lang in sorted(res_xl['lang'].unique().tolist()):
        rows_of_lang = res_xl[res_xl['lang'] == lang]
        gold = [rel.lower() for rel in rows_of_lang['real_rel']]
        predicted = [rel.lower() for rel in rows_of_lang['pred_rel']]
        lang_report = classification_report(y_true=gold, y_pred=predicted,
                                            output_dict=True)
        add_no_random_measures(lang_report)
        dict_report_langs[lang] = lang_report
    return dict_report_langs

In [None]:
# dict_res[dataset][model][template]['report'][lang] -> list with one report
# per run; lang is 'all' for the report stored in the txt file, or a language
# code for the reports recomputed from the csv file.
dict_res = {}

for res_dir in LIST_DIR_RES:
    for txt_file in glob.glob(res_dir + "*.txt"):
        # the csv file shares the base name of the txt file
        csv_file = re.sub("txt$", "csv", txt_file)

        # read and process txt results file
        with open(txt_file) as ftxt:
            # line 1: arguments
            par = ast.literal_eval(ftxt.readline())
            dataset_name = get_dataset_name(par)
            print('Processing dataset: ' + dataset_name)
            # line 2: ini and end dates
            ftxt.readline()
            # line 3: report
            report = ast.literal_eval(ftxt.readline().lower())
            add_no_random_measures(report)

        # read csv results file
        res_csv = pd.read_csv(csv_file, quotechar='"', keep_default_na=False)
        # Reset for every file. The old code reset a differently spelled
        # variable, so a csv without a 'lang' column either failed with a
        # NameError or re-used the language reports of the previous file.
        reports_langs = {}
        if 'lang' in res_csv.columns.tolist():
            print('    Calculating language reports...')
            reports_langs = get_reports_langs(res_csv)

        # save csv and txt files to dict_res
        reports_by_lang = (
            dict_res.setdefault(dataset_name.lower(), {})
                    .setdefault(par['model'].lower(), {})
                    .setdefault(par['train_templates'][0].lower(), {})
                    .setdefault('report', {})
        )
        reports_by_lang.setdefault('all', []).append(report)
        for lang, lang_report in reports_langs.items():
            reports_by_lang.setdefault(lang, []).append(lang_report)

Processing dataset: muscle-r-de
    Calculating language reports...
Processing dataset: cogalex2
    Calculating language reports...
Processing dataset: cogalexvi
    Calculating language reports...
Processing dataset: muscle-s-zh
    Calculating language reports...
Processing dataset: muscle-r-en
    Calculating language reports...
Processing dataset: muscle-s-rob-en
    Calculating language reports...
Processing dataset: muscle-r-rob-en
    Calculating language reports...
Processing dataset: muscle-r
    Calculating language reports...
Processing dataset: cogalex2
    Calculating language reports...
Processing dataset: muscle-s-en-zh-de
    Calculating language reports...
Processing dataset: muscle-s-de
    Calculating language reports...
Processing dataset: muscle-s-en
    Calculating language reports...
Processing dataset: muscle-s-u
    Calculating language reports...
Processing dataset: muscle-s
    Calculating language reports...
Processing dataset: muscle-s-zh
    Calculating l

In [None]:
def get_value(one_report, keys):
    '''
    Return the entry of the nested dictionary one_report reached by
    indexing with each element of keys in turn. With no keys, one_report
    itself is returned.
    '''
    node = one_report
    for key in keys:
        node = node[key]
    return node

def get_values(list_reports, keys):
    '''
    Collect, as a numpy array, the entry found under the path keys in each
    report of list_reports (same order as the list).
    '''
    collected = []
    for report in list_reports:
        collected.append(get_value(report, keys))
    return np.array(collected)

def calculateMeansRec(list_reports, one_dict, past_keys, exclude_keys):
    '''
    Walk one_dict recursively and replace, in place, every non-dictionary
    value by the mean of the values found at the same path in all reports
    of list_reports. The standard deviation (numpy default) is stored next
    to it under the key 'std_' + key.

    past_keys: path of keys that leads from the report root to one_dict.
    exclude_keys: keys that are left untouched at any level.
    '''
    # iterate over a snapshot: 'std_*' keys are inserted during the walk
    for k in tuple(one_dict):
        if k in exclude_keys:
            continue
        path = [*past_keys, k]
        if isinstance(one_dict[k], dict):
            calculateMeansRec(list_reports, one_dict[k], path, exclude_keys)
            continue
        values = get_values(list_reports, path)
        one_dict[k] = values.mean()
        one_dict['std_' + k] = values.std()

def flat_list_reportsRec(list_reports, one_dict, past_keys, exclude_keys):
    '''
    Walk one_dict recursively and replace, in place, every non-dictionary
    value by the list of the values found at the same path in all reports
    of list_reports.

    past_keys: path of keys that leads from the report root to one_dict.
    exclude_keys: keys that are left untouched at any level.
    '''
    for k in tuple(one_dict):
        if k in exclude_keys:
            continue
        path = [*past_keys, k]
        if isinstance(one_dict[k], dict):
            flat_list_reportsRec(list_reports, one_dict[k], path, exclude_keys)
            continue
        one_dict[k] = get_values(list_reports, path).tolist()

def calculateMeans(list_reports):
    '''
    Summarise a list of structurally equal reports into a single report.
    A report is a dictionary whose values are either a dictionary or a
    real number.

    With two or more reports every number (except 'support') becomes the
    mean over the reports and a 'std_<key>' entry is added beside it.
    With a single report, a deep copy of it is returned unchanged.
    '''
    means_report = copy.deepcopy(list_reports[0])
    if len(list_reports) <= 1:
        # nothing to average
        return means_report

    calculateMeansRec(list_reports, means_report,
                      past_keys=[], exclude_keys=['support'])
    return means_report

def flat_list_reports(list_reports):
    '''
    Merge a list of structurally equal reports into one report of the same
    structure in which every real number (except 'support') is replaced by
    the list of the values it takes across the reports.
    A report is a dictionary whose values are either a dictionary or a
    real number. The input reports are not modified.
    '''
    merged = copy.deepcopy(list_reports[0])
    flat_list_reportsRec(list_reports, merged, [], ['support'])
    return merged

In [None]:
# For every dataset / model / template, summarise the collected runs per
# language: 'mean_report' holds means and stds, 'flat_reports' the raw lists.
for d, models in dict_res.items():
    print(d.upper())
    for m, templates in models.items():
        print(" -".join(['Calculating:', d, m]))
        for t, entry in templates.items():
            print(" -".join(["    ", t ]))
            mean_by_lang = entry['mean_report'] = {}
            flat_by_lang = entry['flat_reports'] = {}
            for l, list_reports in entry['report'].items():
                mean_by_lang[l] = calculateMeans(list_reports)
                flat_by_lang[l] = flat_list_reports(list_reports)

MUSCLE-R-DE
Calculating: -muscle-r-de -xlm-roberta-large
     -' <w1> ' <sep> ' <w2> '
COGALEX2
Calculating: -cogalex2 -xlm-roberta-large
     -' <w1> ' <sep> ' <w2> '
COGALEXVI
Calculating: -cogalexvi -xlm-roberta-large
     -' <w1> ' <sep> ' <w2> '
MUSCLE-S-ZH
Calculating: -muscle-s-zh -xlm-roberta-large
     -' <w1> ' <sep> ' <w2> '
MUSCLE-R-EN
Calculating: -muscle-r-en -xlm-roberta-large
     -' <w1> ' <sep> ' <w2> '
MUSCLE-S-ROB-EN
Calculating: -muscle-s-rob-en -roberta-large
     -' <w1> ' <sep> ' <w2> '
MUSCLE-R-ROB-EN
Calculating: -muscle-r-rob-en -roberta-large
     -' <w1> ' <sep> ' <w2> '
MUSCLE-R
Calculating: -muscle-r -xlm-roberta-large
     -' <w1> ' <sep> ' <w2> '
MUSCLE-S-EN-ZH-DE
Calculating: -muscle-s-en-zh-de -xlm-roberta-large
     -' <w1> ' <sep> ' <w2> '
MUSCLE-S-DE
Calculating: -muscle-s-de -xlm-roberta-large
     -' <w1> ' <sep> ' <w2> '
MUSCLE-S-EN
Calculating: -muscle-s-en -xlm-roberta-large
     -' <w1> ' <sep> ' <w2> '
MUSCLE-S-U
Calculating: -muscle-s-u -xl

In [None]:
def get_dataframe_results(dict_res, dataset_name,
                          model='xlm-roberta-large',
                          template="' <w1> ' <sep> ' <w2> '",
                          list_sub_measures=['precision', 'recall', 'f1-score']):
    '''
    Build a table with the mean results of one experiment.

    dict_res: results dictionary; the reports are read from
        dict_res[dataset_name][model][template]['mean_report'].
    list_sub_measures: measures to show for every row of the report.

    Returns a DataFrame with one row per language (index level 'lang') and
    two-level columns (report row, measure). The 'accuracy' and
    'std_accuracy' entries are left out. The column header is taken from
    the first language, so all languages are expected to share the same
    report rows.
    '''
    skipped = ['accuracy', 'std_accuracy']
    reports_by_lang = dict_res[dataset_name][model][template]['mean_report']

    table = {'lang': []}
    header = [('', 'lang')]
    for position, (lang, lang_report) in enumerate(reports_by_lang.items()):
        table['lang'].append(lang)
        for row_name, scores in lang_report.items():
            if row_name in skipped:
                continue
            for meas in list_sub_measures:
                table.setdefault(row_name + "_" + meas, []).append(scores[meas])
                # the header is only collected while reading the first language
                if position == 0:
                    header.append((row_name, meas))

    res_df = pd.DataFrame.from_dict(table)
    res_df.index = pd.MultiIndex.from_frame(res_df.iloc[:, [0]])
    res_df.columns = pd.MultiIndex.from_tuples(header)
    # the language now lives in the index; drop its column
    return res_df.iloc[:, 1:]

## **Visualize experiment results**
Each dataset name corresponds to an experiment:

|dataset_name|model|experiment|
|--|--|--|
|`muscle-r`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the random split of muscle|
|`muscle-s`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the semantic split of muscle|
|`muscle-r-en-zh-de`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the random split of muscle and using only languages: en, de, zh|
|`muscle-r-en`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the random split of muscle and using only language: en|
|`muscle-r-de`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the random split of muscle and using only language: de|
|`muscle-r-zh`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the random split of muscle and using only language: zh|
|`muscle-s-en-zh-de`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the semantic split of muscle and using only languages: en, de, zh|
|`muscle-s-en`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the semantic split of muscle and using only language: en|
|`muscle-s-de`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the semantic split of muscle and using only language: de|
|`muscle-s-zh`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the semantic split of muscle and using only language: zh|
|`muscle-r-rob-en`|`roberta-large`|Fine-tuning roberta-large with the random split of muscle and using only language: en|
|`muscle-s-rob-en`|`roberta-large`|Fine-tuning roberta-large with the semantic split of muscle and using only language: en|
|`muscle-s-u`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with the semantic split of muscle converting hypo/hyper and mero/holo relations into inclusion/contain. Unidirectional experiments|
|`cogalexvi`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with CogALexVI dataset. It contains the following languages: en, de, zh|
|`cogalexvi-en`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with CogALexVI dataset trained only with en|
|`cogalexvi-zh`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with CogALexVI dataset trained only with zh|
|`cogalex2`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with CogALex 2.0 dataset. It contains the following languages: en, de, zh|
|`cogalex2-en`|`xlm-roberta-large`|Fine-tuning xlm-roberta-large with CogALex 2.0 dataset trained only with en.|

The model trained only with `de` in CogALexVI did not converge in any experiment.

To get the results of an experiment, use the cell below, changing the values of the parameters `dataset_name` and `model` to the values given in the table above.


In [None]:
list_sub_measures = ['f1-score'] # 'precision' and/or 'recall' can also be added
d = get_dataframe_results(dict_res,
                          dataset_name = 'cogalex2-en',
                          model='xlm-roberta-large',
                          list_sub_measures=list_sub_measures)

# Report rows that are not shown:
#   'random'       - remove it from the list to show the random label results
#   'macro avg'    - it includes the random label
#   'weighted avg' - it includes the random label
hidden_labels = ['random', 'macro avg', 'weighted avg']
d = d.loc[:, [label not in hidden_labels for label, _ in d.columns]]

# reorder columns: when the fifth label is the meronym one, move it to the
# third position. Working on labels instead of raw column positions keeps
# this valid with several sub-measures and avoids an IndexError on tables
# with fewer than five columns.
labels = list(dict.fromkeys(label for label, _ in d.columns))
if len(labels) > 4 and "mero" in labels[4]:
    labels.insert(2, labels.pop(4))
cols = [c for label in labels for c in d.columns if c[0] == label]

d[cols].apply(round, ndigits=3)

Unnamed: 0_level_0,ant,hyp,syn,macro avg not random,weighted avg not random
Unnamed: 0_level_1,f1-score,f1-score,f1-score,f1-score,f1-score
lang,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
all,0.736,0.626,0.643,0.669,0.671
en,0.736,0.626,0.643,0.669,0.671


# **PROCESS CHATGPT RESULTS**

In [None]:
# Load the ChatGPT results table. keep_default_na=False stops pandas from
# turning empty or "NA"-like cells into NaN, so they stay plain strings.
results_chatgpt = pd.read_csv('/content/results/chatgpt/chatgpt-results-en-dirty.csv', quotechar='"',keep_default_na=False)

In [None]:
# Show the distinct values of the 'chatgpt_label' column.
results_chatgpt['chatgpt_label'].unique()

array(['random', 'hyponym of', 'hyperonym of', 'meronym of',
       'violates specifications', 'antonym of', 'holonym of'],
      dtype=object)

In [None]:
# Display the loaded table for a visual check.
results_chatgpt

Unnamed: 0,rowid,source,target,labels,labels_code,lang,subject_id,object_id,prop_id,id,chatgpt_label,chatgpt_raw
0,0,Akkadian,tempo,random,5,en,Q35518,Q189214,random,4181,random,unrelated to
1,1,programming language,pseudocode,hyperonym for,2,en,Q9143,Q189224,P279_inv,1682,hyponym of,hyponym of
2,2,Phocidae,travel,random,5,en,Q25587,Q61509,random,4831,random,unrelated to
3,3,peptidase,thiol,random,5,en,Q212410,Q220410,random,7736,hyponym of,hyponym of
4,4,mixture,lubricant,hyperonym for,2,en,Q169336,Q323840,P279_inv,5957,random,unrelated to
...,...,...,...,...,...,...,...,...,...,...,...,...
7836,7836,spaceport,parity,random,5,en,Q194188,Q230967,random,6795,random,"""spaceport"" unrelated to ""parity"""
7837,7837,sambo,jade,random,5,en,Q106500,Q175089,random,5889,random,"""sambo"" unrelated to ""jade"""
7838,7838,hydrology,hydrography,holonym for,1,en,Q42250,Q182468,P527,939,hyponym of,"""hydrology"" hyponym of ""hydrography"""
7839,7839,chemical formula,Crohn's disease,random,5,en,Q83147,Q1472,random,7300,random,"""chemical formula"" unrelated to ""Crohn's disease"""


In [None]:
# Keep only the rows whose ChatGPT label does not contain 'violate', and
# reduce both the gold and the predicted label to their first word
# (e.g. 'hyperonym for' and 'hyperonym of' both become 'hyperonym').
filter_violates = ['violate' not in label.strip()
                   for label in results_chatgpt['chatgpt_label']]
real, pred = (
    [text.strip().partition(' ')[0]
     for text in results_chatgpt.loc[filter_violates, column]]
    for column in ('labels', 'chatgpt_label')
)

In [None]:
# Score the ChatGPT predictions against the gold labels, then extend the
# report in place with the averages that leave out the random label.
dict_res_chatgpt = classification_report(real, pred, output_dict=True)
add_no_random_measures(dict_res_chatgpt)

In [None]:
# Store the ChatGPT report in dict_res with the same nesting as the
# fine-tuning results (dataset -> model -> template -> 'mean_report' -> lang)
# so that get_dataframe_results can read it.
dict_res['muscle-chatgpt'] = {
    'chatgpt': {
        'Fill blank': {
            'mean_report': {
                'en': dict_res_chatgpt,
            },
        },
    },
}


In [None]:
list_sub_measures = ['f1-score']  # 'precision' and/or 'recall' can also be added
d = get_dataframe_results(dict_res,
                          dataset_name='muscle-chatgpt',
                          model='chatgpt',
                          template='Fill blank',
                          list_sub_measures=list_sub_measures)

# Report rows that are not shown:
#   'random'       - remove it from the tuple to show the random label results
#   'macro avg'    - it includes the random label
#   'weighted avg' - it includes the random label
for hidden_label in ('random', 'macro avg', 'weighted avg'):
    hidden_columns = [(hidden_label, m) for m in list_sub_measures]
    d = d.loc[:, [c not in hidden_columns for c in d.columns]]

d.apply(round, ndigits=3)

Unnamed: 0_level_0,antonym,holonym,hyperonym,hyponym,meronym,macro avg not random,weighted avg not random
Unnamed: 0_level_1,f1-score,f1-score,f1-score,f1-score,f1-score,f1-score,f1-score
lang,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
en,0.626,0.16,0.383,0.466,0.175,0.362,0.342
