#**Risk Metrics**
This notebook find the risk metrics defined in the paper with a word tokenizer. To reproduce the results with XLM-R and ChatGPT tokenizers, uncomment and comment the lines in a bellow cell with the following code:

```
# uncomment the following two lines for XLM-R tokenizer
# model_name = 'xlm-roberta-large'
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# uncomment the next line for tokenizer chatGPT
# tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# uncomment the next line for a punctuation pretokenizer
tokenizer = PunctuationPreTokenizer()
```

It takes about $45$ minutes to run all cells in Google Colab.

In [16]:
# download MUSCLE and CogALex-VI datasets
!gdown '16f6cDL4OKhRCnUAA68mnFuGq99xjvhgP'
!unzip -o muscle_cogalexvi.zip

# download lexical relation datasets: K&H+N, BLESS, ROOT09, EVALution,CogALexV, CogALex2.0
!gdown 1Fm_eTh3XmGAsOgZL_-yKraVuwpQoQn1P
!unzip -o other_lexical_relation_dataset.zip

# download the f1-score results for other models in the literature
!gdown '1qmroi2GeJBdHaZ9By7Y6PZTaFkpDK475'
!unzip -o other_models_results.zip

Downloading...
From: https://drive.google.com/uc?id=16f6cDL4OKhRCnUAA68mnFuGq99xjvhgP
To: /content/muscle_cogalexvi.zip
100% 22.5M/22.5M [00:00<00:00, 80.2MB/s]
Archive:  muscle_cogalexvi.zip
  inflating: CogALexVI/all/italian_test.tsv  
  inflating: CogALexVI/all/test.tsv  
  inflating: CogALexVI/all/train.tsv  
  inflating: CogALexVI/all/val.tsv   
  inflating: CogALexVI/chinese/test.tsv  
  inflating: CogALexVI/chinese/train.tsv  
  inflating: CogALexVI/chinese/val.tsv  
  inflating: CogALexVI/english/test.tsv  
  inflating: CogALexVI/english/train.tsv  
  inflating: CogALexVI/english/val.tsv  
  inflating: CogALexVI/german/test.tsv  
  inflating: CogALexVI/german/train.tsv  
  inflating: CogALexVI/german/val.tsv  
  inflating: CogALexVI/italian/test.tsv  
  inflating: MUSCLE/random_split/test.tsv  
  inflating: MUSCLE/random_split/train.tsv  
  inflating: MUSCLE/semantic_split/test.tsv  
  inflating: MUSCLE/semantic_split/train.tsv  
Downloading...
From: https://drive.google.com/uc

In [2]:
!pip install transformers
!pip install datasets
!pip install --upgrade tiktoken

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/7.2 MB[0m [31m42.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/7.2 MB[0m [31m36.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m4.4/7.2 MB[0m [31m42.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.2/7.2 MB[0m [31m52.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.2/7.2 MB[0m [31m52.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggin

In [3]:
from transformers import AutoTokenizer
from tokenizers import normalizers, pre_tokenizers
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
import copy
from ast import literal_eval
from scipy.stats import entropy
import tiktoken
import math

In [4]:
class PunctuationPreTokenizer:
    def __init__ (self):
        self.normalizer = normalizers.BertNormalizer(lowercase=True, strip_accents=False,)
        self.pre_tok = pre_tokenizers.BertPreTokenizer()
    def encode (self, w):
        w = self.normalizer.normalize_str(w)
        l = self.pre_tok.pre_tokenize_str(w)
        res = [t for t, pos in l]
        return res

# model_name = 'xlm-roberta-large'
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenizer chatGPT
# tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# a punctuation pretokenizer
tokenizer = PunctuationPreTokenizer()

In [5]:
def verb_row(row, template, template_source, template_target, tokenizer):
    w1 = str(row['source'])
    w2 = str(row['target'])
    #lab = str(row['rel']).lower()
    verb_source = re.sub("<W1>", w1, template_source)
    verb_target = re.sub("<W2>", w2, template_target)
    sentence = re.sub("<W1>", w1, template)
    sentence = re.sub("<W2>", w2, sentence)
    if (not type(tokenizer) is tiktoken.core.Encoding) and (not type(tokenizer) is PunctuationPreTokenizer):
        sentence = re.sub("<SEP>", tokenizer.sep_token, sentence)
    else:
        sentence = re.sub("<SEP>", ' ', sentence)

    return {'verb_both':sentence, 'verb_source':verb_source, 'verb_target':verb_target}

def find_distribution_tokens(data, tokenizer):
    distribution = {}
    distribution['both'] = {}
    distribution['source'] = {}
    distribution['target'] = {}

    label_names = [l.lower() for l in data.features['labels'].names]
    distribution['labels'] = label_names

    total_labels = data.features['labels'].num_classes

    for r in data:
        toks_source = set(tokenizer.encode(r['verb_source']))
        toks_target = set(tokenizer.encode(r['verb_target']))
        toks = set(tokenizer.encode(r['verb_both']))
        label = r['labels']

        for tok in toks:
            if not (tok in distribution['both']):
                distribution['both'][tok] = {}
                distribution['both'][tok]['dist'] = np.zeros(total_labels)
                distribution['both'][tok]['type'] = 4
            distribution['both'][tok]['dist'][label] += 1
        for tok_s in toks_source:
            if not (tok_s in distribution['source']):
                distribution['source'][tok_s] = {}
                distribution['source'][tok_s]['dist'] = np.zeros(total_labels)
                distribution['source'][tok_s]['type'] = 4
            distribution['source'][tok_s]['dist'][label] += 1
        for tok_t in toks_target:
            if not (tok_t in distribution['target']):
                distribution['target'][tok_t] = {}
                distribution['target'][tok_t]['dist'] = np.zeros(total_labels)
                distribution['target'][tok_t]['type'] = 4
            distribution['target'][tok_t]['dist'][label] += 1

    return distribution

def calculate_token_type(dist_dict_train, dist_dict_test, split, beta=0.7):
    '''
    function 'calculate_token_type' returns a tuple:
        (total_tokens_in_test, total_indicators, total_indicators_random, total_distractors, total_independent, total_neutral)
    '''
    is_random = 'random' in dist_dict_train['labels']
    is_false = 'false' in dist_dict_train['labels']
    if is_random:
        random_indx = np.where(np.array(dist_dict_train['labels']) == 'random')
    elif is_false:
        random_indx = np.where(np.array(dist_dict_train['labels']) == 'false')
    is_random = is_random or is_false

    tokens_train = set(dist_dict_train[split].keys())
    tokens_test = set(dist_dict_test[split].keys())
    all_tokens = tokens_train.union(tokens_test)
    tokens_intersection = tokens_train.intersection(tokens_test)
    total_toks_intersection = len(tokens_intersection)
    tokens_independent = tokens_test.difference(tokens_intersection)
    for t in tokens_independent:
        dist_dict_test[split][t]['type'] = 3
    total_independent = len(tokens_test) - total_toks_intersection
    val = -math.log(1.0/len(dist_dict_train['labels']))/2
    total_indicators = 0
    total_indicators_random = 0
    total_distractors = 0
    total_neutral = 0
    for t in tokens_intersection:
        dist_train_t = dist_dict_train[split][t]['dist']
        dist_test_t = dist_dict_test[split][t]['dist']

        if np.max(dist_train_t/sum(dist_train_t)) >= beta and np.max(dist_test_t/sum(dist_test_t)) >= beta and np.argmax(dist_test_t) == np.argmax(dist_train_t):
            if is_random and np.argmax(np.array(dist_train_t)) == random_indx:
                total_indicators_random = total_indicators_random + 1
                dist_dict_test[split][t]['type'] = 1
            else:
                total_indicators = total_indicators + 1
                dist_dict_test[split][t]['type'] = 0
        elif np.max(dist_train_t/sum(dist_train_t)) >= beta and np.max(dist_test_t/sum(dist_test_t)) >= beta and np.argmax(dist_test_t) != np.argmax(dist_train_t):
            total_distractors = total_distractors + 1
            dist_dict_test[split][t]['type'] = 2

    return (len(tokens_test),total_indicators, total_indicators_random, total_distractors, total_independent, total_neutral)


In [6]:
datasets_rel = ['K&H+N', 'BLESS', 'ROOT09', 'EVALution', 'CogALexV', 'CogALexVI/all', 'CogALex2.0/all', 'MUSCLE-s', 'MUSCLE-r']

dir_datasets = ['/content/' + f + '/' for f in datasets_rel[:-2]]
dir_datasets.append('/content/MUSCLE/semantic_split/')
dir_datasets.append('/content/MUSCLE/random_split/')

template_source = " <W1>"
template_target = " <W2>"
template = "" + template_source + " " + template_target + ""

res_distributions = {}
total_obs = {}
for f, dir in zip(datasets_rel, dir_datasets):
    train_file = dir + '/train.tsv'
    test_file = dir +'/test.tsv'
    col_names = ['source', 'target', 'labels']
    if 'muscle' in f.lower():
       col_names = ['source', 'target', 'labels', 'lang', 'subject_id', 'object_id', 'prop_id']
    elif f == 'CogALexVI/all' or f == 'CogALex2.0/all':
        col_names = ['source', 'target', 'labels', 'lang']

    data_all = load_dataset('csv', data_files={'train':train_file, 'test':test_file},
                            sep='\t',
                            header=None,
                            names=col_names,
                            keep_default_na=False)

    data_all = data_all.map(lambda x: {'rel':x['labels']})
    data_all = data_all.class_encode_column('labels')
    data_all = data_all.map(verb_row, fn_kwargs={'tokenizer':tokenizer,
                                                'template':template,
                                                'template_source':template_source,
                                                'template_target':template_target})
    langs = ['all']
    if 'muscle' in f.lower() or f == 'CogALexVI/all' or f == 'CogALex2.0/all':
        langs.extend(list(set(data_all['train']['lang'])))

    for lang in langs:
        print("*********")
        print(f + ": "+ lang)
        print("*********")
        if lang != 'all':
            data_all_filt = data_all.filter(lambda x: x['lang'] == lang)
        else:
            data_all_filt = data_all

        res_distributions[f + "_" + lang] = {}
        total_obs[f + "_" + lang] = {}

        dist_toks_train = find_distribution_tokens(data_all_filt['train'], tokenizer)
        dist_toks_test = find_distribution_tokens(data_all_filt['test'], tokenizer)

        for split in ['source', 'target', 'both']:
            res_distributions[f + "_" + lang][split] = calculate_token_type(dist_toks_train, dist_toks_test, split)
            total_obs[f + "_" + lang][split] = np.zeros((data_all_filt['test'].num_rows,4))
            for i, r in enumerate(data_all_filt['test']):
                split_toks = set(tokenizer.encode(r['verb_' + split]))
                for t in split_toks:
                    if sum(dist_toks_test[split][t]['dist']) >= 0:
                        token_type = dist_toks_test[split][t]['type']
                        if 'cogalex' not in f.lower() and 'muscle' not in f.lower():
                            if token_type >= 1:
                                token_type -= 1
                            total_obs[f + "_" + lang][split][i,token_type] += 1
                        else:
                            if token_type != 1:
                                token_type -= 1
                                if token_type < 0:
                                    token_type = 0
                                total_obs[f + "_" + lang][split][i,token_type] += 1



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-33485337dbe8ece2/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-33485337dbe8ece2/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/40256 [00:00<?, ? examples/s]

Map:   0%|          | 0/14377 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/40256 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/14377 [00:00<?, ? examples/s]

Map:   0%|          | 0/40256 [00:00<?, ? examples/s]

Map:   0%|          | 0/14377 [00:00<?, ? examples/s]

*********
K&H+N: all
*********
Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-db1d163c4fdb25dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-db1d163c4fdb25dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/18582 [00:00<?, ? examples/s]

Map:   0%|          | 0/6637 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/18582 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/6637 [00:00<?, ? examples/s]

Map:   0%|          | 0/18582 [00:00<?, ? examples/s]

Map:   0%|          | 0/6637 [00:00<?, ? examples/s]

*********
BLESS: all
*********
Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4e7544e930b4eded/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4e7544e930b4eded/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/8933 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/8933 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3191 [00:00<?, ? examples/s]

Map:   0%|          | 0/8933 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

*********
ROOT09: all
*********
Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-9b3b954f8b879790/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-9b3b954f8b879790/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/5160 [00:00<?, ? examples/s]

Map:   0%|          | 0/1846 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/5160 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1846 [00:00<?, ? examples/s]

Map:   0%|          | 0/5160 [00:00<?, ? examples/s]

Map:   0%|          | 0/1846 [00:00<?, ? examples/s]

*********
EVALution: all
*********
Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ce7d1b4ae40bf6b1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ce7d1b4ae40bf6b1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/3054 [00:00<?, ? examples/s]

Map:   0%|          | 0/4260 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3054 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/4260 [00:00<?, ? examples/s]

Map:   0%|          | 0/3054 [00:00<?, ? examples/s]

Map:   0%|          | 0/4260 [00:00<?, ? examples/s]

*********
CogALexV: all
*********
Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-dd037f23c11238b4/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-dd037f23c11238b4/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/12606 [00:00<?, ? examples/s]

Map:   0%|          | 0/4204 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/12606 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/4204 [00:00<?, ? examples/s]

Map:   0%|          | 0/12606 [00:00<?, ? examples/s]

Map:   0%|          | 0/4204 [00:00<?, ? examples/s]

*********
CogALexVI/all: all
*********
*********
CogALexVI/all: de
*********


Filter:   0%|          | 0/12606 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4204 [00:00<?, ? examples/s]

*********
CogALexVI/all: en
*********


Filter:   0%|          | 0/12606 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4204 [00:00<?, ? examples/s]

*********
CogALexVI/all: zh
*********


Filter:   0%|          | 0/12606 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4204 [00:00<?, ? examples/s]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-93dc6b73058fb939/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-93dc6b73058fb939/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/12606 [00:00<?, ? examples/s]

Map:   0%|          | 0/4204 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/12606 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/4204 [00:00<?, ? examples/s]

Map:   0%|          | 0/12606 [00:00<?, ? examples/s]

Map:   0%|          | 0/4204 [00:00<?, ? examples/s]

*********
CogALex2.0/all: all
*********
*********
CogALex2.0/all: de
*********


Filter:   0%|          | 0/12606 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4204 [00:00<?, ? examples/s]

*********
CogALex2.0/all: en
*********


Filter:   0%|          | 0/12606 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4204 [00:00<?, ? examples/s]

*********
CogALex2.0/all: zh
*********


Filter:   0%|          | 0/12606 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4204 [00:00<?, ? examples/s]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6b4ece9886d3ad55/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6b4ece9886d3ad55/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/190400 [00:00<?, ? examples/s]

Map:   0%|          | 0/196025 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/190400 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/196025 [00:00<?, ? examples/s]

Map:   0%|          | 0/190400 [00:00<?, ? examples/s]

Map:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: all
*********
*********
MUSCLE-s: it
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: zh
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: sr
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: uk
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: id
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: cs
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: de
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: ca
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: ko
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: hu
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: da
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: es
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: ru
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: fi
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: tr
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: ar
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: pl
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: ja
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: en
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: fa
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: he
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: sv
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: pt
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: fr
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

*********
MUSCLE-s: nl
*********


Filter:   0%|          | 0/190400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/196025 [00:00<?, ? examples/s]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-7ff394d9363fcdb9/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-7ff394d9363fcdb9/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/343825 [00:00<?, ? examples/s]

Map:   0%|          | 0/343850 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/343825 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/343850 [00:00<?, ? examples/s]

Map:   0%|          | 0/343825 [00:00<?, ? examples/s]

Map:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: all
*********
*********
MUSCLE-r: it
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: zh
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: sr
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: uk
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: id
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: cs
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: de
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: ca
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: ko
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: hu
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: da
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: es
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: ru
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: fi
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: tr
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: ar
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: pl
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: ja
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: en
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: fa
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: he
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: sv
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: pt
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: fr
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

*********
MUSCLE-r: nl
*********


Filter:   0%|          | 0/343825 [00:00<?, ? examples/s]

Filter:   0%|          | 0/343850 [00:00<?, ? examples/s]

In [7]:
data_type_tokes = pd.DataFrame.from_dict(res_distributions).T
list_dataset_names = data_type_tokes.index.tolist()
list_dataset_muscle = [f for f in list_dataset_names if 'MUSCLE' in f]
list_dataset_names_sorted = [f for f in list_dataset_names if 'MUSCLE' not in f]
list_dataset_muscle.sort()
list_dataset_names_sorted.extend(list_dataset_muscle)
data_type_tokes.reindex(list_dataset_names_sorted)
data_type_tokes

Unnamed: 0,source,target,both
K&H+N_all,"(1365, 406, 326, 20, 27, 0)","(6548, 735, 2684, 91, 2797, 0)","(6984, 525, 2934, 70, 2717, 0)"
BLESS_all,"(200, 0, 0, 0, 0, 0)","(3322, 691, 1019, 77, 1236, 0)","(3329, 523, 1019, 76, 1236, 0)"
ROOT09_all,"(816, 0, 0, 224, 53, 0)","(1558, 236, 172, 263, 467, 0)","(2003, 54, 156, 360, 456, 0)"
EVALution_all,"(972, 87, 0, 144, 98, 0)","(882, 177, 0, 111, 117, 0)","(1331, 88, 0, 96, 69, 0)"
CogALexV_all,"(429, 0, 1, 0, 426, 0)","(1030, 3, 321, 81, 84, 0)","(1165, 1, 370, 52, 182, 0)"
...,...,...,...
MUSCLE-r_he,"(4972, 192, 2058, 256, 860, 0)","(4885, 142, 2199, 211, 801, 0)","(6376, 132, 2857, 248, 653, 0)"
MUSCLE-r_sv,"(5070, 210, 2057, 307, 1038, 0)","(4929, 169, 2168, 267, 938, 0)","(6656, 146, 2853, 304, 758, 0)"
MUSCLE-r_pt,"(4763, 164, 1994, 241, 832, 0)","(4620, 131, 2094, 206, 759, 0)","(6070, 109, 2739, 247, 606, 0)"
MUSCLE-r_fr,"(4753, 167, 1990, 236, 843, 0)","(4588, 127, 2088, 198, 735, 0)","(6049, 114, 2727, 237, 609, 0)"


In [9]:
data_type_tokes_perc = data_type_tokes.copy()
for i in range(data_type_tokes_perc.shape[0]):
    for j in range(data_type_tokes_perc.shape[1]):
        data_type_tokes_perc.iloc[i,j] = np.around(np.array(data_type_tokes_perc.iloc[i,j])/data_type_tokes_perc.iloc[i,j][0],2)[1:-1]
data_type_tokes_perc_viz = data_type_tokes_perc.copy()
for i in range(data_type_tokes_perc_viz.shape[0]):
    for j in range(data_type_tokes_perc_viz.shape[1]):
        data_type_tokes_perc_viz.iloc[i,j] = ["%.2f" % y for y in data_type_tokes_perc_viz.iloc[i,j] ]

In [10]:
data_type_tokes_perc_viz

Unnamed: 0,source,target,both
K&H+N_all,"[0.30, 0.24, 0.01, 0.02]","[0.11, 0.41, 0.01, 0.43]","[0.08, 0.42, 0.01, 0.39]"
BLESS_all,"[0.00, 0.00, 0.00, 0.00]","[0.21, 0.31, 0.02, 0.37]","[0.16, 0.31, 0.02, 0.37]"
ROOT09_all,"[0.00, 0.00, 0.27, 0.06]","[0.15, 0.11, 0.17, 0.30]","[0.03, 0.08, 0.18, 0.23]"
EVALution_all,"[0.09, 0.00, 0.15, 0.10]","[0.20, 0.00, 0.13, 0.13]","[0.07, 0.00, 0.07, 0.05]"
CogALexV_all,"[0.00, 0.00, 0.00, 0.99]","[0.00, 0.31, 0.08, 0.08]","[0.00, 0.32, 0.04, 0.16]"
...,...,...,...
MUSCLE-r_he,"[0.04, 0.41, 0.05, 0.17]","[0.03, 0.45, 0.04, 0.16]","[0.02, 0.45, 0.04, 0.10]"
MUSCLE-r_sv,"[0.04, 0.41, 0.06, 0.20]","[0.03, 0.44, 0.05, 0.19]","[0.02, 0.43, 0.05, 0.11]"
MUSCLE-r_pt,"[0.03, 0.42, 0.05, 0.17]","[0.03, 0.45, 0.04, 0.16]","[0.02, 0.45, 0.04, 0.10]"
MUSCLE-r_fr,"[0.04, 0.42, 0.05, 0.18]","[0.03, 0.46, 0.04, 0.16]","[0.02, 0.45, 0.04, 0.10]"


In [11]:
datos_obs = pd.DataFrame.from_dict(total_obs).T

datos_obs_viz = copy.deepcopy(datos_obs)

for i in range(datos_obs_viz.shape[0]):
    for j in range(datos_obs_viz.shape[1]):
        mat = datos_obs.iloc[i,j]
        for k in range(mat.shape[0]):
                v = mat[k,:-1]
                for l in range(len(v)):
                    if v[l] != 0 and sum(v) == v[l]:
                        datos_obs.iloc[i,j][k,l] = 1
                    else:
                        datos_obs.iloc[i,j][k,l] = 0

        datos_obs_viz.iloc[i,j] = np.mean(np.where(datos_obs.iloc[i,j] != 0, 1, 0),axis=0)

for i in range(datos_obs_viz.shape[0]):
    for j in range(datos_obs_viz.shape[1]):
        datos_obs_viz.iloc[i,j] = [round(100*round(y,3),1) for y in datos_obs_viz.iloc[i,j][:-1] ]


In [12]:
datos_obs_viz.sort_index()

Unnamed: 0,source,target,both
BLESS_all,"[0.0, 0.0, 0.0]","[65.2, 1.5, 20.1]","[56.0, 1.5, 20.1]"
CogALex2.0/all_all,"[1.6, 2.9, 10.3]","[5.5, 20.0, 30.4]","[3.9, 20.0, 29.7]"
CogALex2.0/all_de,"[0.1, 0.4, 7.3]","[3.6, 25.7, 38.9]","[2.6, 24.0, 39.2]"
CogALex2.0/all_en,"[0.2, 0.5, 10.9]","[3.3, 21.1, 28.7]","[2.9, 19.8, 30.3]"
CogALex2.0/all_zh,"[7.2, 12.8, 15.4]","[13.7, 7.6, 19.5]","[8.5, 12.8, 12.1]"
...,...,...,...
MUSCLE-s_sv,"[0.3, 0.7, 95.4]","[0.2, 1.6, 94.0]","[0.0, 0.1, 99.4]"
MUSCLE-s_tr,"[0.1, 2.2, 85.2]","[0.2, 2.7, 84.7]","[0.0, 0.6, 96.0]"
MUSCLE-s_uk,"[0.2, 1.8, 91.4]","[0.1, 1.8, 90.4]","[0.0, 0.2, 98.1]"
MUSCLE-s_zh,"[0.2, 4.7, 33.9]","[0.1, 5.0, 30.7]","[0.0, 3.4, 45.1]"


In [14]:
def get_max_dataframe(raw_token_type):
    list_vals_max_source_target = []
    list_indp = []
    for i, d in raw_token_type.iterrows():
        if 'muscle' not in i:
            list_vals_max_source_target.append(np.max(np.stack(d), axis=0)[:-1])
            list_indp.append(d[2][2])

    max_token_type = np.stack(list_vals_max_source_target)
    indps = np.array(list_indp)
    max_token_type = np.c_[max_token_type, indps]
    max_token_type=pd.DataFrame(max_token_type)
    max_token_type.index = raw_token_type.index
    max_token_type.columns = ['inds', 'dist', 'indp']

    return max_token_type

token_type_df = get_max_dataframe(datos_obs_viz)
token_type_df.sort_index()

Unnamed: 0,inds,dist,indp
BLESS_all,65.2,1.5,20.1
CogALex2.0/all_all,5.5,20.0,29.7
CogALex2.0/all_de,3.6,25.7,39.2
CogALex2.0/all_en,3.3,21.1,30.3
CogALex2.0/all_zh,13.7,12.8,12.1
...,...,...,...
MUSCLE-s_sv,0.3,1.6,99.4
MUSCLE-s_tr,0.2,2.7,96.0
MUSCLE-s_uk,0.2,1.8,98.1
MUSCLE-s_zh,0.2,5.0,45.1


In [17]:
res_1 = pd.read_csv('/content/other_models_results/res_models_khn_bless_eval_root_cog.csv', index_col=0)
res_2 = pd.read_csv('/content/other_models_results/res_cogv.csv', index_col = 0)

In [18]:
print(res_1.mean())
print(res_2.mean())

K&H+N_all        0.9812
BLESS_all        0.9298
EVALution_all    0.6690
ROOT09_all       0.8796
CogALexV_all     0.5644
dtype: float64
CogALexVI/all_en     0.601500
CogALexVI/all_de     0.630333
CogALexVI/all_zh     0.902667
CogALex2.0/all_en    0.793333
CogALex2.0/all_de    0.685000
dtype: float64


In [19]:
def print_evidence_for(token_type, token_type_stats, list_res):
    list_v = []
    for res in list_res:
        for d in res.columns:
            m = np.stack([token_type_stats.loc[d,token_type] for i in range(res.shape[0])])
            m = np.c_[m,res[d]]
            list_v.append(m)

    total = np.concatenate(list_v, axis = 0)
    total = total[np.logical_not(np.isnan(total).any(axis=1))]

    # linear regression
    X = total[:,:-1]
    y = total[:,-1]
    X = sm.add_constant(X)
    mod = sm.OLS(y, X)
    res = mod.fit()
    print(res.summary())

In [20]:
print_evidence_for('inds', token_type_df, [res_1,res_2])

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.531
Method:                 Least Squares   F-statistic:                     46.34
Date:                Thu, 06 Jul 2023   Prob (F-statistic):           3.95e-08
Time:                        11:36:21   Log-Likelihood:                 31.822
No. Observations:                  41   AIC:                            -59.64
Df Residuals:                      39   BIC:                            -56.22
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6520      0.025     26.240      0.0

In [21]:
print_evidence_for('dist', token_type_df, [res_1,res_2])

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.216
Model:                            OLS   Adj. R-squared:                  0.196
Method:                 Least Squares   F-statistic:                     10.74
Date:                Thu, 06 Jul 2023   Prob (F-statistic):            0.00221
Time:                        11:36:24   Log-Likelihood:                 20.758
No. Observations:                  41   AIC:                            -37.52
Df Residuals:                      39   BIC:                            -34.09
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8691      0.038     22.713      0.0

In [22]:
print_evidence_for('indp', token_type_df, [res_1,res_2])

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.132
Model:                            OLS   Adj. R-squared:                  0.109
Method:                 Least Squares   F-statistic:                     5.908
Date:                Thu, 06 Jul 2023   Prob (F-statistic):             0.0198
Time:                        11:36:29   Log-Likelihood:                 18.661
No. Observations:                  41   AIC:                            -33.32
Df Residuals:                      39   BIC:                            -29.90
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8997      0.059     15.294      0.0