# GOLD STANDARD - HUMAN AGREEMENTS

## Utilities

In [1]:
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
import os
import json
import re
import string
import pandas as pd

# Make sure the NLTK stop-word corpus is available locally (the download is
# skipped when the package is already up to date), then keep the English
# stop words in a set so the membership tests done while cleaning the
# definitions stay cheap.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/raky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import sys
sys.path.insert(0, '../../')
from commons_init import SYNSET_DA_EVITARE, SUPPORTED_POS

## Agreement

### Estrazione e processing dataset
1. Dato un file .json, ricaviamo il dizionario associato.
2. Splittiamo il campo dict['dataset'] in tre campi: 'synset', 'term', 'definition'.
3. All'interno del dizionario sanitizziamo i campi categoriali:
    - Campo 'synset': Synset('war.n.01') --> "war.n.01"
    - Campo 'term': 'war, warfare' --> ['war', 'warfare']
    - Campo 'definition': tokenizzazione e rimozione di stop-words e punteggiatura
4. Costruiamo un DataFrame con il dizionario così ottenuto.

In [3]:
# Folder scanned for the gold-standard *.json annotation files
# (presumably one file per annotator — confirm against the data folder).
FOLDER_PATH = "./gold_standard"

def get_json_dictionary(folder_path, filename):
    """Load one JSON file and return its decoded content.

    Args:
        folder_path: Directory that contains the file.
        filename: Name of the JSON file inside ``folder_path``.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    file_path = os.path.join(folder_path, filename)
    # The context manager closes the handle even when json.load raises;
    # JSON is read as UTF-8 regardless of the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
   
def filter_dictionary(dictionary):
    """Split the raw 'dataset' entries into synset, term and definition lists.

    Each element of ``dictionary['dataset']`` is treated as one string made
    of three ':'-separated parts: the synset representation, the terms and
    the definition. The dictionary is modified in place: the keys 'i',
    'date' and 'dataset' are removed and the keys 'synset', 'term' and
    'definition' are added.

    Args:
        dictionary: Decoded JSON content of one annotation file.

    Returns:
        The same dictionary object, after the in-place transformation.

    Raises:
        KeyError: If 'dataset', 'i' or 'date' is missing.
        IndexError: If an entry contains fewer than two ':' separators.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    synsets, terms, definitions = [], [], []

    for entry in dictionary['dataset']:
        # Split on the first two ':' only, so that a definition which itself
        # contains ':' is kept whole instead of being cut at its first colon.
        parts = entry.split(':', 2)

        # Fixed-width trim: removes the first 8 and the last 2 characters,
        # e.g. "Synset('war.n.01')" -> "war.n.01".
        synsets.append(parts[0][8:-2])

        # Drops the trailing 12 characters of the term part (presumably the
        # label that introduces the definition — confirm against the JSON
        # files), then removes commas and tokenizes the remaining terms.
        terms.append(word_tokenize(parts[1][:-12].strip().replace(',', '')))

        # Remove punctuation first, then tokenize and discard stop words.
        # NOTE(review): the stop-word comparison is case-sensitive.
        cleaned = parts[2].translate(strip_punctuation)
        definitions.append(
            [word for word in word_tokenize(cleaned) if word not in stop_words]
        )

    # These raw fields are not wanted as DataFrame columns.
    for key in ('i', 'date', 'dataset'):
        dictionary.pop(key)

    dictionary["synset"] = synsets
    dictionary["term"] = terms
    dictionary["definition"] = definitions

    return dictionary

def get_dataframe(folder_path, filename):
    """Return the cleaned content of one annotation file as a DataFrame.

    The file is loaded with ``get_json_dictionary``, transformed with
    ``filter_dictionary`` and the resulting dictionary is handed to
    ``pd.DataFrame``.

    Args:
        folder_path: Directory that contains the file.
        filename: Name of the JSON file inside ``folder_path``.

    Returns:
        A ``pandas.DataFrame`` built from the filtered dictionary.
    """
    raw_content = get_json_dictionary(folder_path, filename)
    cleaned_content = filter_dictionary(raw_content)
    return pd.DataFrame(cleaned_content)

def get_all_dataframes(folder_path):
    """Build one DataFrame for every '.json' file found in a folder.

    Args:
        folder_path: Directory to scan (not recursively).

    Returns:
        A list of DataFrames, one per '.json' file, ordered by file name.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    # os.listdir returns names in arbitrary order; sorting makes the order of
    # the resulting frames (and everything derived from "the first frame")
    # reproducible across runs and machines.
    return [
        get_dataframe(folder_path, filename)
        for filename in sorted(os.listdir(folder_path))
        if filename.endswith(".json")
    ]

#### Agreement tra annotatori
L'agreement viene calcolato per ogni synset: per ciascun annotatore recuperiamo l'etichetta che ha assegnato al synset, contiamo quante volte il synset è stato annotato come 'basic' e dividiamo per il numero totale di annotatori.  
Per esempio "war" è stato classificato da tutti gli annotatori come "basic" dunque l'agreement score sarà 10/10 = 1.

In [4]:
def _stack_column(dataframes, column):
    """Place the given column of every frame side by side as df1, df2, ..."""
    return pd.DataFrame(
        {f'df{i+1}': frame[column] for i, frame in enumerate(dataframes)}
    )


def compute_agreement_mean(folder_path):
    """Aggregate the annotation files of a folder into per-synset statistics.

    All frames are re-indexed to the column layout of the first one. For each
    row, the values of the same column across all frames are then combined:

    * 'answers': share of frames whose value equals 'basic';
    * 'isHard': share of frames whose value equals True;
    * 'timeDiffs': mean value, plus the same mean divided by the largest
      mean (the minimum is fixed at 0.0).

    Dictionaries are keyed by the 'synset' column of the first frame. A
    statistic whose column is absent is returned as an empty list/dict.

    Args:
        folder_path: Directory containing the '.json' annotation files.

    Returns:
        A 7-tuple ``(agreements_list, agreement_scores_dict, isHard_list,
        isHard_dict, timeDiffs_list, timeDiffs_dict, timeDiffs_dict_norm)``.

    Raises:
        IndexError: If the folder yields no DataFrame at all.
    """
    dataframes = get_all_dataframes(folder_path)

    # Align every frame on the column layout of the first one.
    reference_columns = dataframes[0].columns
    dataframes = [df.reindex(columns=reference_columns) for df in dataframes]

    # Every output starts empty so the return statement is always valid,
    # even when one of the expected columns is missing.
    agreements_list, isHard_list, timeDiffs_list = [], [], []
    agreement_scores_dict, isHard_dict = {}, {}
    timeDiffs_dict, timeDiffs_dict_norm = {}, {}

    # Each statistic is computed exactly once; the result does not depend on
    # which frame is being visited, so no loop over the frames is needed.
    if 'answers' in reference_columns:
        agreement = _stack_column(dataframes, 'answers').eq('basic').mean(axis=1)
        agreements_list = agreement.tolist()
        agreement_scores_dict = dict(zip(dataframes[0]['synset'], agreement))

    if 'isHard' in reference_columns:
        hard_share = _stack_column(dataframes, 'isHard').eq(True).mean(axis=1)
        isHard_list = hard_share.tolist()
        isHard_dict = dict(zip(dataframes[0]['synset'], hard_share))

    if 'timeDiffs' in reference_columns:
        mean_times = _stack_column(dataframes, 'timeDiffs').mean(axis=1)
        timeDiffs_list = mean_times.tolist()
        timeDiffs_dict = dict(zip(dataframes[0]['synset'], mean_times))

        # Min-max normalisation with the minimum pinned to 0.0, i.e. each
        # mean is divided by the largest mean (which must be non-zero).
        max_timeDiffs = max(mean_times)
        min_timeDiffs = 0.0
        timeDiffs_dict_norm = {
            synset: (value - min_timeDiffs) / (max_timeDiffs - min_timeDiffs)
            for synset, value in zip(dataframes[0]['synset'], mean_times)
        }

    return (
        agreements_list,
        agreement_scores_dict,
        isHard_list,
        isHard_dict,
        timeDiffs_list,
        timeDiffs_dict,
        timeDiffs_dict_norm,
    )


In [5]:
# Aggregate every annotation file found in FOLDER_PATH into per-synset
# statistics: agreement on 'basic', share of isHard answers, mean timeDiffs
# and its normalised form (each as a list and/or a synset-keyed dictionary).
AGREEMENT_SCORES, AGREEMENT_SCORES_DICT, isHard_list, isHard_dict, timeDiffs_list, timeDiffs_dict, timeDiffs_dict_norm = compute_agreement_mean(FOLDER_PATH)
print(AGREEMENT_SCORES_DICT)

{'war.n.01': 1.0, 'fiefdom.n.01': 0.0, 'bed.n.03': 1.0, 'return_on_invested_capital.n.01': 0.0, 'texture.n.02': 0.8, 'news.n.01': 1.0, 'look.n.02': 1.0, 'caddy.n.01': 0.2, 'weeder.n.01': 0.0, 'avenue.n.02': 0.4, 'adar.n.01': 0.0, 'bedtime.n.01': 0.8, 'inversion.n.08': 0.7, 'yak.n.01': 0.1, 'breath.n.05': 0.9, 'executive_clemency.n.01': 0.0, 'muse.n.02': 0.3, 'effect.n.06': 1.0, 'quickening.n.02': 0.1, 'sleeper.n.09': 0.5, 'caravanning.n.01': 0.0, 'jotter.n.01': 0.0, 'armageddon.n.02': 0.2, 'compass_point.n.01': 0.1, 'blackwater_fever.n.01': 0.0, 'respect.n.03': 1.0, 'position.n.06': 1.0, 'message.n.02': 1.0, 'arrest.n.02': 0.8, 'motivation.n.01': 0.9, 'day.n.04': 1.0, 'nose_cone.n.01': 0.0, 'discussion.n.02': 0.9, 'glow.n.05': 0.5, 'alcalde.n.01': 0.0, 'draft_board.n.01': 0.2, 'multitude.n.03': 0.4, 'hour.n.02': 1.0, 'book.n.02': 1.0, 'degree.n.02': 1.0, 'show-stopper.n.01': 0.0, 'military_position.n.01': 0.2, 'top.n.09': 1.0, 'pillar_of_islam.n.01': 0.1, 'power.n.05': 1.0, 'dail_eirea

In [6]:
# Names of the synsets that have an agreement score, i.e. the keys of
# AGREEMENT_SCORES_DICT in insertion order.
SYNSET_WITH_AGREEMENT_LIST = list(AGREEMENT_SCORES_DICT.keys())
print(SYNSET_WITH_AGREEMENT_LIST)

['war.n.01', 'fiefdom.n.01', 'bed.n.03', 'return_on_invested_capital.n.01', 'texture.n.02', 'news.n.01', 'look.n.02', 'caddy.n.01', 'weeder.n.01', 'avenue.n.02', 'adar.n.01', 'bedtime.n.01', 'inversion.n.08', 'yak.n.01', 'breath.n.05', 'executive_clemency.n.01', 'muse.n.02', 'effect.n.06', 'quickening.n.02', 'sleeper.n.09', 'caravanning.n.01', 'jotter.n.01', 'armageddon.n.02', 'compass_point.n.01', 'blackwater_fever.n.01', 'respect.n.03', 'position.n.06', 'message.n.02', 'arrest.n.02', 'motivation.n.01', 'day.n.04', 'nose_cone.n.01', 'discussion.n.02', 'glow.n.05', 'alcalde.n.01', 'draft_board.n.01', 'multitude.n.03', 'hour.n.02', 'book.n.02', 'degree.n.02', 'show-stopper.n.01', 'military_position.n.01', 'top.n.09', 'pillar_of_islam.n.01', 'power.n.05', 'dail_eireann.n.01', 'record.n.05', 'pace.n.03', 'american_labor_party.n.01', 'nook_and_cranny.n.01', "lady's_smock.n.01", 'obstacle_race.n.01', 'clasp.n.02', 'stuff.n.07', 'succubus.n.01', 'form.n.03', 'quantity.n.02', 'commercial_ente

## Calcolo agreement dei synset

In [7]:
def compute_synsets_agreement():
    """Attach the aggregated annotation statistics to every WordNet synset.

    Iterates over all synsets of each part of speech in ``SUPPORTED_POS``,
    skipping those listed in ``SYNSET_DA_EVITARE`` and those whose POS is
    's'. Synsets whose name appears in ``SYNSET_WITH_AGREEMENT_LIST`` get
    their values from the module-level statistics dictionaries; all the
    others get ``None`` for every statistic.

    Returns:
        A dict mapping each visited synset object to the tuple
        ``(agreement_score, ishard, timeDiffs, timeDiffs_norm)``.

    Raises:
        KeyError: If a listed synset name is missing from one of the
            statistics dictionaries.
    """
    # Build the lookup set once: the membership test below runs for every
    # WordNet synset, and scanning a list each time is needlessly slow.
    annotated_names = set(SYNSET_WITH_AGREEMENT_LIST)
    not_annotated = (None, None, None, None)

    agreement_dict = {}

    # No StopIteration handling is needed: the for-loops consume the
    # iterators themselves and nothing in the body can raise it.
    for pos in SUPPORTED_POS:
        for synset in wordnet.all_synsets(pos):
            if synset in SYNSET_DA_EVITARE or synset.pos() == 's':
                continue

            synset_name = synset.name()
            if synset_name in annotated_names:
                agreement_dict[synset] = (
                    AGREEMENT_SCORES_DICT[synset_name],
                    isHard_dict[synset_name],
                    timeDiffs_dict[synset_name],
                    timeDiffs_dict_norm[synset_name],
                )
            else:
                agreement_dict[synset] = not_annotated

    return agreement_dict

In [8]:
# Map every visited WordNet synset to its tuple of aggregated statistics
# (all None for synsets without annotations).
agreement_dict = compute_synsets_agreement()

In [9]:
# Turn the synset -> (agreement, isHard, timeDiffs, normalised timeDiffs)
# mapping into one column per statistic; the 'Synset' column holds the
# string representation of each synset object.
data_df = {
    'Synset': list(map(str, agreement_dict)),
    'Agreement score': [score for score, _, _, _ in agreement_dict.values()],
    'isHard mean': [hard for _, hard, _, _ in agreement_dict.values()],
    'timeDiffs mean': [elapsed for _, _, elapsed, _ in agreement_dict.values()],
    'Normalized timeDiffs mean': [norm for _, _, _, norm in agreement_dict.values()],
}
df_agreement = pd.DataFrame(data_df)

## Agreement di un synset specifico

In [10]:
# Print the aggregated statistics of one specific synset.
synset = wordnet.synset('fiefdom.n.01')
synset_name = synset.name()

if synset_name in SYNSET_WITH_AGREEMENT_LIST:
    # An annotated synset can still be missing from agreement_dict (it may
    # have been skipped while the dictionary was built), so the lookup result
    # is checked before unpacking instead of indexing into a fallback string.
    synset_agreement_dict = agreement_dict.get(synset)
    if synset_agreement_dict is None:
        print('Synset non classificato')
    else:
        agreement, isHard_mean, timeDiffs_mean, timeDiffs_mean_norm = synset_agreement_dict
        print(f'Agreement score: {agreement}')
        print(f'isHard mean: {isHard_mean}')
        print(f'timeDiffs mean: {timeDiffs_mean}')
        print(f'timeDiffs mean normalized: {timeDiffs_mean_norm}')
else:
    print('Synset senza agreement')

Agreement score: 0.0
isHard mean: 0.0
timeDiffs mean: 2.8927
timeDiffs mean normalized: 0.0020679478168270198


## Salvataggio su file

In [11]:
# Persist the per-synset table as a CSV file (without the index column).
path_output = "../../features/df/"
# Create the destination folder when it is missing so that to_csv does not
# fail on a fresh checkout; an existing folder is left untouched.
os.makedirs(path_output, exist_ok=True)
df_file_output = os.path.join(path_output, 'df_agreement.csv')
df_agreement.to_csv(df_file_output, index=False)

## Risultati ottenuti

In [12]:
# Preview the first ten rows of the agreement table.
print(df_agreement.head(10))

                           Synset  Agreement score  isHard mean   
0           Synset('entity.n.01')              NaN          NaN  \
1  Synset('physical_entity.n.01')              NaN          NaN   
2      Synset('abstraction.n.06')              NaN          NaN   
3            Synset('thing.n.12')              NaN          NaN   
4           Synset('object.n.01')              NaN          NaN   
5            Synset('whole.n.02')              1.0          0.0   
6         Synset('congener.n.03')              NaN          NaN   
7     Synset('living_thing.n.01')              NaN          NaN   
8         Synset('organism.n.01')              NaN          NaN   
9          Synset('benthos.n.02')              NaN          NaN   

   timeDiffs mean  Normalized timeDiffs mean  
0             NaN                        NaN  
1             NaN                        NaN  
2             NaN                        NaN  
3             NaN                        NaN  
4             NaN          