# Utilities

In [1]:
import nltk
from nltk.corpus import wordnet
import pandas as pd

In [2]:
import sys
sys.path.insert(0, '../')
from commons_init import SYNSET_DA_EVITARE, SUPPORTED_POS

## Sinonimia
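I synset con un numero inferiore di sinonimi sono considerati più *basic* rispetto a quelli con un numero maggiore di sinonimi. Quindi un termine è più basic se ha un basso conteggio di sinonimi; poiché lo score normalizzato è invertito (`1 - valore normalizzato`), a un conteggio basso corrisponde uno score normalizzato vicino a 1.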
- Conteggio dei sinonimi di un synset.
- I sinonimi di un synset sono i lemmi che compongono quel synset.
- La feature della sinonimia è stata interpretata come outlier feature.

In [3]:
def get_min_max_synonymy_in_wn():
    """Return the smallest and largest lemma count over all WordNet synsets.

    Every synset yielded by ``wordnet.all_synsets()`` is visited once and
    the number of lemmas it holds is compared against the running bounds.

    Returns:
        A ``(min_synonymy, max_synonymy)`` tuple. If no synset is yielded
        the initial values ``(float('inf'), 0)`` are returned unchanged.
    """
    lowest, highest = float('inf'), 0
    for current in wordnet.all_synsets():
        lemma_total = len(current.lemmas())
        if lemma_total < lowest:
            lowest = lemma_total
        if lemma_total > highest:
            highest = lemma_total
    return lowest, highest

In [4]:
# Compute the WordNet-wide lemma-count bounds once; get_synset_synonymy reads
# these two module-level constants when it normalizes a synset's count.
MIN_SYNONYMY_COUNT, MAX_SYNONYMY_COUNT = get_min_max_synonymy_in_wn()
print('Valore minimo di sinonimia:', MIN_SYNONYMY_COUNT)
print('Valore massimo di sinonimia:', MAX_SYNONYMY_COUNT)

Valore minimo di sinonimia: 1
Valore massimo di sinonimia: 28


In [5]:
def get_synset_synonymy(synset):
    """Return the raw and the inverted, min-max-normalized synonymy of a synset.

    The raw synonymy is the number of lemmas in ``synset``. The normalized
    value rescales that count with the module-level ``MIN_SYNONYMY_COUNT``
    and ``MAX_SYNONYMY_COUNT`` bounds and then inverts it, so a synset with
    the minimum number of lemmas scores 1 and one with the maximum scores 0.

    Args:
        synset: An object exposing ``lemmas()``, e.g. an NLTK WordNet synset.

    Returns:
        A ``(synonymy_count, normalized_synonymy_count)`` tuple.

    Raises:
        ZeroDivisionError: If ``MAX_SYNONYMY_COUNT`` equals
            ``MIN_SYNONYMY_COUNT``.
    """
    # Only the number of lemmas matters, so count them directly instead of
    # materializing a list of lemma names first.
    synonymy_count = len(synset.lemmas())
    synonymy_range = MAX_SYNONYMY_COUNT - MIN_SYNONYMY_COUNT
    normalized_synonymy_count = 1 - (synonymy_count - MIN_SYNONYMY_COUNT) / synonymy_range
    return synonymy_count, normalized_synonymy_count

In [6]:
# Sanity check on a single synset: print its lemma count and the inverted,
# normalized score returned by get_synset_synonymy.
base_concept = wordnet.synset('dog.n.01')
synonymy_count, normalized_synonymy_count = get_synset_synonymy(base_concept)
print('synonymy_count:', synonymy_count)
print('normalized_synonymy_count:', normalized_synonymy_count)

synonymy_count: 3
normalized_synonymy_count: 0.9259259259259259


## Calcolo sinonimia dei synset

In [7]:
def compute_synsets_synonymy():
    """Compute the synonymy scores of every eligible WordNet synset.

    Iterates the synsets of each part of speech in ``SUPPORTED_POS``,
    skipping those listed in ``SYNSET_DA_EVITARE`` and those whose ``pos()``
    is ``'s'`` (WordNet's adjective-satellite tag).

    Returns:
        A dict mapping each retained synset to the
        ``(synonymy_count, normalized_synonymy_count)`` tuple produced by
        ``get_synset_synonymy``.
    """
    synset_synonymy_dict = {}

    for pos in SUPPORTED_POS:
        for synset in wordnet.all_synsets(pos):
            if synset in SYNSET_DA_EVITARE or synset.pos() == 's':
                continue

            # No try/except StopIteration here: nothing in the loop body
            # raises it, and catching it would only risk silently truncating
            # the results for the current part of speech.
            synset_synonymy_dict[synset] = get_synset_synonymy(synset)

    return synset_synonymy_dict

In [8]:
# Build the synset -> (synonymy count, normalized synonymy) mapping once;
# the cells below read it to build the DataFrame and to look up single synsets.
synset_synonymy_dict = compute_synsets_synonymy()

In [9]:
# Column-oriented view of synset_synonymy_dict: the synset rendered as text,
# then the two elements of its (count, normalized score) tuple.
data_df = {
    'Synset': list(map(str, synset_synonymy_dict)),
    'Synonymy': [count for count, _ in synset_synonymy_dict.values()],
    'Normalized Synonymy': [norm for _, norm in synset_synonymy_dict.values()],
}
df_synonymy = pd.DataFrame(data=data_df)

## Sinonimia di un synset specifico

In [10]:
# Look up the scores of one specific synset. A missing synset is reported
# explicitly: indexing a string fallback would print its first two
# characters as if they were the scores.
synset = wordnet.synset('animal.n.01')
synonymy_dict = synset_synonymy_dict.get(synset)
if synonymy_dict is None:
    print('Synset non classificato')
else:
    # The stored value is a (count, normalized score) tuple.
    synonymy, synonymy_norm = synonymy_dict
    print(synonymy)
    print(synonymy_norm)

6
0.8148148148148149


## Salvataggio su file

In [11]:
# Write the synonymy table to CSV under ../features/df/ (relative to the
# working directory); index=False leaves the DataFrame row index out of the file.
path_output = "../features/df/"
df_file_output = path_output + 'df_synonymy.csv'
df_synonymy.to_csv(df_file_output, index=False)

# Risultati ottenuti

In [12]:
# Preview the first ten rows of the synonymy table.
print(df_synonymy.head(10))

                           Synset  Synonymy  Normalized Synonymy
0           Synset('entity.n.01')         1             1.000000
1  Synset('physical_entity.n.01')         1             1.000000
2      Synset('abstraction.n.06')         2             0.962963
3            Synset('thing.n.12')         1             1.000000
4           Synset('object.n.01')         2             0.962963
5            Synset('whole.n.02')         2             0.962963
6         Synset('congener.n.03')         1             1.000000
7     Synset('living_thing.n.01')         2             0.962963
8         Synset('organism.n.01')         2             0.962963
9          Synset('benthos.n.02')         1             1.000000


## Primi 10 synset con valore di sinonimia DECRESCENTE (che hanno quindi PIÙ sinonimi)

In [13]:
# Rank synsets from most to fewest lemmas and show the top ten.
df_sorted = df_synonymy.sort_values('Synonymy', ascending=False)
print(df_sorted.head(10))

                           Synset  Synonymy  Normalized Synonymy
30587     Synset('buttocks.n.01')        28             0.000000
73749        Synset('batch.n.02')        27             0.037037
71515       Synset('boodle.n.01')        19             0.333333
73556        Synset('three.n.01')        18             0.370370
17485       Synset('doodad.n.01')        18             0.370370
32405       Synset('kernel.n.03')        16             0.444444
35776        Synset('adieu.n.01')        15             0.481481
59405  Synset('dostoyevsky.n.01')        15             0.481481
41894       Synset('bomber.n.03')        15             0.481481
52511          Synset('ace.n.03')        15             0.481481


## Primi 10 synset con valore di sinonimia CRESCENTE (che hanno quindi MENO sinonimi)

In [14]:
# Rank synsets from fewest to most lemmas (ascending is the sort_values
# default) and show the first ten.
df_sorted = df_synonymy.sort_values(by='Synonymy')
print(df_sorted.head(10))

                             Synset  Synonymy  Normalized Synonymy
0             Synset('entity.n.01')         1                  1.0
43703        Synset('trilogy.n.01')         1                  1.0
43704           Synset('room.n.04')         1                  1.0
43707      Synset('trip_wire.n.01')         1                  1.0
43708       Synset('trimurti.n.01')         1                  1.0
43710    Synset('triumvirate.n.01')         1                  1.0
43711         Synset('troika.n.03')         1                  1.0
43712        Synset('turnout.n.01')         1                  1.0
43718  Synset('quadrumvirate.n.01')         1                  1.0
43726   Synset('power_couple.n.01')         1                  1.0
