### Imports


In [296]:
import pandas as pd
import sys
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk import pos_tag
sys.path.append('..')
nltk.download('stopwords')
from utils import utils as u
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from collections import Counter
from prettytable import PrettyTable
from colorama import Fore, Style
from nltk.wsd import lesk
nltk.download('wordnet')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\r.borra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\r.borra\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\r.borra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\r.borra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Creo la struttura dati

La struttura dati utilizzata avrà la seguente forma: 
$$data[word] = [definition_1, definition_2, \dots, definition_n]$$

In [297]:
# Read the tab-separated definitions file into a DataFrame.
file_path = '../datasets/TLN-definitions-23.tsv'
df = pd.read_csv(file_path, sep='\t')

# data[column_name] -> list of the values found in that column,
# e.g. data['ladybug'] holds every value of the 'ladybug' column.
data = {}
definitions = []
for col in df.columns:
    definitions = []
    # The column headed '1' is left out
    # (presumably a row-index column -- TODO confirm against the dataset).
    if col == '1':
        continue
    definitions.extend(df[col])
    data[col] = definitions

## Tokenizzazione e pulizia delle definizioni

In [298]:

def cleaning_definition_token(data):
    """Reduce every label's definitions to one flat list of noun tokens.

    Each definition is tokenized with ``word_tokenize``; tokens are kept
    (lower-cased) only when they are purely alphabetic and not in the English
    stop-word list.  The surviving tokens are POS-tagged with ``pos_tag`` and
    only those whose tag is exactly ``'NN'`` are retained (other tags such as
    ``'NNS'`` are therefore dropped).

    Args:
        data: mapping ``label -> iterable of definition strings``.

    Returns:
        dict mapping each label to the list of retained tokens from all of its
        definitions, concatenated in definition order (duplicates kept).
    """
    english_stopwords = set(stopwords.words('english'))
    nouns_by_label = {}

    for label, label_definitions in data.items():
        collected = []
        for text in label_definitions:
            kept = [
                tok.lower()
                for tok in word_tokenize(text)
                if tok.isalpha() and tok.lower() not in english_stopwords
            ]
            collected.extend(word for word, tag in pos_tag(kept) if tag == 'NN')
        nouns_by_label[label] = collected

    return nouns_by_label

## Tokenizzazione e pulizia delle definizioni degli iponimi

In [299]:

def cleaning_definition_token_hypo(data):
    """Reduce each hyponym definition to its noun tokens, keeping the synset.

    Applies the same cleaning as ``cleaning_definition_token`` (alphabetic,
    lower-cased, non-stop-word tokens whose POS tag is exactly ``'NN'``), but
    works on ``(synset, definition)`` pairs and keeps one token list per pair
    instead of concatenating them.

    Args:
        data: mapping ``label -> iterable of (synset, definition_text)``.

    Returns:
        dict mapping each label to a list of ``(synset, [noun, ...])`` tuples,
        in the same order as the input pairs.
    """
    english_stopwords = set(stopwords.words('english'))
    cleaned_by_label = {}

    for label, pairs in data.items():
        cleaned_pairs = []
        for entry in pairs:
            kept = [
                tok.lower()
                for tok in word_tokenize(entry[1])
                if tok.isalpha() and tok.lower() not in english_stopwords
            ]
            noun_tokens = [word for word, tag in pos_tag(kept) if tag == 'NN']
            cleaned_pairs.append((entry[0], noun_tokens))
        cleaned_by_label[label] = cleaned_pairs

    return cleaned_by_label

## Ricerca genus

In [300]:
def get_genus(words_for_every_label, k_genus):
    """Pick the ``k_genus`` most frequent tokens of every label.

    Args:
        words_for_every_label: mapping ``label -> list of tokens``.
        k_genus: how many of the most common tokens to keep per label
            (passed straight to ``Counter.most_common``).

    Returns:
        dict mapping each label to its most common tokens, most frequent
        first.  A label for which no token is selected (empty token list, or
        ``k_genus`` of 0) gets NO entry in the result.
    """
    selected = {}
    for label, tokens in words_for_every_label.items():
        ranked = Counter(tokens).most_common(k_genus)
        # Labels without any ranked token are deliberately left out of the
        # result rather than mapped to an empty list.
        if ranked:
            selected[label] = [token for token, _count in ranked]
    return selected




## Ricerca dei genus, mantenendo la frequenza

In [301]:
def get_genus_with_score(words_for_every_label, k_genus):
    """Pick the ``k_genus`` most frequent tokens of every label with counts.

    Args:
        words_for_every_label: mapping ``label -> list of tokens``.
        k_genus: how many of the most common tokens to keep per label
            (passed straight to ``Counter.most_common``).

    Returns:
        dict mapping each label to a list of ``(token, frequency)`` tuples,
        most frequent first.  A label for which no token is selected gets NO
        entry in the result.
    """
    selected = {}
    for label, tokens in words_for_every_label.items():
        ranked = Counter(tokens).most_common(k_genus)
        # Labels without any ranked token are left out of the result.
        if ranked:
            selected[label] = [(token, count) for token, count in ranked]
    return selected




## Algoritmo di Lesk per la WSD dei genus trovati (metodo 1)

In [302]:
def lesk_for_disambiguation(genus_for_every_label, definition_clean):
    """Disambiguate every genus word of a label with the Lesk algorithm.

    The context handed to ``lesk`` is the label's own cleaned token list.

    Args:
        genus_for_every_label: mapping ``label -> list of genus words``.
        definition_clean: mapping ``label -> list of context tokens``.

    Returns:
        dict mapping each label of ``genus_for_every_label`` to a list of
        ``(genus_word, synset)`` tuples.  Genus words for which ``lesk``
        returns ``None`` are skipped; a label missing from
        ``definition_clean`` maps to an empty list.
    """
    genus_best_sense = {}
    for label, genus_words in genus_for_every_label.items():
        best_senses = []
        # Direct lookup instead of scanning every key of definition_clean.
        if label in definition_clean:
            context = definition_clean[label]
            for gen in genus_words:
                # NOTE(review): lesk is called without a pos argument, so a
                # non-noun sense may be selected -- confirm this is intended.
                sense = lesk(context, gen)  # computed once, then reused
                if sense is not None:
                    best_senses.append((gen, sense))
        genus_best_sense[label] = best_senses
    return genus_best_sense

## Algoritmo di Lesk per la WSD dei genus trovati (metodo 2)

In [303]:
def lesk_for_disambiguation_metod2(genus_for_every_label, definition_clean):
    """Lesk disambiguation for genus entries given as tuples (method 2).

    Works like ``lesk_for_disambiguation`` but each genus entry is a sequence
    whose first element is the genus word; the remaining elements are not
    read here.

    Args:
        genus_for_every_label: mapping ``label -> list of entries`` where
            ``entry[0]`` is the genus word.
        definition_clean: mapping ``label -> list of context tokens``.

    Returns:
        dict mapping each label of ``genus_for_every_label`` to a list of
        ``(genus_word, synset)`` tuples.  Words for which ``lesk`` returns
        ``None`` are skipped; a label missing from ``definition_clean`` maps
        to an empty list.
    """
    genus_best_sense = {}
    for label, genus_entries in genus_for_every_label.items():
        best_senses = []
        # Direct lookup instead of scanning every key of definition_clean.
        if label in definition_clean:
            context = definition_clean[label]
            for gen in genus_entries:
                word = gen[0]
                sense = lesk(context, word)  # computed once, then reused
                if sense is not None:
                    best_senses.append((word, sense))
        genus_best_sense[label] = best_senses
    return genus_best_sense

## Ricerca dei genus (metodo 2)

In [304]:
def get_synset_genus_metodo2(genus_for_every_label):
    """Attach to every genus word the result of ``wn.synsets`` for it.

    Args:
        genus_for_every_label: mapping ``label -> list of entries`` where
            ``entry[0]`` is the genus word (the rest of the entry is ignored).

    Returns:
        dict mapping each label to a list of
        ``(genus_word, wn.synsets(genus_word))`` tuples, in input order.
    """
    return {
        label: [(entry[0], wn.synsets(entry[0])) for entry in entries]
        for label, entries in genus_for_every_label.items()
    }

## Ricerca degli iponimi fino alle foglie dell'albero di wordnet

In [305]:
def get_all_hyponyms(genus_synset):
    """Collect the noun synsets reachable from the seeds through hyponym links.

    Performs an iterative depth-first walk: every seed whose name has ``'n'``
    as its second dot-separated field is visited, then its ``hyponyms()``,
    and so on until no unvisited hyponym is left.

    Each synset is returned at most once.  Membership is tracked in a visited
    set rather than by looking only at the pending stack, so a synset that is
    reachable through more than one path is neither expanded nor reported
    twice.

    Args:
        genus_synset: iterable of synset-like objects exposing ``name()``
            (``'lemma.pos.nn'``) and ``hyponyms()``; they must be hashable.

    Returns:
        list of the visited synsets (seeds included) in visiting order.
    """
    def is_noun(synset):
        return synset.name().split('.')[1] == 'n'

    visited = set()
    stack = []
    for seed in genus_synset:
        if is_noun(seed) and seed not in visited:
            visited.add(seed)
            stack.append(seed)

    collected = []
    while stack:
        current = stack.pop()
        collected.append(current)
        for child in current.hyponyms():
            # Mark at push time so a node is never queued twice.
            if child not in visited and is_noun(child):
                visited.add(child)
                stack.append(child)

    return collected

## Ricerca iponimi

In [306]:
def get_hypo(synsets_genus):
    """Gather the direct hyponyms of every disambiguated genus of a label.

    Only one level is followed: ``hyponyms()`` is called once on each genus
    sense and the results are concatenated.

    Args:
        synsets_genus: mapping ``label -> list of (genus_word, synset)``.

    Returns:
        dict mapping each label to the flat list of hyponym synsets of all
        its genus senses, in input order (duplicates are not removed).
    """
    hyponyms_genus = {}
    for label, genus_senses in synsets_genus.items():
        direct = []
        for pair in genus_senses:
            direct.extend(pair[1].hyponyms())
        hyponyms_genus[label] = direct
    return hyponyms_genus

## Ricerca delle definizioni degli iponimi

In [307]:
def get_definitions_hypo(synsets_hypo):
    """Pair every hyponym synset with its definition text.

    Args:
        synsets_hypo: mapping ``label -> list of synsets`` (objects exposing
            ``definition()``).

    Returns:
        dict mapping each label to a list of ``(synset, definition)`` tuples;
        synsets whose ``definition()`` is ``None`` are left out.
    """
    return {
        label: [
            (synset, synset.definition())
            for synset in synsets
            if synset.definition() is not None
        ]
        for label, synsets in synsets_hypo.items()
    }
        

## Intersezione tra definizioni target e definizioni degli iponimi

In [308]:
def token_intersection_definitions(def_hypo, def_targets):
    """Score each hyponym by the token overlap with the label's definitions.

    The score of a hyponym is the number of DISTINCT tokens its cleaned
    definition shares with the label's cleaned target tokens.

    Args:
        def_hypo: mapping ``label -> list of (synset, [token, ...])``.
        def_targets: mapping ``label -> list of tokens`` of the target word.

    Returns:
        dict mapping each label to a list of ``(score, synset)`` tuples, in
        the order of ``def_hypo[label]``.

    Raises:
        KeyError: if a label with at least one hyponym entry is missing from
            ``def_targets``.
    """
    synset_score = {}
    for label, hypo_entries in def_hypo.items():
        scores = []
        if hypo_entries:
            # Build the target set once per label instead of once per hyponym.
            target_tokens = set(def_targets[label])
            scores = [
                (len(target_tokens.intersection(entry[1])), entry[0])
                for entry in hypo_entries
            ]
        synset_score[label] = scores
    return synset_score
        
        
    

## Ricerca del best score

In [309]:
def search_best_score(hypo_score):
    """Keep, for every label, the synsets that reach the highest score.

    Args:
        hypo_score: mapping ``label -> list of (score, synset)``.

    Returns:
        dict mapping each label to a list of ``(synset, score)`` tuples (note
        the swapped order) containing every entry tied for the maximum score,
        in their original order.  Entries with a negative score never
        qualify, so a label with no non-negative score maps to ``[]``.
    """
    best_synsets_score = {}
    for label, scored in hypo_score.items():
        # The running maximum starts at 0, hence negative scores are ignored.
        eligible = [pair for pair in scored if pair[0] >= 0]
        top = max((pair[0] for pair in eligible), default=None)
        best_synsets_score[label] = [
            (pair[1], pair[0]) for pair in eligible if pair[0] == top
        ]
    return best_synsets_score
    

## Conteggio dei genus all'interno delle definizioni

In [310]:
def count_of_genus_in_hypo_definition(genus_with_score, hypo_defs):
    """Score every hyponym by the genus words found in its definition.

    For each genus word that occurs in a hyponym's definition, the hyponym
    gains the genus frequency plus the number of occurrences; genus words
    that do not occur contribute nothing.

    Args:
        genus_with_score: mapping ``label -> list of (genus_word, frequency)``.
        hypo_defs: mapping ``label -> list of (synset, definition)`` where
            ``definition`` supports ``.count(genus_word)``.

    Returns:
        dict mapping each label of ``genus_with_score`` to a list of
        ``(score, synset)`` tuples, in the order of ``hypo_defs[label]``.

    Raises:
        KeyError: if a label of ``genus_with_score`` is missing from
            ``hypo_defs``.
    """
    genus_score_in_def = {}
    for label, weighted_genus in genus_with_score.items():
        scored = []
        for entry in hypo_defs[label]:
            total = 0
            for genus in weighted_genus:
                occurrences = entry[1].count(genus[0])
                if occurrences:
                    total += genus[1] + occurrences
            scored.append((total, entry[0]))
        genus_score_in_def[label] = scored
    return genus_score_in_def
    
    

## Calcolo della distanza nell'albero di wordnet tra la parola target e la parola trovata dall'algoritmo

In [311]:
def similarity_synsets(s1, s2):
    """Return the path similarity of two synsets as a two-decimal string.

    Args:
        s1: synset-like object exposing ``path_similarity(other)``.
        s2: the synset to compare ``s1`` with.

    Returns:
        The similarity formatted with ``"{:.2f}"`` (e.g. ``"0.10"``), or the
        string ``"N/A"`` when ``path_similarity`` yields ``None`` -- formatting
        ``None`` with ``"{:.2f}"`` would otherwise raise ``TypeError``.
    """
    similarity_score = s1.path_similarity(s2)
    if similarity_score is None:
        return "N/A"
    return "{:.2f}".format(similarity_score)


In [312]:
def search_synset_target_words(data):
    """Map every label to the first element of ``wn.synsets(label)``.

    Args:
        data: mapping whose keys are the target words; values are not read.

    Returns:
        dict ``label -> wn.synsets(label)[0]``.

    Note:
        The ``[0]`` index fails if ``wn.synsets(label)`` comes back empty
        (``IndexError`` for an empty list).
    """
    return {label: wn.synsets(label)[0] for label in data}

## Pulizia definizioni e concatenazione di tutti i nomi contenuti nell'insieme delle definizioni

In [313]:
# Per target word: the noun tokens of all its definitions, concatenated.
definition_clean = cleaning_definition_token(data)


## Numero di genus in input

In [314]:
# Number of most frequent tokens kept as genus candidates for each word;
# passed to get_genus / get_genus_with_score below.
k_genus = 10

## METODO 1

In [315]:

## METHOD 1
# Pipeline: top-k genus words per label -> Lesk sense for each genus ->
# direct hyponyms of those senses -> hyponym definitions reduced to noun
# tokens -> overlap score against the label's own noun tokens ->
# best-scoring synsets per label.
genus_for_every_label = get_genus(definition_clean,k_genus)
synsets_genus= lesk_for_disambiguation(genus_for_every_label, definition_clean)
hypo_for_every_genus = get_hypo(synsets_genus)
definitions_hypo = get_definitions_hypo(hypo_for_every_genus)
definitions_hypo_cleaning= cleaning_definition_token_hypo(definitions_hypo)
hypo_score = token_intersection_definitions(definitions_hypo_cleaning,definition_clean)
# Debug dump of every (score, synset) pair before the best ones are selected.
print(hypo_score)
targets_words = search_best_score(hypo_score)


{'door': [(1, Synset('anechoic_chamber.n.01')), (3, Synset('anteroom.n.01')), (1, Synset('back_room.n.01')), (1, Synset('ballroom.n.01')), (1, Synset('barroom.n.01')), (1, Synset('bathroom.n.01')), (1, Synset('bedroom.n.01')), (1, Synset('belfry.n.02')), (1, Synset('billiard_room.n.01')), (1, Synset('boardroom.n.01')), (1, Synset('cardroom.n.01')), (1, Synset('cell.n.06')), (1, Synset('cell.n.07')), (1, Synset('chamber.n.03')), (1, Synset('checkroom.n.01')), (2, Synset('classroom.n.01')), (1, Synset('clean_room.n.01')), (1, Synset('cloakroom.n.02')), (1, Synset('closet.n.04')), (1, Synset('clubroom.n.01')), (3, Synset('compartment.n.02')), (1, Synset('conference_room.n.01')), (1, Synset('control_room.n.01')), (1, Synset('court.n.02')), (1, Synset('cubby.n.01')), (1, Synset('cutting_room.n.01')), (1, Synset('darkroom.n.01')), (1, Synset('den.n.04')), (1, Synset('dinette.n.01')), (1, Synset('dining_room.n.01')), (2, Synset('door.n.05')), (1, Synset('dressing_room.n.01')), (1, Synset('dur

## METODO 2

In [316]:


## METHOD 2
# Pipeline: top-k genus words WITH their frequency -> Lesk sense for each
# genus -> direct hyponyms -> hyponym definitions reduced to noun tokens ->
# score = genus frequency + occurrences of the genus in the hyponym
# definition -> best-scoring synsets per label.
genus_for_every_label_with_Score = get_genus_with_score(definition_clean,k_genus)
# NOTE: this rebinds synsets_genus, replacing the value computed by method 1.
synsets_genus= get_synset_genus_metodo2(genus_for_every_label_with_Score)
# lesk_for_disambiguation_metod2 reads only element [0] (the word) of each
# entry, so the synset lists attached by the previous step are not consulted.
synsets_genus_disambiguated = lesk_for_disambiguation_metod2(synsets_genus, definition_clean)
hypo_for_every_genus_metodo2 = get_hypo(synsets_genus_disambiguated )
definitions_hypo_metodo2 = get_definitions_hypo(hypo_for_every_genus_metodo2)
definitions_hypo_cleaning_metodo2= cleaning_definition_token_hypo(definitions_hypo_metodo2)
count_of_genus = count_of_genus_in_hypo_definition(genus_for_every_label_with_Score,definitions_hypo_cleaning_metodo2)
targets_words_met2 = search_best_score(count_of_genus)



In [317]:
# Reference synset of each target word (first synset WordNet returns for the
# label); the results table compares the found synsets against it.
synsets_target_word = search_synset_target_words(data)

## STAMPA RISULTATI

In [318]:
# Results table: one row per target word with the first best synset found by
# each method and its path similarity to the word's reference synset.
table = PrettyTable()
table.field_names = [
    Fore.BLUE + "Parola",
    Fore.GREEN + "Metodo1" + Style.RESET_ALL,
    Fore.YELLOW + "Metodo2" + Style.RESET_ALL,
]
for word in targets_words:
    # Only the first of the (possibly tied) best synsets is shown per method.
    best_m1 = targets_words[word][0][0]
    cell_m1 = (
        Fore.GREEN
        + str(best_m1)
        + ": "
        + str(similarity_synsets(best_m1, synsets_target_word[word]))
        + Fore.YELLOW
    )
    best_m2 = targets_words_met2[word][0][0]
    cell_m2 = (
        str(best_m2)
        + ": "
        + str(similarity_synsets(best_m2, synsets_target_word[word]))
        + Style.RESET_ALL
    )
    table.add_row([Fore.BLUE + word, cell_m1, cell_m2])

print(table)


+------------+-------------------------------------+---------------------------------+
|   [34mParola   |               [32mMetodo1[0m               |             [33mMetodo2[0m             |
+------------+-------------------------------------+---------------------------------+
|    [34mdoor    |     [32mSynset('doorway.n.01'): 0.10[33m    |   Synset('doorway.n.01'): 0.10[0m  |
|  [34mladybug   | [32mSynset('reddish_orange.n.01'): 0.05[33m |  Synset('good_luck.n.02'): 0.07[0m |
|    [34mpain    |   [32mSynset('discomfort.n.02'): 0.08[33m   |    Synset('anger.n.01'): 0.08[0m   |
| [34mblurriness |    [32mSynset('likeness.n.02'): 0.07[33m    | Synset('reflection.n.05'): 0.07[0m |
+------------+-------------------------------------+---------------------------------+


## test lesk

In [319]:

# Manual check of Lesk on the "ladybug" genus candidates: for each genus,
# print its WordNet synsets, the context used, and the sense lesk selects.
for gen in genus_for_every_label["ladybug"]:
    print("GENUS ", gen)
    print("GENUS SYNSETS ",wn.synsets(gen))
    # Context for Lesk: the distinct noun tokens of the "ladybug" definitions.
    definition_ladybug = set(definition_clean["ladybug"])
    print("CONTESTO", definition_ladybug)

    # Word to disambiguate: the current genus candidate (not "ladybug" itself).
    ambiguous_word = gen


    # Apply the lesk function.
    meaning = lesk(definition_ladybug, ambiguous_word)

    # Print the sense that was selected.
    print("Significato identificato:", meaning)


GENUS  insect
GENUS SYNSETS  [Synset('insect.n.01'), Synset('worm.n.02')]
CONTESTO {'luck', 'pattern', 'spot', 'shape', 'insect', 'harmless', 'orange', 'bug', 'yellow', 'control', 'head', 'culture', 'family', 'coat', 'person', 'color', 'fly', 'insectivore', 'round'}
Significato identificato: Synset('worm.n.02')
GENUS  luck
GENUS SYNSETS  [Synset('fortune.n.04'), Synset('luck.n.02'), Synset('luck.n.03')]
CONTESTO {'luck', 'pattern', 'spot', 'shape', 'insect', 'harmless', 'orange', 'bug', 'yellow', 'control', 'head', 'culture', 'family', 'coat', 'person', 'color', 'fly', 'insectivore', 'round'}
Significato identificato: Synset('luck.n.03')
GENUS  color
GENUS SYNSETS  [Synset('color.n.01'), Synset('color.n.02'), Synset('color.n.03'), Synset('color.n.04'), Synset('semblance.n.01'), Synset('coloring_material.n.01'), Synset('color.n.07'), Synset('color.n.08'), Synset('color.v.01'), Synset('tinge.v.01'), Synset('color.v.03'), Synset('color.v.04'), Synset('color.v.05'), Synset('discolor.v.03')