### Imports


In [149]:
import pandas as pd

from utils import utils as u

from nltk import pos_tag

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from collections import Counter
import nltk
from nltk.wsd import lesk
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rossellaborra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Creo la struttura dati

La struttura dati utilizzata avrà la seguente forma: 
$$data[word] = [definition_i, definition_{i+1}, ..., definition_n]$$

In [None]:
file_path = '../datasets/TLN-definitions-23.tsv'
df = pd.read_csv(file_path, sep='\t')

data = {}
definitions = []
for col in df.columns:
    definitions = []
    if col != '1':
        data[col] = {} ## data[ladybug]
        for riga in df[col]:
            if col != '1':
                definitions.append(riga)
        data[col] = definitions

## Tokenizzazione e pulizia delle definizioni

In [201]:

def cleaning_definition_token(data):
    stop_words = set(stopwords.words('english'))

# Frase di esempio
    words= []
    words_for_every_label = {}

    for item,content in data.items():
        words= []
        for definition in content:
            tokens = word_tokenize(definition)
            words_clean = [token.lower() for token in tokens if token.isalpha()
                    and token.lower() not in stop_words]
            tagged_words = pos_tag(words_clean)
            nouns = [word for word, pos in tagged_words if pos == 'NN']
            for n in nouns:
                words.append(n)
        
        words_for_every_label[item] = words
    return words_for_every_label

In [315]:

def cleaning_definition_token_hypo(data):
    stop_words = set(stopwords.words('english'))

    # Frase di esempio
    words= []
    words_for_every_label = {}
    list_of_tuple=[]
    for label, content in data.items():
        list_of_tuple=[]
        for tupla in data[label]:

            tokens = word_tokenize(tupla[1])
            #print(tokens)
                
            words_clean = [token.lower() for token in tokens if token.isalpha()
                       and token.lower() not in stop_words]
            #print(words_clean)
            
            tagged_words = pos_tag(words_clean)
            #print(tagged_words)
            nouns = [word for word, pos in tagged_words if pos == 'NN']
        
            #print(nouns)
            list_of_tuple.append((tupla[0],nouns))
        words_for_every_label[label] = list_of_tuple
    return words_for_every_label

In [333]:
def get_genus(words_for_every_label):
# Dizionario per memorizzare i primi 5 token più frequenti per ogni label
    top_tokens_for_labels = {}
    genus = []
    for label, label_tokens in words_for_every_label.items():
        genus = []
    # Conta la frequenza di ciascun token per la label corrente
        token_counts = Counter(label_tokens)

    # Ottieni i primi 5 token più frequenti per la label corrente
        top_tokens = token_counts.most_common(100)

        for t in top_tokens:
            genus.append(t[0])
    # Aggiungi i risultati al dizionario top_tokens_for_labels
            top_tokens_for_labels[label] = genus
    #for l,c in top_tokens_for_labels.items():
        #print(l,"--->", top_tokens_for_labels[l])
    return top_tokens_for_labels




In [466]:
def get_genus_with_score(words_for_every_label):
    # Dizionario per memorizzare i primi 5 token più frequenti per ogni label
    top_tokens_for_labels = {}
    genus = []
    for label, label_tokens in words_for_every_label.items():
        genus = []
        # Conta la frequenza di ciascun token per la label corrente
        token_counts = Counter(label_tokens)
        # Ottieni i primi 5 token più frequenti per la label corrente
        top_tokens = token_counts.most_common(100)

        for t in top_tokens:
            genus.append((t[0],t[1]))
            # Aggiungi i risultati al dizionario top_tokens_for_labels
            top_tokens_for_labels[label] = genus
    #for l,c in top_tokens_for_labels.items():
    #print(l,"--->", top_tokens_for_labels[l])
    return top_tokens_for_labels




In [199]:
def lesk_for_disambiguation(genus_for_every_label, definition_clean):
    genus_best_sense = {}
    best_senses = []
    for label1,genus in genus_for_every_label.items():
        #print(label1,genus)
        best_senses = []
        for label2,tokens in definition_clean.items():
            #print(label2,tokens)
            if label1 == label2:
                #print("ok")
                
                for gen in genus_for_every_label[label1]:
                    if lesk(tokens,gen)  is not None:
                        best_senses.append((gen,lesk(tokens,gen)))
                    
                #print(best_sense)
        genus_best_sense[label1] = best_senses
    #for l,bs in genus_best_sense.items():
        #print(l, genus_best_sense[l])
    return genus_best_sense

In [359]:
def lesk_for_disambiguation_metod2(genus_for_every_label, definition_clean):
    genus_best_sense = {}
    best_senses = []
    for label1,genus in genus_for_every_label.items():
        #print(label1,genus)
        best_senses = []
        for label2,tokens in definition_clean.items():
            #print(label2,tokens)
            if label1 == label2:
                #print("ok")

                for gen in genus_for_every_label[label1]:
                    
                    if lesk(tokens,gen[0])  is not None:
                        
                        best_senses.append((gen[0],lesk(tokens,gen[0])))

                #print(best_sense)
        genus_best_sense[label1] = best_senses
    #for l,bs in genus_best_sense.items():
    #print(l, genus_best_sense[l])
    return genus_best_sense

In [373]:
def get_synset_genus_metodo2(genus_for_every_label):
    genus_sense = {}
    best_senses = []
    for label1,genus in genus_for_every_label.items():
        best_senses = []
        for gen in genus_for_every_label[label1]:
             best_senses.append((gen[0],wn.synsets(gen[0])))

                #print(best_sense)
        genus_sense[label1] = best_senses
    #for l,bs in genus_best_sense.items():
    #print(l, genus_best_sense[l])
    return genus_sense

In [198]:
def get_hypo(synsets_genus):
    hyponyms_genus={}
   
    for l,bestsense_genus in synsets_genus.items():
        hyponyms = []
        #print("LABEL --> ",l)
        for genus_bs in bestsense_genus:
            #print("GENUS -->",genus_bs[0]," ",genus_bs[1],type(genus_bs[1]))
            hyponyms_ = genus_bs[1].hyponyms()
            #print(hyponyms_)
            if hyponyms_ != []:
                for h in hyponyms_ :
                    hyponyms.append(h)
        #print(l,bestsense,hyponyms)
        hyponyms_genus[l]= hyponyms
    #for l,content in hyponyms_genus.items():
        #print(l, "---->",hyponyms_genus[l])
    return hyponyms_genus

In [374]:
def get_hypo_metodo2(synsets_genus):
    hyponyms_genus={}

    for l,bestsense_genus in synsets_genus.items():
        hyponyms = []
        #print("LABEL --> ",l)
        for genus_bs in bestsense_genus:
            #print("GENUS -->",genus_bs[0]," ",genus_bs[1],type(genus_bs[1]))
            syns_list_genus = genus_bs[1]
            for s in syns_list_genus:
                hyponyms_ = s.hyponyms()
            #print(hyponyms_)
                if hyponyms_ != []:
                    for h in hyponyms_ :
                        hyponyms.append(h)
        #print(l,bestsense,hyponyms)
        hyponyms_genus[l]= hyponyms
    #for l,content in hyponyms_genus.items():
    #print(l, "---->",hyponyms_genus[l])
    return hyponyms_genus

In [231]:
def get_definitions_hypo(synsets_hypo):
    hypo_definitions={}
 
    for label, content in synsets_hypo.items():
        definitions=[]
        for synset in synsets_hypo[label]:
            if synset.definition() is not None:
                #for d in synset.definition():
                definitions.append((synset,synset.definition()))
                
        hypo_definitions[label] = definitions

  
    #for l,content in hypo_definitions.items():
        #print("----------------------------------------")
        #print(l, "---->",hypo_definitions[l])
    return hypo_definitions
        

In [265]:
def token_intersection_definitions(def_hypo,def_targets):
    synset_score={}
    tupla_score=[]
    for label,content in def_hypo.items():
        tupla_score=[]
        for token_def in def_hypo[label]:
            interc = set(token_def[1]).intersection(set(def_targets[label]))
            score = len(interc) 
            tupla_score.append((score,token_def[0]))
        synset_score[label] = tupla_score
    return synset_score
        
        
    

In [329]:
def search_best_score(hypo_score):
    max_score = 1
    best_synsets_list = []
    best_synsets_score = {}
    for label, c in hypo_score.items():
        max_score = 0
        best_synsets_list = []
        for content in hypo_score[label]:
            if  content[0] >= max_score:
                if best_synsets_list != [] :
                    last_elem= best_synsets_list[-1][1]
                    if last_elem < content[0]:
                        best_synsets_list.clear()
                max_score = content[0]
                best_synsets_list.append((content[1],content[0]))
        best_synsets_score[label] =  best_synsets_list
    return best_synsets_score
    

In [439]:
def count_of_genus_in_hypo_definition(genus_with_score,hypo_defs):
    tupla3_to_append = []
    genus_score_in_def = {}
    count_score =0
    for label,c in genus_with_score.items():
        tupla3_to_append = []
        
        for synset_def in hypo_defs[label]:

            
            final_score= 0
            for genus in genus_with_score[label]:
                count_score =0
                #print("confronto")
                #print(synset_def[1])
                #print(genus[0])
                #print(genus[1])
                count_score = synset_def[1].count(genus[0]) 
                if count_score != 0:
                    final_score = final_score + genus[1] + count_score
            tupla3_to_append.append((final_score,synset_def[0]))
        genus_score_in_def[label] = tupla3_to_append
    return genus_score_in_def
    
    




In [468]:
## PULIZIA DEFINIZIONI (TOKENIZZAZIONE, RIMOZIONE STOP WORDS , ESTRAZIONE DI NOMI)
definition_clean = cleaning_definition_token(data)


##METODO 1

genus_for_every_label = get_genus(definition_clean)
synsets_genus= lesk_for_disambiguation(genus_for_every_label, definition_clean)
hypo_for_every_genus = get_hypo(synsets_genus)
definitions_hypo = get_definitions_hypo(hypo_for_every_genus)
definitions_hypo_cleaning= cleaning_definition_token_hypo(definitions_hypo)
hypo_score = token_intersection_definitions(definitions_hypo_cleaning,definition_clean)
targets_words = search_best_score(hypo_score)


##METODO 2
genus_for_every_label_with_Score = get_genus_with_score(definition_clean)
synsets_genus= get_synset_genus_metodo2(genus_for_every_label_with_Score)
synsets_genus_disambiguated = lesk_for_disambiguation_metod2(synsets_genus, definition_clean)
hypo_for_every_genus_metodo2 = get_hypo(synsets_genus_disambiguated )
definitions_hypo_metodo2 = get_definitions_hypo(hypo_for_every_genus_metodo2)
definitions_hypo_cleaning_metodo2= cleaning_definition_token_hypo(definitions_hypo_metodo2)
count_of_genus = count_of_genus_in_hypo_definition(genus_for_every_label_with_Score,definitions_hypo_cleaning_metodo2)
targets_words_met2 = search_best_score(count_of_genus)


   
        


In [469]:
from prettytable import PrettyTable
from colorama import Fore, Style

# Installa colorama se non l'hai già fatto
# pip install colorama
table = PrettyTable()
table.field_names = [Fore.BLUE +"Parola" , Fore.GREEN + "Metodo 1" + Style.RESET_ALL, Fore.YELLOW + "Metodo 2" + Style.RESET_ALL]
for i,c in targets_words.items():
# Crea una tabella PrettyTable
    
    
    table.add_row([Fore.BLUE + i ,Fore.GREEN + str(targets_words[i][0][0]) + " --> score" + Fore.YELLOW, str(targets_words_met2[i][0][0]) + " --> score"+ Style.RESET_ALL])
   # table.add_row(["Riga 2, Colonna 1", Fore.YELLOW + "Riga 2, Colonna 2" + Style.RESET_ALL])

# Stampa la tabella
print(table)


+------------+-----------------------------------------+-------------------------------------+
|   [34mParola   |                 [32mMetodo 1[0m                |               [33mMetodo 2[0m              |
+------------+-----------------------------------------+-------------------------------------+
|    [34mdoor    |     [32mSynset('doorway.n.01') --> score[33m    |   Synset('doorway.n.01') --> score[0m  |
|  [34mladybug   | [32mSynset('reddish_orange.n.01') --> score[33m |  Synset('good_luck.n.02') --> score[0m |
|    [34mpain    |      [32mSynset('glow.v.05') --> score[33m      |    Synset('glow.v.05') --> score[0m    |
| [34mblurriness |    [32mSynset('likeness.n.02') --> score[33m    | Synset('reflection.n.05') --> score[0m |
+------------+-----------------------------------------+-------------------------------------+
