### Imports


In [13]:
import nltk
from nltk.corpus import wordnet as wn
import json
import polars as pl

from collections import Counter

# Download WordNet data
nltk.download('wordnet')

import sys
sys.path.append('..')
from utils import utils as u

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\r.borra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Funzione per ricavare il Genus di una frase

In [14]:
def get_genus(sentence):
    """Extract candidate genus terms from a definition sentence.

    The sentence is split into clauses on commas and full stops. Each clause is
    tokenized with ``u.tokenizer`` and passed to
    ``u.get_lemmatized_tokens_list_pos``; every resulting item is read as
    ``item[0]`` (the token) and ``item[1]`` (its tag).
    NOTE(review): the tags 'n' and 'a' presumably mean noun and adjective --
    confirm against the ``utils`` module.

    A token becomes a candidate when it matches one of these patterns:
      * an 'n' token immediately followed by "that", "of" or "to";
      * an 'n' token immediately preceded by an 'a' token;
      * an 'n' token right after "a", "an", "for" or "to", optionally with one
        'a' token in between;
      * whatever token follows "it" + "is"/"'s" + "a"/"an"/"the"/"for".

    Args:
        sentence: the definition text (a string).

    Returns:
        list: the candidate tokens, each listed once, in order of discovery.
    """
    genus = []

    # Break the definition into clauses, dropping every empty fragment left by
    # trailing or consecutive separators.
    clauses = [
        fragment
        for comma_part in sentence.split(',')
        for fragment in comma_part.split('.')
        if fragment != ''
    ]

    # Genus identification patterns
    targets = {
        1: [["it"], ["is", "'s"], ["a", "an", "the", "for"]],
        2: [["that", "of", "to"]],
        3: [["a", "an", "for", "to"]]
    }

    for clause in clauses:
        tokenized_def = u.get_lemmatized_tokens_list_pos(u.tokenizer(clause))
        n_tokens = len(tokenized_def)

        for i in range(n_tokens):
            word = tokenized_def[i][0].lower()

            # "<noun> that/of/to ..." -> the noun before the marker
            if i > 0 and word in targets[2][0]:
                if tokenized_def[i-1][0] not in genus and tokenized_def[i-1][1] == 'n':
                    genus.append(tokenized_def[i-1][0])

            # "<adjective> <noun>" -> the noun
            if i < n_tokens - 1:
                if tokenized_def[i][1] == 'a' and tokenized_def[i+1][1] == 'n' and tokenized_def[i+1][0] not in genus:
                    genus.append(tokenized_def[i+1][0])

            # "a/an/for/to <noun>" or "a/an/for/to <adjective> <noun>"
            if i < n_tokens - 2 and word in targets[3][0]:
                if tokenized_def[i+1][0] not in genus and tokenized_def[i+1][1] == 'n':
                    genus.append(tokenized_def[i+1][0])

                # Same duplicate check as every other branch, so a genus is
                # never listed twice for one definition.
                elif (tokenized_def[i+1][1] == 'a' and tokenized_def[i+2][1] == 'n'
                        and tokenized_def[i+2][0] not in genus):
                    genus.append(tokenized_def[i+2][0])

            # "it is a/an/the/for <token>" -> the token after the article
            if i < n_tokens - 3:
                if word in targets[1][0] and (tokenized_def[i+1][0] in targets[1][1]) and (tokenized_def[i+2][0] in targets[1][2]):
                    if tokenized_def[i+3][0] not in genus:
                        genus.append(tokenized_def[i+3][0])
    return genus


### Creo la struttura dati

La struttura dati utilizzata avrà la seguente forma: 
$$data[word] = [definition_i, definition_{i+1}, ..., definition_n]$$

In [15]:
# Load the definitions table. Columns are read straight from the DataFrame
# instead of round-tripping through `write_json()`: that JSON layout is
# column-oriented ({"columns": [...]}) only in pre-1.0 polars and row-oriented
# in current releases, where `df['columns']` would fail.
df = pl.read_csv("../datasets/TLN-definitions-23.tsv", separator='\t')

# data[word] = list of the values found in that word's column.
# The column named '1' is skipped (presumably a row-index column -- confirm
# against the dataset).
data = {
    name: df[name].to_list()
    for name in df.columns
    if name != '1'
}
            

### Per ogni definizione, trovo il genus e poi prendo i 2 più frequenti

In [16]:
def get_more_frequent_genus(sentences):
    """Return the two genus candidates that occur most often across definitions.

    ``get_genus`` is run on every sentence and the occurrences of each returned
    candidate are counted. Ties keep the order in which candidates were first
    seen.

    Args:
        sentences: iterable of definition strings.

    Returns:
        tuple: two ``(genus, count)`` pairs, most frequent first. When only one
        distinct candidate exists it is returned in both positions, so callers
        can still unpack two values.

    Raises:
        IndexError: if no genus candidate is found in any sentence.
    """
    genus_counts = Counter()
    for sent in sentences:
        genus_counts.update(get_genus(sent))

    top_two = genus_counts.most_common(2)

    if not top_two:
        # Same exception type as the plain out-of-range access this replaces,
        # but with a message that explains the cause.
        raise IndexError("no genus candidate found in the given definitions")

    if len(top_two) == 1:
        return top_two[0], top_two[0]

    return top_two[0], top_two[1]
        

### Tra i synsets dei genus più frequenti, restituisco il più lontano dalla radice (che dovrebbe essere il più specifico)

In [17]:
def get_farther_from_root_genus(synset_g1, synset_g2, root=None):
    """Pick the synset list whose first synset lies farther from the root.

    Only the first synset of each list is measured, using
    ``shortest_path_distance(root)``.

    Args:
        synset_g1: list of synsets for the first genus candidate.
        synset_g2: list of synsets for the second genus candidate.
        root: synset distances are measured from. Defaults to
            ``wn.synset('entity.n.01')``.

    Returns:
        list: ``synset_g1`` if its first synset is strictly farther from
        ``root``, otherwise ``synset_g2``. If one list is empty the other is
        returned; if both are empty an empty list is returned.
    """
    # A genus with no synsets cannot be measured, so fall back to the other one.
    if not synset_g1:
        return synset_g2
    if not synset_g2:
        return synset_g1

    if root is None:
        root = wn.synset('entity.n.01')

    def _distance(synsets):
        # A None distance (no path to the root) is ranked below any real one.
        dist = synsets[0].shortest_path_distance(root)
        return -1 if dist is None else dist

    if _distance(synset_g1) > _distance(synset_g2):
        return synset_g1
    return synset_g2

### Per ogni senso nel synset del genus, restituisco tutti gli iponimi fino ad arrivare alle foglie

In [18]:
def get_all_hyponyms(genus_synset):
    """Collect the noun synsets of a genus together with all their hyponyms.

    Starting from every noun synset in ``genus_synset``, the hyponym relation
    is followed depth-first until no new synset is found.

    Args:
        genus_synset: iterable of synsets; entries whose ``pos()`` is not
            ``'n'`` are ignored.

    Returns:
        list: every reachable noun synset, the starting ones included, each
        listed exactly once.
    """
    # `visited` records everything ever scheduled. Checking only the pending
    # stack would re-add a synset reachable through several parents after it
    # had already been popped, producing duplicates in the result.
    visited = set()
    pending = []

    # pos() is used instead of parsing name(), so a lemma containing '.'
    # cannot be misread as a non-noun.
    for synset in genus_synset:
        if synset.pos() == 'n' and synset not in visited:
            visited.add(synset)
            pending.append(synset)

    collected = []
    while pending:
        current = pending.pop()
        collected.append(current)

        for hyponym in current.hyponyms():
            if hyponym.pos() == 'n' and hyponym not in visited:
                visited.add(hyponym)
                pending.append(hyponym)

    return collected

### Funzione per il calcolo dell'overlap tra due definizioni (quella dell'esercitazione 1)

In [19]:
def compute_similarity(def1, def2):
    """Compute the lexical overlap between two definitions.

    Both definitions are reduced to token lists with ``u.noise_reduction_en``.
    The score is the number of distinct shared tokens divided by the length of
    the shorter token list.

    Args:
        def1: first definition text.
        def2: second definition text.

    Returns:
        float: the overlap ratio, or ``0.0`` when either definition has no
        tokens left after noise reduction.
    """
    definition_1_tokens = u.noise_reduction_en(def1)
    definition_2_tokens = u.noise_reduction_en(def2)

    min_len = min(len(definition_1_tokens), len(definition_2_tokens))

    # An empty definition shares nothing; returning 0 avoids dividing by zero.
    if min_len == 0:
        return 0.0

    shared_tokens = set(definition_1_tokens) & set(definition_2_tokens)
    return len(shared_tokens) / min_len

### Percorro ogni ramo originato dal nodo del genus fino ad arrivare alle foglie, e per ogni nodo calcolo l'overlap lessicale tra la definizione del nodo e tutte quelle a mia disposizione, memorizzando quelle con overlap maggiore

In [20]:
# results[word][sense] -> list of best-matching collected definitions for that
# WordNet sense; only the senses reaching the highest overlap are kept.
results = {}
for word in data:
    # Choose the genus whose first synset is farther from the WordNet root.
    genus_1, genus_2 = get_more_frequent_genus(data[word])
    synsets_g1 = wn.synsets(u.lemmatize_word(genus_1[0], 'n'))
    synsets_g2 = wn.synsets(u.lemmatize_word(genus_2[0], 'n'))
    target_genus_synset = get_farther_from_root_genus(synsets_g1, synsets_g2)
    print(target_genus_synset)
    target_hyponyms_set = get_all_hyponyms(target_genus_synset)

    # Best overlap seen so far for this word (shared by all senses), and the
    # overlap of the entries currently stored in results[word].
    similarity = 0
    result_sim = 0
    results[word] = {}

    for sense in target_hyponyms_set:
        sense_definition = sense.definition()
        prov_list = []

        for definition in data[word]:
            temp_similarity = compute_similarity(sense_definition, definition)

            # Skip scores of exactly 0 or 1 and anything below the running best.
            if not (0 < temp_similarity < 1) or temp_similarity < similarity:
                continue

            # A strictly better score invalidates what this sense collected so far.
            if temp_similarity > similarity:
                prov_list.clear()
                similarity = temp_similarity

            prov_list.append({
                "definition": sense_definition,
                "definition_prof": definition,
                "similarity": temp_similarity,
            })

        if not prov_list:
            continue

        sense_best = prov_list[0]['similarity']
        if sense_best < result_sim:
            continue

        # A new maximum replaces every previously stored sense; a tie is added.
        if sense_best > result_sim:
            result_sim = sense_best
            results[word].clear()

        results[word][sense] = prov_list


[Synset('entree.n.02'), Synset('access.n.02'), Synset('access.n.03'), Synset('access.n.04'), Synset('access.n.05'), Synset('access.n.06'), Synset('access.v.01'), Synset('access.v.02')]
[Synset('insect.n.01'), Synset('worm.n.02')]
[Synset('sensation.n.01'), Synset('ace.n.03'), Synset('sensation.n.03'), Synset('sensation.n.04'), Synset('sense.n.03')]
[Synset('image.n.01'), Synset('persona.n.02'), Synset('picture.n.01'), Synset('prototype.n.01'), Synset('trope.n.01'), Synset('double.n.03'), Synset('image.n.07'), Synset('image.n.08'), Synset('effigy.n.01'), Synset('image.v.01'), Synset('visualize.v.01')]


### Stampa dei risultati

In [21]:
# Print, for every word, each retained sense with its WordNet definition and
# the collected definitions that matched it.
for word in results:
    print(word)
    print('----------------')

    for sense in results[word]:
        print(sense, '-', sense.definition())
        # Named `match` rather than `tuple`: a top-level loop variable stays in
        # the notebook namespace and would rebind the builtin for later cells.
        for match in results[word][sense]:
            print(match['similarity'], '/', match['definition_prof'])

    print()

door
----------------
Synset('access.n.03') - a way of entering or leaving
0.6666666666666666 / The way to let people enter or exit a room
Synset('gateway.n.01') - an entrance that can be closed by a gate
0.6666666666666666 / An object that opens or close the entrance of a room
Synset('doorway.n.01') - the entrance (the space in a wall) through which you enter or leave a room or building; the space that a door can close
0.6666666666666666 / A wall that can be opened and closed at will
0.6666666666666666 / entance  to a building or room
Synset('dutch_door.n.01') - an exterior door divided in two horizontally; either half can be closed or open independently
0.6666666666666666 / It's an opening, it can be opened or closed.
0.6666666666666666 / A wall that can be opened and closed at will

ladybug
----------------
Synset('two-spotted_ladybug.n.01') - red ladybug with a black spot on each wing
0.75 / A red insect with black spots

pain
----------------
Synset('sensitivity.n.01') - (physiolo

### Calcolo lo score dell'intersezione dei due insiemi di tokens

Utilizzo un dizionario della forma:
$$dict[token_i] = count(token_i) / count(all\_tokens)$$
per tenere traccia della frequenza del token in funzione del totale dei tokens, e poi sommo le frequenze dei token presenti nell'intersezione per ottenere uno score per l'intersezione.

In [22]:
def compute_intersection_score(definitions_union_tokens, definition_tokens, frequency_dict):
    """Score how much one definition overlaps with a pool of definitions.

    The score is the sum of ``frequency_dict[token]`` over the distinct tokens
    present in both inputs, divided by the length of
    ``definitions_union_tokens``.

    Args:
        definitions_union_tokens: tokens of all the pooled definitions
            (repetitions included; its length is the denominator).
        definition_tokens: tokens of the definition being scored.
        frequency_dict: mapping from token to its weight; it must contain
            every token shared by the two inputs.

    Returns:
        float: the overlap score, or ``0.0`` when the pool is empty.

    Raises:
        KeyError: if a shared token is missing from ``frequency_dict``.
    """
    # Nothing to overlap with; returning 0 avoids dividing by zero.
    if len(definitions_union_tokens) == 0:
        return 0.0

    shared_tokens = set(definitions_union_tokens) & set(definition_tokens)
    total_weight = sum(frequency_dict[token] for token in shared_tokens)

    return total_weight / len(definitions_union_tokens)


### Versione che utilizza tutte le definizioni a disposizione


In [23]:
# results[word] -> list of (sense, score) pairs sorted by decreasing score,
# where the score compares the sense definition with ALL collected definitions.
results = {}
for word in data:
    # Choose the genus whose first synset is farther from the WordNet root.
    genus_1, genus_2 = get_more_frequent_genus(data[word])
    synsets_g1 = wn.synsets(u.lemmatize_word(genus_1[0], 'n'))
    synsets_g2 = wn.synsets(u.lemmatize_word(genus_2[0], 'n'))
    target_genus_synset = get_farther_from_root_genus(synsets_g1, synsets_g2)
    target_hyponyms_set = get_all_hyponyms(target_genus_synset)

    # Pool the tokens of every collected definition and count them.
    tokenized_definition_union_list = []
    for definition in data[word]:
        tokenized_definition_union_list.extend(u.noise_reduction_en(definition))

    frequency_dict = dict(Counter(tokenized_definition_union_list))

    # Score each candidate sense against the pooled tokens.
    sense_scores = {
        sense: compute_intersection_score(
            tokenized_definition_union_list,
            u.noise_reduction_en(sense.definition()),
            frequency_dict,
        )
        for sense in target_hyponyms_set
    }

    results[word] = sorted(
        sense_scores.items(), key=lambda pair: pair[1], reverse=True)


### Stampa dei Risultati

In [24]:
# Print the ten best-scoring senses of every word with their definitions.
for word in results:
    print(word)
    print('----------------')
    # Slicing keeps at most ten entries and does not fail when a word has
    # fewer than ten scored senses.
    for ranked in results[word][:10]:
        print(ranked, '-', ranked[0].definition())

    print()


door
----------------
(Synset('doorway.n.01'), 0.1956521739130435) - the entrance (the space in a wall) through which you enter or leave a room or building; the space that a door can close
(Synset('dutch_door.n.01'), 0.10434782608695652) - an exterior door divided in two horizontally; either half can be closed or open independently
(Synset('exterior_door.n.01'), 0.08260869565217391) - a doorway that allows entrance to or exit from a building
(Synset('access.n.04'), 0.07391304347826087) - a code (a series of characters or digits) that must be entered in some way (typed or dialed or spoken) to get the use of something (a telephone line or a computer or a local area network etc.)
(Synset('back_door.n.01'), 0.06521739130434782) - a secret or underhand means of access (to a place or a position)
(Synset('entrance.n.01'), 0.06521739130434782) - something that provides access (to get in or get out)
(Synset('back_door.n.02'), 0.06086956521739131) - an undocumented way to get access to a compute