In [2]:
import nltk
from nltk.corpus import wordnet as wn
import json
import polars as pl

# Download WordNet data
nltk.download('wordnet')

import sys
sys.path.append('..')
from utils import utils as u

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\riccardo.gino\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\riccardo.gino\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\riccardo.gino\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def get_genus(sentence):
    """Extract genus candidates from a definition using shallow patterns.

    The definition is cut into fragments at every comma and full stop; each
    non-blank fragment is tokenised and lemmatised through the project
    helpers in ``u``, which are expected to yield ``(word, pos_tag)`` pairs
    (the code only reads index 0 and 1 of each token). The tags ``'n'`` and
    ``'a'`` are presumably the WordNet noun/adjective tags -- confirm
    against ``utils``.

    Patterns checked for every token, in this order:

    1. a noun immediately before "that" / "of" / "to";
    2. a noun immediately after an ``'a'``-tagged token;
    3. a noun right after "a" / "an" / "for" / "to", optionally with one
       ``'a'``-tagged token in between;
    4. the token following "it" + "is"/"'s" + "a"/"an"/"the"/"for".

    Args:
        sentence: the definition text.

    Returns:
        list[str]: candidate words in order of first discovery, each one
        listed at most once.
    """
    # Lexical cues used by the patterns above.
    subject_words = ("it",)
    copula_words = ("is", "'s")
    post_copula_words = ("a", "an", "the", "for")
    after_genus_words = ("that", "of", "to")
    before_genus_words = ("a", "an", "for", "to")

    genus = []

    def remember(candidate):
        # Single entry point for additions so that no pattern can introduce
        # a duplicate (duplicates would inflate frequency counts downstream).
        if candidate not in genus:
            genus.append(candidate)

    # Split on both ',' and '.', dropping every blank fragment (not only the
    # first one) so the tokenizer never receives an empty string.
    fragments = [
        piece
        for clause in sentence.split(',')
        for piece in clause.split('.')
        if piece.strip()
    ]

    for fragment in fragments:
        tokens = u.get_lemmatized_tokens_list_pos(u.tokenizer(fragment))
        size = len(tokens)

        for i in range(size):
            word = tokens[i][0].lower()
            tag = tokens[i][1]

            # Pattern 1: "<noun> that/of/to"
            if i > 0 and word in after_genus_words and tokens[i - 1][1] == 'n':
                remember(tokens[i - 1][0])

            # Pattern 2: "<a-tagged token> <noun>"
            if i + 1 < size and tag == 'a' and tokens[i + 1][1] == 'n':
                remember(tokens[i + 1][0])

            # Pattern 3: "a/an/for/to [<a-tagged token>] <noun>".
            # Each branch is bounded by the index it actually reads, so a
            # noun sitting at the very end of the fragment is still found.
            if i + 1 < size and word in before_genus_words:
                if tokens[i + 1][1] == 'n':
                    remember(tokens[i + 1][0])
                elif (i + 2 < size
                        and tokens[i + 1][1] == 'a'
                        and tokens[i + 2][1] == 'n'):
                    remember(tokens[i + 2][0])

            # Pattern 4: "it is a <genus>" (matched case-insensitively)
            if (i + 3 < size
                    and word in subject_words
                    and tokens[i + 1][0].lower() in copula_words
                    and tokens[i + 2][0].lower() in post_copula_words):
                remember(tokens[i + 3][0])
    return genus


In [4]:
# Quick manual check of get_genus on a single English definition.
get_genus("Auxiliary structure built on blank spaces of walls and used to access different areas of building.")

['structure', 'space', 'access', 'area']

### Creo la struttura dati

In [5]:
# Read the tab-separated definitions table.
df = pl.read_csv("../datasets/TLN-definitions-23.tsv", separator='\t')

# Map every column header to the list of values in that column, taken
# straight from the DataFrame instead of going through a JSON dump whose
# layout depends on the installed polars version.
# The column named '1' is skipped (presumably a row-index column -- confirm
# against the dataset).
data = {
    name: df[name].to_list()
    for name in df.columns
    if name != '1'
}
            

### Per ogni definizione, trovo il genus e poi prendo i 2 più frequenti

In [43]:
def get_more_frequent_genus(sentences):
    """Return the two genus candidates that occur most often.

    ``get_genus`` is run on every definition and each returned candidate is
    tallied. Candidates are then ranked by count, highest first; candidates
    with equal counts keep the order in which they were first seen.

    Args:
        sentences: iterable of definition strings.

    Returns:
        tuple: ``((genus, count), (genus, count))`` for the first and second
        ranked candidates.

    Raises:
        IndexError: if fewer than two distinct candidates were found.
    """
    counts = {}
    for definition in sentences:
        for candidate in get_genus(definition):
            counts[candidate] = counts.get(candidate, 0) + 1

    # list.sort is stable, so ties stay in insertion order.
    ranking = list(counts.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)

    first, second = ranking[0], ranking[1]
    return first, second
        

In [36]:
# Try the frequency ranking on the definitions stored under 'door'.
# NOTE(review): the saved output below is rendered as a list, while
# get_more_frequent_genus as currently defined returns a tuple; the cell
# needs re-running to refresh the stored output.
test = get_more_frequent_genus(data['door'])

# Bare expression so the notebook displays the value.
test

[('object', 11), ('access', 10)]

### Tra i synsets dei genus più frequenti, restituisco il più lontano dalla radice (che dovrebbe essere il più specifico)

In [39]:
def get_farther_from_root_genus(synset_g1, synset_g2):
    """Pick the sense list whose first sense lies farther from the noun root.

    Only the first synset of each list is measured; the distance is the
    shortest path to ``entity.n.01``.

    Args:
        synset_g1: list of WordNet synsets for the first genus.
        synset_g2: list of WordNet synsets for the second genus.

    Returns:
        The list (``synset_g1`` or ``synset_g2``, unchanged) whose first
        synset is farther from the root. On a tie ``synset_g2`` is returned.
        If one list is empty the other one is returned; if both are empty
        the (empty) ``synset_g2`` is returned.
    """
    # A word unknown to WordNet yields an empty list: nothing to measure,
    # so fall back to the other genus instead of failing on [0].
    if not synset_g1:
        return synset_g2
    if not synset_g2:
        return synset_g1

    root = wn.synset('entity.n.01')

    def distance_from_root(synsets):
        # shortest_path_distance returns None when no path exists (e.g. a
        # non-noun synset); rank such a sense below every reachable one.
        distance = synsets[0].shortest_path_distance(root)
        return -1 if distance is None else distance

    if distance_from_root(synset_g1) > distance_from_root(synset_g2):
        return synset_g1
    return synset_g2

In [40]:
# Look up the senses of the two genus words held in `test` (requested with
# the 'n' tag) and keep the list whose first sense is farther from the root.
get_farther_from_root_genus(wn.synsets(test[0][0], 'n'), wn.synsets(test[1][0], 'n'))

[Synset('entree.n.02'),
 Synset('access.n.02'),
 Synset('access.n.03'),
 Synset('access.n.04'),
 Synset('access.n.05'),
 Synset('access.n.06')]

### Per ogni senso nel synset, restituisco tutti gli iponimi fino ad arrivare alle foglie

In [50]:
def get_all_hyponyms(genus_synset):
    """Collect the noun synsets reachable from the given senses via hyponymy.

    Starting from every noun synset in ``genus_synset``, the hyponym links
    are followed depth-first down to the leaves. A synset that can be
    reached through more than one parent is visited and returned only once.

    Args:
        genus_synset: iterable of synsets (objects exposing ``name()`` and
            ``hyponyms()``); they must be hashable.

    Returns:
        list: the starting noun synsets plus all their transitive noun
        hyponyms, without duplicates, in visiting order.
    """
    def is_noun(synset):
        # Names look like "<lemma>.<pos>.<nn>"; split from the right so a
        # lemma that itself contains dots cannot shift the POS field.
        return synset.name().rsplit('.', 2)[1] == 'n'

    seen = set()      # everything ever pushed, for O(1) duplicate checks
    pending = []      # stack of synsets still to expand

    for sense in genus_synset:
        if is_noun(sense) and sense not in seen:
            seen.add(sense)
            pending.append(sense)

    collected = []
    while pending:
        current = pending.pop()
        collected.append(current)

        for child in current.hyponyms():
            if child not in seen and is_noun(child):
                seen.add(child)
                pending.append(child)

    return collected

### Percorro ogni ramo originato dal nodo del genus fino ad arrivare alle foglie, e per ogni nodo calcolo l'overlap lessicale tra la definizione del nodo e tutte quelle a mia disposizione, memorizzando quelle con overlap maggiore

In [53]:
# For every word in the dataset: take its two most frequent genus
# candidates, keep the one whose first sense is farther from the root,
# expand it to all of its hyponyms and print how many were collected.
for word, definitions in data.items():
    genus_1, genus_2 = get_more_frequent_genus(definitions)

    # Lemmatise each genus word (with the 'n' tag) and fetch its senses.
    senses_1 = wn.synsets(u.lemmatize_word(genus_1[0], 'n'))
    senses_2 = wn.synsets(u.lemmatize_word(genus_2[0], 'n'))

    target_genus_synset = get_farther_from_root_genus(senses_1, senses_2)
    target_hyponyms_set = get_all_hyponyms(target_genus_synset)

    print(len(target_hyponyms_set))

28
456
116
64


In [11]:
def get_target_term(sentence):
    """Suggest target terms for a definition from its genus hyponyms.

    Steps:
      1. extract the genus candidates of ``sentence`` with ``get_genus``;
      2. gather the direct hyponyms of every sense of every candidate;
      3. count the ``'n'``-tagged tokens found in each hyponym's examples
         and definition (texts of length <= 1 are ignored);
      4. keep the tokens counted more than once, then among all tokens keep
         those whose count is above the mean of that subset.

    Args:
        sentence: the definition text.

    Returns:
        dict[str, int] | None: token -> count, ordered by decreasing count,
        or ``None`` when no token was counted or none occurs more than once.
    """
    # Direct hyponyms of every sense of every genus candidate.
    hyponyms = [
        hyponym
        for candidate in get_genus(sentence)
        for sense in wn.synsets(u.lemmatize_word(candidate, 'n'))
        for hyponym in sense.hyponyms()
    ]

    # Tally the 'n'-tagged tokens across examples and definitions.
    noun_counts = {}
    for hyponym in hyponyms:
        texts = hyponym.examples() + [hyponym.definition()]
        for text in texts:
            if len(text) <= 1:
                continue
            for token in u.noise_reduction_en_pos(text):
                if token[1] != 'n':
                    continue
                noun_counts[token[0]] = noun_counts.get(token[0], 0) + 1

    if not noun_counts:
        return None

    ranked = sorted(noun_counts.items(), key=lambda item: item[1], reverse=True)

    # The mean is taken over the tokens seen at least twice.
    repeated = {noun: count for noun, count in ranked if count > 1}
    if not repeated:
        return None
    mean_count = sum(repeated.values()) / len(repeated)

    return {noun: count for noun, count in ranked if count > mean_count}


In [12]:
# Quick manual check of get_target_term on a single English definition.
get_target_term("A construction used to divide two rooms, temporarily closing the passage between them")

{'structure': 58,
 'building': 11,
 'construction': 11,
 'wall': 5,
 'ship': 5,
 'room': 5,
 'part': 5,
 'people': 5}

In [13]:
# For each word: the fraction of its definitions for which the word itself
# shows up among the target terms returned by get_target_term.
final_results = {}

for word, sentences in data.items():
    hits = 0
    for sentence in sentences:
        target_terms = get_target_term(sentence)
        if target_terms is not None and word in target_terms:
            hits += 1
    final_results[word] = hits / len(sentences)

# Bare expression so the notebook displays the scores.
final_results


{'door': 0.23333333333333334,
 'ladybug': 0.0,
 'pain': 0.06666666666666667,
 'blurriness': 0.0}