In [1]:
import nltk
from nltk.corpus import wordnet as wn
import json
import polars as pl

# Download WordNet data
nltk.download('wordnet')

import sys
sys.path.append('..')
from utils import utils as u

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\riccardo.gino\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\riccardo.gino\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\riccardo.gino\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def get_genus(sentence):
    # print(sentence)
    genus = []
    # split definition for , or .

    comma_split = sentence.split(',')

    splitted_sent = []

    for cs in comma_split:
        if cs.split('.') != '':
            splitted_sent += cs.split('.')

    if '' in splitted_sent:
        splitted_sent.remove('')

    targets = {
        1: [["it"], ["is", "'s"], ["a", "an", "the", "for"]],
        2: [["that", "of", "to"]],
        3: [["a", "an", "for", "to"]]
    }

    for sentence in splitted_sent:
        tokenized_def = u.get_lemmatized_tokens_list_pos(u.tokenizer(sentence))

        for i in range(0, len(tokenized_def)):
            # if tokenized_def[i][0] not in checked_tokens:
            # print(i, tokenized_def[i][0].lower(), targets[2][0])
            if i > 0 and tokenized_def[i][0].lower() in targets[2][0]:
                if tokenized_def[i-1][0] not in genus and tokenized_def[i-1][1] == 'n':
                    genus.append(tokenized_def[i-1][0])

            if (i < len(tokenized_def)-1):
                if tokenized_def[i][1] == 'a' and tokenized_def[i+1][1] == 'n' and tokenized_def[i+1][0] not in genus:
                    genus.append(tokenized_def[i+1][0])

            if i < len(tokenized_def)-2 and tokenized_def[i][0].lower() in targets[3][0]:
                if tokenized_def[i+1][0] not in genus and tokenized_def[i+1][1] == 'n':
                    genus.append(tokenized_def[i+1][0])

                elif tokenized_def[i+1][1] == 'a' and tokenized_def[i+2][1] == 'n':
                    genus.append(tokenized_def[i+2][0])

            if (i < len(tokenized_def)-3):
                if tokenized_def[i][0].lower() in targets[1][0] and (tokenized_def[i+1][0] in targets[1][1]) and (tokenized_def[i+2][0] in targets[1][2]):
                    if tokenized_def[i+3][0] not in genus:
                        genus.append(tokenized_def[i+3][0])

    # print(genus, differentia)
    return genus


In [3]:
def get_target_term(sentence):
    genus = get_genus(sentence)

    #print(genus)

    # prendo gli iponimi dei sensi del genus
    hy = []
    for g in genus:
        genus_synset = wn.synsets(u.lemmatize_word(g, 'n'))
        # print(genus_synset)
        for syns in genus_synset:
            for h in syns.hyponyms():
                hy.append(h)

    #print(hy)

    # vedo tra gli esempi quali sostantivi sono ricorrenti, e ne ricavo il synset
    results = {}
    for h in hy:
        # prendo gli esempi
        h_examples = h.examples() + [h.definition()]
        # print(h_examples)

        if len(h_examples) > 0:
            for ex in h_examples:
                if len(ex) > 1:
                    for token in u.noise_reduction_en_pos(ex):

                        if token[1] == 'n':
                            if token[0] not in results:
                                results[token[0]] = 1
                            else:
                                results[token[0]] += 1
    if len(results) == 0:
        return None

    
    
    #print(average_value)

    results = sorted(results.items(), key=lambda item: item[1], reverse=True)
    risultato = {chiave: valore for chiave,
                 valore in results if valore > 1}
    total_values = sum(risultato.values())
    if len(risultato) == 0:
        return None
    average_value = total_values/len(risultato)
    risultato = {chiave: valore for chiave,
                 valore in results if valore > average_value}

    return risultato


In [4]:
get_target_term("A construction used to divide two rooms, temporarily closing the passage between them")

{'structure': 58,
 'building': 11,
 'construction': 11,
 'wall': 5,
 'ship': 5,
 'room': 5,
 'part': 5,
 'people': 5}

In [5]:
def compute_similarity(definition_1_tokens, definition_2_tokens):

    min_len = 0

    if len(definition_1_tokens) > len(definition_2_tokens):
        min_len = len(definition_2_tokens)
    else:
        min_len = len(definition_1_tokens)

    return (len(set(definition_1_tokens).intersection(set(definition_2_tokens))) / min_len)

In [21]:
def get_target_term_v2(sentence):
    genus = get_genus(sentence)
    target_def = u.noise_reduction_en_pos(sentence)
    #print(target_def)

    # controllo tra tutti gli iponimi del genus quello con definizione più simile alla definizione data
    hy = []
    for g in genus:
        genus_synset = wn.synsets(u.lemmatize_word(g, 'n'))
        # print(genus_synset)
        for syns in genus_synset:
            for h in syns.hyponyms():
                hy.append(h)
                for hh in h.hyponyms():
                    hy.append(hh)
    target_term_dict = {}

    for h in hy:
        context = u.noise_reduction_en_pos(h.definition())
        for ex in h.examples():
            if len(ex) > 1:
                context += u.noise_reduction_en_pos(ex)
        print(context)
        score = compute_similarity(target_def, context)/len(context)
        target_term_dict[h] = score

    target_term_dict = sorted(target_term_dict.items(),
                              key=lambda item: item[1], reverse=True)

    risultato = {chiave: valore for chiave,
                 valore in target_term_dict if valore > 0}

    return risultato


In [22]:
get_target_term_v2("A rectangular object (usually made out of wood) use to lock access to a room or a house.")

[('anything', 'n'), ('catch', 'v'), ('especially', 'r'), ('worth', 'a'), ('catch', 'v'), ('share', 'v'), ('catch', 'n'), ('others', 'n')]
[('something', 'n'), ('believe', 'v'), ('bring', 'v'), ('good', 'a'), ('luck', 'n')]
[('trinket', 'n'), ('piece', 'n'), ('jewelry', 'n'), ('usually', 'r'), ('hang', 'v'), ('neck', 'n'), ('think', 'v'), ('magical', 'a'), ('protection', 'n'), ('evil', 'n'), ('disease', 'n')]
[('charm', 'n'), ('superstitiously', 'r'), ('believe', 'v'), ('embody', 'v'), ('magical', 'a'), ('power', 'n')]
[('object', 'n'), ('coin', 'n'), ('postage', 'n'), ('stamp', 'n'), ('make', 'v'), ('mark', 'v'), ('event', 'n'), ('honor', 'v'), ('person', 'n')]
[('something', 'n'), ('unusual', 'a'), ('perhaps', 'r'), ('worthy', 'unk'), ('collect', 'v')]
[('miscellaneous', 'a'), ('curio', 'n')]
[('thing', 'n'), ('consider', 'v'), ('worth', 'a'), ('collect', 'v'), ('necessarily', 'r'), ('valuable', 'a'), ('antique', 'n')]
[('outstanding', 'a'), ('item', 'n'), ('prize', 'n'), ('piece', 'n

{Synset('workroom.n.01'): 0.25,
 Synset('family_room.n.01'): 0.125,
 Synset('artifact.n.01'): 0.1111111111111111,
 Synset('grail.n.01'): 0.1111111111111111,
 Synset('prepositional_object.n.01'): 0.1111111111111111,
 Synset('retained_object.n.01'): 0.1111111111111111,
 Synset('infatuation.n.03'): 0.1111111111111111,
 Synset('misapplication.n.01'): 0.1111111111111111,
 Synset('substance_abuse.n.01'): 0.1111111111111111,
 Synset('witching.n.01'): 0.1111111111111111,
 Synset('rehash.v.01'): 0.1111111111111111,
 Synset('share.v.02'): 0.1111111111111111,
 Synset('spare.v.04'): 0.1111111111111111,
 Synset('address.v.07'): 0.1111111111111111,
 Synset('motel_room.n.01'): 0.1111111111111111,
 Synset('billiard_room.n.01'): 0.1111111111111111,
 Synset('cell.n.07'): 0.1111111111111111,
 Synset('conference_room.n.01'): 0.1111111111111111,
 Synset('cubby.n.01'): 0.1111111111111111,
 Synset('darkroom.n.01'): 0.1111111111111111,
 Synset('den.n.04'): 0.1111111111111111,
 Synset('dining_room.n.01'): 0.11

In [8]:
df = json.loads(pl.read_csv(
    "../datasets/TLN-definitions-23.tsv", separator='\t').write_json())

data = {}

for col in df['columns']:
    if col['name'] != '1':
        data[col['name']] = []
        for v in col['values']:
            data[col['name']].append(v)
            

In [9]:
final_results = {}

for word in data:
    i = 0
    for sentence in data[word]:
        target_terms = get_target_term(sentence)
        if target_terms is not None:
            if word in target_terms:
                i += 1
    final_results[word] = i/len(data[word])

final_results


{'door': 0.23333333333333334,
 'ladybug': 0.0,
 'pain': 0.06666666666666667,
 'blurriness': 0.0}