In [13]:
import ast
from collections import defaultdict, OrderedDict
from itertools import chain, product
import string
from pprint import pprint

import numpy as np
from urllib.parse import quote
import urllib3
from scipy.spatial.distance import cosine
from tqdm.autonotebook import tqdm

import plwn

In [2]:
class Wordnet:
    """Wrapper around a wordnet object that caches its Polish lexical units.

    Every lexical unit is read once at construction time and stored as a
    ``(lemma, synset, definition)`` tuple. The tuples are additionally grouped
    by lemma so that ``senses`` is a dictionary lookup rather than a scan over
    the whole cache on every call.
    """

    def __init__(self, wordnet):
        """Keep ``wordnet`` and eagerly collect its Polish lexical units.

        Parameters
        ----------
        wordnet
            Object exposing ``lexical_units()``. Each yielded unit must provide
            ``lemma``, ``synset``, ``definition`` and ``is_polish``.
        """
        self._wordnet = wordnet
        self._lexical_units = self._valid_lexical_units()
        # Group the cached tuples by lemma once; the relative order of units
        # sharing a lemma is the order in which the wordnet yielded them.
        self._units_by_lemma = defaultdict(list)
        for unit in self._lexical_units:
            self._units_by_lemma[unit[0]].append(unit)

    def _valid_lexical_units(self):
        """Return ``(lemma, synset, definition)`` for every unit flagged ``is_polish``.

        Iteration is wrapped in ``tqdm`` so a progress bar is shown while the
        units are being read.
        """
        return [(lu.lemma, lu.synset, lu.definition)
                for lu in tqdm(self._wordnet.lexical_units(), desc='Filtering polish')
                if lu.is_polish]

    def senses(self, word):
        """Return the cached tuples whose lemma equals ``word``.

        Returns
        -------
        list
            A fresh list of ``(lemma, synset, definition)`` tuples in cache
            order; empty when ``word`` is unknown. Mutating the returned list
            does not affect the cache.
        """
        # .get() avoids inserting an empty entry for unknown words.
        return list(self._units_by_lemma.get(word, ()))

    def senses_verbose(self, word):
        """Return the senses of ``word`` extended with their related ids.

        Each result is ``(lemma, synset, definition, related)`` where
        ``related`` maps every key of ``synset.to_dict()['related']`` to a list
        of integers taken from the first column of that relation's entries.
        """
        def related_ids(entries):
            # Only the first column of each entry is kept and cast to int
            # (presumably the related synset id -- TODO confirm against plwn).
            if entries:
                return list(np.asarray(entries)[:, 0].astype(int))
            return []

        def relations_of(sense):
            related = sense[1].to_dict()['related']
            return {name: related_ids(entries) for name, entries in related.items()}

        return [(*sense, relations_of(sense)) for sense in self.senses(word)]

    @property
    def orig(self):
        """The wordnet object passed to the constructor."""
        return self._wordnet

In [3]:
class StopWord:
    """Stop-word list read from a UTF-8 text file, one entry per line."""

    def __init__(self, file):
        """Load the stop words from ``file`` (a path accepted by ``open``)."""
        self._stop_words = self._load_file(file)

    @property
    def stop_words(self):
        """The loaded entries as a list of strings, in file order."""
        return self._stop_words

    def _load_file(self, file):
        """Return the lines of ``file`` without their line terminators.

        The file is opened inside a ``with`` block so the handle is closed
        as soon as the content has been read, even if reading fails.
        """
        with open(file, encoding='utf-8') as f:
            return f.read().splitlines()

    def is_stop_word(self, word):
        """Tell whether the lower-cased ``word`` is one of the loaded entries.

        Only ``word`` is lower-cased; the entries are compared exactly as
        they appear in the file.
        """
        return word.lower() in self._stop_words

In [17]:
class RemoteWordEmbedding:
    """HTTP client for the ``/word_emb/<word>`` endpoint of an embedding service."""

    def __init__(self, address):
        """Remember the service base URL and create a urllib3 connection pool.

        Parameters
        ----------
        address
            Base URL without a trailing slash, e.g. ``'http://host:4000'``.
        """
        self._address = address
        self._http = urllib3.PoolManager()

    def get_embedding(self, word):
        """Fetch the embedding of ``word`` from the remote service.

        Returns
        -------
        The Python literal parsed from the response body with
        ``ast.literal_eval``.

        Raises
        ------
        Exception
            When the service answers with a status other than 200. The message
            names the word as it was requested, not its percent-encoded form.
        """
        # safe='' also escapes '/', so the word always stays a single path segment.
        encoded_word = quote(word, safe='')
        target = f'{self._address}/word_emb/{encoded_word}'
        response = self._http.request('GET', target)
        if response.status != 200:
            raise Exception(f'A problem occured during getting an embedding for word: {word}...')
        # literal_eval only accepts Python literals, so the body is never executed.
        return ast.literal_eval(response.data.decode('ascii'))

In [5]:
class RemoteSenseEmbedding:
    """HTTP client for the ``/sense_emb/<synset_id>`` endpoint of an embedding service."""

    def __init__(self, address):
        """Remember the service base URL and create a urllib3 connection pool."""
        self._address = address
        self._http = urllib3.PoolManager()

    def get_embedding(self, synset_id):
        """Fetch the embedding stored for ``synset_id``.

        The id is interpolated into the URL as-is. A 200 response body is
        decoded as ASCII and parsed with ``ast.literal_eval``; any other
        status raises ``Exception``.
        """
        url = '{}/sense_emb/{}'.format(self._address, synset_id)
        response = self._http.request('GET', url)

        # Guard clause: anything but 200 is reported to the caller.
        if response.status != 200:
            raise Exception(f'A problem occured during getting an embedding for synset id: {synset_id}...')

        payload = response.data.decode('ascii')
        return ast.literal_eval(payload)

In [18]:
# Base URL handed to both remote embedding clients created below.
remote_embeddings_address = 'http://10.17.5.15:4000'
# Path given to StopWord, which reads it as UTF-8 with one entry per line.
stop_words_file = 'polish.stopwords.txt'

# Module-level helpers: stop-word filter, wordnet wrapper, and the two HTTP clients.
sw = StopWord(stop_words_file)
# Wordnet.__init__ iterates over every lexical unit of the loaded wordnet,
# so this is presumably the slow line of the cell (it shows a progress bar).
wn = Wordnet(plwn.load_default())
we = RemoteWordEmbedding(remote_embeddings_address)
se = RemoteSenseEmbedding(remote_embeddings_address)

HBox(children=(IntProgress(value=0, description='Filtering polish', max=504102, style=ProgressStyle(descriptio…




In [19]:
# Smoke test: request the embedding of a word that contains a non-ASCII
# character, which exercises the URL quoting in RemoteWordEmbedding.
we.get_embedding('góra')
# se.get_embedding(1234)

http://10.17.5.15:4000/word_emb/g%C3%B3ra


[-0.00526798889040947,
 0.2318645864725113,
 0.06431606411933899,
 0.09118449687957764,
 0.09561485797166824,
 -0.2845797538757324,
 0.5002983212471008,
 -0.13274794816970825,
 -0.2651445269584656,
 0.13124078512191772,
 0.0005359433125704527,
 0.10206595808267593,
 0.012797323986887932,
 -0.026633350178599358,
 0.022827332839369774,
 -0.1445096880197525,
 -0.010917897336184978,
 -0.00679526012390852,
 0.1555798351764679,
 -0.2990945279598236,
 0.44120216369628906,
 -0.1583249568939209,
 0.03469482809305191,
 0.12148593366146088,
 0.43526381254196167,
 0.06914525479078293,
 -0.550913393497467,
 -0.18576383590698242,
 -0.30630794167518616,
 0.06487590819597244,
 0.23038145899772644,
 0.36285528540611267,
 -0.5347830057144165,
 -0.04094468057155609,
 -0.01520390436053276,
 -0.1483423113822937,
 0.14404188096523285,
 0.0583166778087616,
 0.12850326299667358,
 -0.020545467734336853,
 -0.2679474949836731,
 0.2783542275428772,
 -0.2191409468650818,
 -0.20301111042499542,
 -0.0619571432471275

In [None]:
def clean_text(text, sw):
    """Drop stop words and ASCII punctuation from ``text``.

    The text is split on whitespace. Every token has the characters of
    ``string.punctuation`` removed; a token is discarded when it is a stop
    word either as written or after that removal, so a stop word followed
    by a comma or a full stop is filtered too. Tokens that consist only of
    punctuation are discarded instead of leaving a double space behind.

    Parameters
    ----------
    text : str
        Input text.
    sw
        Object providing ``is_stop_word(word) -> bool``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    kept = []
    for token in text.split():
        bare = token.translate(strip_punctuation)
        if not bare:
            # The token was punctuation only.
            continue
        # The raw form is checked as well so that list entries which themselves
        # contain punctuation keep matching.
        if sw.is_stop_word(token) or sw.is_stop_word(bare):
            continue
        kept.append(bare)
    return ' '.join(kept)

In [None]:
# def get_senses(word: str):
#     return dict([[x[1].id, x[2]] for x in wn.senses(word)])

In [None]:
# def get_gloss(word: str, synset_id: int):
#     return get_senses(word).get(synset_id)

In [None]:
# def analyse_text_with_morfeusz(text: str):
#     return morf.analyse(clean_text(text, sw))

In [None]:
# def get_lemmas(morf_output):
#     return set(map(lambda word: word.split(':')[0], [data[2][1] for data in morf_output]))

In [None]:
# def get_lemmas(morf_output):
#     words = defaultdict(set)
#     for data in morf_output:
#         words[data[2][0]].add(data[2][1].split(':')[0])
#     return list(words.values())

In [None]:
# def get_words(text):
#     return get_lemmas(analyse_text_with_morfeusz(text))

In [None]:
# def related_syn_ids(word, synset_id, *relations):
#     related = [related[3] for related in wn.senses_verbose(word) if related[1].id == synset_id]
#     if related:
#         related = chain(*[v for k, v in related[0].items() 
#                           for relation in relations if k.startswith(relation)])
#     return list(map(int, related))

In [None]:
# def glosses_from_syn_ids(syn_ids):
#     def only_polish_def(syn_id):
#         data = wn.orig.synset_by_id(syn_id)
#         return data.to_dict()['units'][0]['definition'] if data.is_polish else None
#     glosses = filter(lambda x: x not in [None, '.'], map(only_polish_def, syn_ids))
#     return list(glosses)

In [None]:
# def sorted_by_senses_count(words):
#     sorted_words = {}
    
#     for word in words:
#         count = len(wn.senses(word))
        
#         if count:
#             sorted_words[word] = count
            
#     return OrderedDict(sorted(sorted_words.items(), key=lambda x: x[1])).keys()

In [None]:
# def C_w(W_wout_W, disambiguated):
#     def _get_vector(w):
#         return se[disambiguated[w]] if w in disambiguated.keys() else we.wv[w]
#     return np.average(list(map(_get_vector, W_wout_W)), axis=0)

In [None]:
# def G_s(word, synset_id, use_related=True, *relations):
#     if use_related:
#         synset_ids = related_syn_ids(word, synset_id, *relations)
#         glosses = glosses_from_syn_ids(synset_ids)
#         gloss = list(chain(*filter(lambda words: len(words) == 1,
#                             chain(map(list, chain(*map(get_words, glosses)))))))
#     else:
#         gloss = get_gloss(word, synset_id)
#     return np.average(list(map(we.wv.get_vector, gloss)), axis=0) if gloss else 0.0

In [None]:
# def wsd(W, use_related=True, relations=['hiperonimia', 'synonimia']):
    
#     best_scores = []
    
#     for lemmas_set in tqdm(list(product(*get_words(W))), 'Lemmas set'):
#         best_senses = {}
        
#         W_temp = list(sorted_by_senses_count(lemmas_set)) # sort by number of senses
#         sum_scores = 0.0
        
#         for w in tqdm(W_temp, 'Word', leave=False):
#             word_score = {}
            
#             best_sense, best_score = None, 0.0

#             W_wout_w = W_temp.copy() 
#             W_wout_w.remove(w) 
#             c_w = C_w(W_wout_w, best_senses)

#             for word, synset, gloss in wn.senses(w):

#                 if se.get(synset.id) is None:
#                     continue

#                 g_s = G_s(word, synset.id, use_related, *relations) 

#                 first_cos = 1 - cosine(g_s, c_w) 
#                 second_cos = 1 - cosine(se[synset.id], c_w)
#                 score = first_cos + second_cos # sum of the cosine similarities

#                 word_score[synset.id] = score

#                 if score > best_score:
#                     best_score = score
#                     best_sense = synset.id
                    
#             if best_sense:
#                 best_senses[w] = best_sense
                
#             sum_scores += best_score
            
#         best_scores.append((sum_scores, best_senses))
    
#     best_senses = sorted(best_scores, key=lambda data: data[0], reverse=True)[0][1]
#     return dict(map(lambda data: (data[0], get_gloss(*data)), best_senses.items()))

In [None]:
# wsd_out = wsd('Język (łac. lingua) – wieloczynnościowy narząd, twór mięśniowy jamy gębowej kręgowców. Głównym zadaniem języka jest podsuwanie pokarmu pod zęby, mieszanie pokarmu w czasie żucia i przesuwanie kęsów pokarmu do gardła, lecz niektóre gatunki używają go również do innych celów.')

In [None]:
# wsd_out