In [40]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:85% !important; }</style>"))

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import nltk
import itertools


import adagram
from lxml import html
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from pymorphy2 import MorphAnalyzer
from string import punctuation
import json, os
from collections import Counter
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import warnings

warnings.filterwarnings('ignore')

In [3]:

morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))



def tokenize(text):
    global stops
    words = [word.strip(punct) for word in text.lower().split() if word and word not in stops]
    words = [word for word in words if word]

    return words

def normalize(text):
    
    words = tokenize(text)
    words = [morph.parse(word)[0].normal_form for word in words if word]
    
    return words

## Задание № 1. Протестировать адаграм в определении перефразирования

Векторизуйте пары текстов с помощью Адаграма, обучите любую модель и оцените качество (кросс-валидацией). 

За основу возьмите код из предыдущего семинара/домашки, только в функции get_embedding вам нужно выбирать вектор нужного значения (импользуйте model.disambiguate и model.sense_vector). Отдельные векторы усредните как и в предыдущем семинаре.

In [4]:
with open('paraphraser/paraphrases.xml', 'rb') as file:
    corpus_xml = html.fromstring(file.read())

In [5]:
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [6]:
vm = adagram.VectorModel.load("out.pkl")

In [7]:
vm.word_sense_probs('земля')

[(0, 0.9999317918286611)]

In [8]:
vm.disambiguate('мир', ['новый', 'славный'])

array([4.89643997e-04, 9.99510356e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00])

In [9]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [10]:
to_compare = [[0, [1, 2, 3]],
 [1, [0, 2, 3, 4]],
 [2, [0, 1, 3, 4, 5]],
 [3, [0, 1, 2, 4, 5, 6]],
 [4, [1, 2, 3, 5, 6, 7]],
 [5, [2, 3, 4, 6, 7, 8]],
 [6, [3, 4, 5, 7, 8, 9]],
 [7, [4, 5, 6, 8, 9]],
 [8, [5, 6, 7, 9]],
 [9, [6, 7, 8]]]

to_compare

[[0, [1, 2, 3]],
 [1, [0, 2, 3, 4]],
 [2, [0, 1, 3, 4, 5]],
 [3, [0, 1, 2, 4, 5, 6]],
 [4, [1, 2, 3, 5, 6, 7]],
 [5, [2, 3, 4, 6, 7, 8]],
 [6, [3, 4, 5, 7, 8, 9]],
 [7, [4, 5, 6, 8, 9]],
 [8, [5, 6, 7, 9]],
 [9, [6, 7, 8]]]

In [11]:
def get_words_in_context(words, window=3):
    words_len = len(words)
    words_in_context = []
    for i in range(words_len):
        word = words[i]
        left_context = words[max(0, i - window):i]
        right_context = words[i + 1:min(words_len, i + window + 1)]
        context = left_context + right_context
        words_in_context.append([word, context])
    return words_in_context

In [12]:
words = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [13]:
get_words_in_context(words)

[[0, [1, 2, 3]],
 [1, [0, 2, 3, 4]],
 [2, [0, 1, 3, 4, 5]],
 [3, [0, 1, 2, 4, 5, 6]],
 [4, [1, 2, 3, 5, 6, 7]],
 [5, [2, 3, 4, 6, 7, 8]],
 [6, [3, 4, 5, 7, 8, 9]],
 [7, [4, 5, 6, 8, 9]],
 [8, [5, 6, 7, 9]],
 [9, [6, 7, 8]]]

In [14]:
get_words_in_context(words) == to_compare

True

In [15]:
def get_sense_vector(word, context, model):
    most_likely_sense = model.disambiguate(word, context).argmax()
    sense_vec = model.sense_vector(word, most_likely_sense)
    return sense_vec

In [16]:
def get_embedding_adagram(text, model, window=3, dim=100):
    
    word2context = get_words_in_context(text, window)
    
    
    vectors = np.zeros((len(word2context), dim))
    
    for i, (word, context) in enumerate(word2context):
        
        try:
            v = get_sense_vector(word, context, model)
            vectors[i] = v
        
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector
        

In [17]:
get_embedding_adagram('тренировочный мир тестов', vm)

array([ 0.01474951,  0.15916309, -0.00517532,  0.07148482, -0.11557754,
        0.00151955, -0.06875914, -0.04772396, -0.03014633,  0.11318449,
       -0.05010392,  0.07272814,  0.06377294,  0.06653478,  0.10099462,
       -0.00716796,  0.00876474,  0.18644314,  0.19070874, -0.00914995,
       -0.07892463, -0.00369429, -0.07506094, -0.09971022, -0.1280301 ,
       -0.08694893, -0.08521983, -0.06966679, -0.0385777 ,  0.00251285,
       -0.06280887, -0.09142431,  0.02111946, -0.07578302, -0.04335929,
        0.02006594, -0.03932559, -0.08505101, -0.13847218, -0.00089411,
        0.06361622,  0.07585328,  0.0174655 ,  0.25460323, -0.0474697 ,
        0.04015753,  0.01645942,  0.14489075,  0.04684166, -0.11665628,
        0.00724955,  0.00441676, -0.05104768,  0.1117997 ,  0.03522702,
       -0.04775833,  0.10879791, -0.0637923 ,  0.0181433 , -0.10177186,
        0.20217654,  0.04528808, -0.00873872, -0.06189587, -0.00435417,
       -0.00492449,  0.05316143,  0.02388541, -0.07703598, -0.11

In [18]:
X_text_1 = [get_embedding_adagram(text, vm) for text in data['text_1_norm']]
X_text_2 = [get_embedding_adagram(text, vm) for text in data['text_2_norm']]

X_text = np.concatenate([X_text_1, X_text_2], axis=1)

In [19]:
y = data.label.values

In [20]:
clf = LogisticRegression(C=1, solver='liblinear', penalty='l1')

In [21]:
np.mean(cross_val_score(clf, X_text, y, scoring='f1_micro', cv=5))

0.4258777919219092

In [22]:
len(list(wn.all_synsets()))

117659

### Задание 2. Реализовать алгоритм Леска и проверить его на реальном датасете

Ворднет можно использовать для дизамбигуации. Самый простой алгоритм дизамбигуации - алгоритм Леска. В нём нужное значение слова находится через пересечение слов контекста, в котором употреблено это слово, с определениями значений слова из ворднета. Значение с максимальным пересечением - нужное.

In [23]:
stops = set(stopwords.words('english'))

In [24]:
from nltk.stem import WordNetLemmatizer

In [25]:
lemmatizer = WordNetLemmatizer()

In [26]:
def normalize(text):    
    words = tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word]   
    return words

In [27]:
def has_examples(syn):
    if syn.examples():
        return True
    else:
        return False

In [28]:
def has_definition(syn):
    if syn.definition():
        return True
    else:
        return False

In [29]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    synsets = wn.synsets(word)
    if len(synsets) < 2:
        return bestsense
    
    sentence = set(sentence)
    definitions_dict = {}
    
    for i, syns in enumerate(synsets):
        if has_definition(syns):
            definitions_dict[i] = set(normalize(syns.definition()))
        else:
            definitions_dict[i] = set()
        
    for i in range(len(definitions_dict)):
        overlap = len(sentence.intersection(definitions_dict[i]))
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i
    
    return bestsense

In [30]:
with open('corpus_wsd_50k.txt') as file:
    corpus = file.read().split('\n\n')

In [31]:
corpus_wsd = []
for sent in corpus:
    corpus_wsd.append([s.split('\t') for s in sent.split('\n')])

In [32]:
def eval_sentence(sentence):
    lemmas = [word[1] for word in sentence]
    word_contexts = get_words_in_context(lemmas)
    total = 0 # number of ambiguous words in the sentence
    correct = 0 # number of correctly predicted ambiguous words
    for i, word_context in enumerate(word_contexts):
        word = word_context[0]
        context = word_context[1]
        sense = sentence[i][0]
        sense_predicted = lesk(word, context)
        if sense:
            total += 1
            correct += validate_sense(word, sense, sense_predicted)
    return correct, total

In [33]:
def validate_sense(word, sense, sense_predicted):
    syn = wn.synsets(word)
    
    if syn:
        prediction = syn[sense_predicted]
    else:
        prediction = None
        
    true_sense = wn.lemma_from_key(sense).synset()
        
#     print(prediction)
#     print(true_sense)
    
    if not prediction:
        return False
    return prediction == true_sense

In [34]:
eval_sentence(corpus_wsd[0])

(3, 7)

In [35]:
corpus_wsd[0]

[['', 'how', 'How'],
 ['long%3:00:02::', 'long', 'long'],
 ['', 'have', 'has'],
 ['', 'it', 'it'],
 ['be%2:42:03::', 'be', 'been'],
 ['', 'since', 'since'],
 ['', 'you', 'you'],
 ['review%2:31:00::', 'review', 'reviewed'],
 ['', 'the', 'the'],
 ['objective%1:09:00::', 'objective', 'objectives'],
 ['', 'of', 'of'],
 ['', 'you', 'your'],
 ['benefit%1:21:00::', 'benefit', 'benefit'],
 ['', 'and', 'and'],
 ['service%1:04:07::', 'service', 'service'],
 ['program%1:09:01::', 'program', 'program'],
 ['', '?', '?']]

In [36]:
def test_lesk(n):
    result = []
    for sentence in corpus_wsd[:min(n, len(corpus_wsd))]:
        result.append(eval_sentence(sentence))

    result = np.array(result)
    result_sum = np.sum(result, axis=0)
    accuracy = result_sum[0]/result_sum[1]
    return accuracy

In [37]:
%%time

print(f'Accuracy of current Lesk algorithm realisation is {test_lesk(10000)}')

Accuracy of current Lesk algorithm realisation is 0.5236189462480042
CPU times: user 1min 15s, sys: 4.78 s, total: 1min 20s
Wall time: 1min 20s


Попробуем доработать алгоритм, используя не только определения, но и примеры, если они есть.

In [38]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    
    synsets = wn.synsets(word)
    if len(synsets) < 2:
        return bestsense
    
    sentence = set(sentence)
    context = {}
    
    for i, syns in enumerate(synsets):
        if has_definition(syns):
            context[i] = set(normalize(syns.definition()))
            if has_examples(syns):
                examples_set = set()
                for example in syns.examples():
                    examples_set.update(normalize(example))
                context[i].update(examples_set)
        else:
            context[i] = set()
        
    for i in range(len(context)):
        overlap = len(sentence.intersection(context[i]))
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i
    
    return bestsense

Получаем небольшой прирост в качестве.

In [39]:
test_lesk(10000)

0.5309526343799894