In [1]:
import sys
sys.path.append('../../taxonomy-enrichment/baselines/ruwordnet')
sys.path.append('../../taxonomy-enrichment/baselines')

In [2]:
import re
import numpy as np
import pandas as pd
import fasttext
from ruwordnet_reader import RuWordnet
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import xml.etree.ElementTree as ET

In [3]:
# Open the RuWordNet database file from the shared-task dataset directory.
# NOTE(review): ruwordnet_path=None presumably skips loading from the original XML sources — confirm in ruwordnet_reader.
ruwordnet = RuWordnet(db_path="../../dialogue2020_shared_task_hypernyms/dataset/ruwordnet.db", ruwordnet_path=None)

In [4]:
# Public test words: one entry per line of the file, trailing whitespace removed.
public_test = []
with open('../../dialogue2020_shared_task_hypernyms/dataset/public/verbs_public_no_labels.tsv', 'r', encoding="utf-8") as f:
    public_test.extend(raw_line.rstrip() for raw_line in f)

In [5]:
# Private test words: one entry per line of the file.
private_test = []
with open('../../dialogue2020_shared_task_hypernyms/dataset/private/verbs_private_no_labels.tsv', 'r', encoding="utf-8") as f:
    for line in f:
        # Drop the newline (and any other trailing whitespace) before storing the word.
        line = line.rstrip()
        private_test.append(line)
# NOTE(review): `line` stays bound at notebook level after this loop, so any later
# code that reads a global named `line` sees the last word of this file.

In [6]:
public_test[:3], private_test[:3]

(['АБСОЛЮТИЗИРОВАТЬ', 'АКТИВИРОВАТЬ', 'АМЕРИКАНИЗИРОВАТЬ'],
 ['АДСОРБИРОВАТЬ', 'АКАТЬ', 'АКТИРОВАТЬ'])

In [7]:
# Index RuWordNet senses by lowercased sense text, keeping only synsets whose id ends in "V".
# `nouns` maps text -> list of synset ids; `nouns_list` keeps the texts in first-seen order.
nouns = {}
nouns_list = []
for sense_id, synset_id, text in ruwordnet.get_all_senses():
    if not synset_id.endswith("V"):
        continue
    ltext = text.lower()
    synsets_for_text = nouns.get(ltext)
    if synsets_for_text is None:
        # First occurrence of this text: register it in the ordered list as well.
        nouns_list.append(ltext)
        synsets_for_text = nouns[ltext] = []
    synsets_for_text.append(synset_id)
len(nouns), len(nouns_list)

(26538, 26538)

In [8]:
# Reverse index: synset id (ids ending in "V" only) -> lowercased sense texts of that synset.
synset2words = {}
for sense_id, synset_id, text in ruwordnet.get_all_senses():
    if not synset_id.endswith("V"):
        continue
    if synset_id not in synset2words:
        synset2words[synset_id] = []
    synset2words[synset_id].append(text.lower())
len(synset2words)

7521

In [9]:
list(nouns.items())[:3]

[('льет дождь', ['4223-V']),
 ('дождь льет', ['4223-V']),
 ('лить как из ведра', ['4223-V'])]

In [10]:
df_test = pd.DataFrame(data={'word': public_test + private_test})
df_test.shape

(525, 1)

In [11]:
# Flag rows whose word occurs in the private list (1) versus not (0).
private_words = set(private_test)
df_test['private'] = [int(x in private_words) for x in df_test['word']]

In [12]:
df_test['private'].value_counts()

1    350
0    175
Name: private, dtype: int64

In [13]:
df_test.head()

Unnamed: 0,word,private
0,АБСОЛЮТИЗИРОВАТЬ,0
1,АКТИВИРОВАТЬ,0
2,АМЕРИКАНИЗИРОВАТЬ,0
3,АНОДИРОВАТЬ,0
4,БИНТОВАТЬ,0


In [14]:
wiktionarydump = "ruwiktionary-20200120-pages-articles-multistream.xml"

In [15]:
title2doc = {}

In [16]:
# Stream the Wiktionary XML dump and store one dict of fields per <page> in title2doc.
doc = {}
# XML tag name (namespace stripped) -> key used in the per-page dict.
fields = {
    "timestamp": "timestamp",
    "title": "title",
    "text": "text",
    # NOTE(review): an XML tag name cannot contain a space, so this entry looks unreachable — confirm.
    "redirect title": "redirect_title",
}
cnt = 0
for _, elem in tqdm(ET.iterparse(wiktionarydump, events=("end",))):
    # Tags may look like "{namespace}name": keep the part after "}", or the whole tag if there is none.
    prefix, has_namespace, postfix = elem.tag.partition('}')
    tag = postfix if postfix else prefix
    if tag in fields:
        doc[fields[tag]] = elem.text
    if tag == "page":
        # End of a page: clear the element, file the collected fields under the page title,
        # and start a fresh dict for the next page.
        elem.clear()
        cnt += 1
        title2doc[doc["title"]] = doc
        doc = {}

35866269it [02:25, 246403.81it/s]


In [17]:
# longest article by lowercased word
# For every lowercased title keep the single article with the longest text.
ltitle2doc = {}
for title, article in title2doc.items():
    key = title.lower()
    current_best = ltitle2doc.get(key)
    if current_best is None or len(article['text']) > len(current_best['text']):
        ltitle2doc[key] = article
ltitle_list = list(ltitle2doc.keys())

In [18]:
# all articles grouped by lowercased title (unlike ltitle2doc, nothing is discarded)
ltitle2docs = {}
for title, article in title2doc.items():
    key = title.lower()
    if key not in ltitle2docs:
        ltitle2docs[key] = []
    ltitle2docs[key].append(article)
ltitle_list = list(ltitle2docs.keys())

In [19]:
# 1 when the lowercased test word has a Wiktionary article, else 0; report coverage per split.
df_test['wikt_in'] = [int(x.lower() in ltitle2doc) for x in df_test['word']]
for is_private in (0, 1):
    print(df_test[df_test['private'] == is_private]['wikt_in'].value_counts())

1    173
0      2
Name: wikt_in, dtype: int64
1    350
Name: wikt_in, dtype: int64


In [21]:
ftmodel = fasttext.load_model("../../dialogue2020_shared_task_hypernyms/baselines/models/cc.ru.300.bin")




In [22]:
ftwords_list = ftmodel.get_words()
ftwords = set(ftwords_list)

In [23]:
lword2word = {word.lower(): word for word in ftwords_list}
len(lword2word), len(ftwords_list)

(1674899, 2000000)

In [24]:
# 1 when the lowercased test word matches a lowercased fastText vocabulary entry, else 0.
df_test['ft_in'] = [int(x.lower() in lword2word) for x in df_test['word']]
for is_private in (0, 1):
    print(df_test[df_test['private'] == is_private]['ft_in'].value_counts())

1    140
0     35
Name: ft_in, dtype: int64
1    279
0     71
Name: ft_in, dtype: int64


In [25]:
# One fastText sentence vector per RuWordNet sense text; row order follows nouns_list.
nouns_vectors = np.zeros((len(nouns_list), ftmodel.get_dimension()))
for phrase, row in zip(tqdm(nouns_list), nouns_vectors):
    row[:] = ftmodel.get_sentence_vector(phrase)

100%|█████████████████████████████████████████████████████████████████████████| 26538/26538 [00:01<00:00, 21649.73it/s]


In [26]:
# One fastText sentence vector per lowercased Wiktionary title; row order follows ltitle_list.
ltitle_vectors = np.zeros((len(ltitle_list), ftmodel.get_dimension()))
for title, row in zip(tqdm(ltitle_list), ltitle_vectors):
    row[:] = ftmodel.get_sentence_vector(title)

100%|█████████████████████████████████████████████████████████████████████| 2177428/2177428 [00:53<00:00, 41035.03it/s]


In [27]:
def get_top_k_similar(vectors, vector, k=1):
    """Return the row indices of `vectors` with the largest dot product against `vector`.

    Args:
        vectors: 2-D array-like with one candidate vector per row.
        vector: 1-D query vector whose length matches the row length.
        k: number of indices to return; capped at the number of rows.

    Returns:
        A list of at most `k` distinct row indices, best match first. Ties keep
        the lower index first (np.argmax semantics).
    """
    # np.dot allocates a fresh array, so masking entries in place is safe.
    dots = np.asarray(np.dot(vectors, vector), dtype=float)
    res = []
    for _ in range(min(k, dots.shape[0])):
        idx = np.argmax(dots)
        res.append(idx)
        # Mask with -inf, not 0: a 0 would outrank every remaining negative score
        # and make the same index win again.
        dots[idx] = -np.inf
    return res

In [28]:
# Sanity check: print the five Wiktionary titles closest to one public test word.
i = 1
lword = public_test[i].lower()
query_vector = ftmodel.get_sentence_vector(lword)
for idx in get_top_k_similar(ltitle_vectors, query_vector, k=5):
    print(lword, ltitle_list[idx])

активировать активировать
активировать деактивировать
активировать активироваться
активировать реактивировать
активировать разблокировать


In [29]:
# Ten RuWordNet sense texts nearest to each test word in fastText space.
wn_top10_test = []
for word in tqdm(df_test['word']):
    query_vector = ftmodel.get_sentence_vector(word.lower())
    nearest = get_top_k_similar(nouns_vectors, query_vector, k=10)
    wn_top10_test.append([nouns_list[x] for x in nearest])
df_test['wn_top10'] = wn_top10_test

100%|███████████████████████████████████████████████████████████████████████████████| 525/525 [00:02<00:00, 231.82it/s]


In [30]:
df_train = pd.DataFrame(data={'word': [x[0].upper() for x in df_test['wn_top10']]})
# df_train = pd.DataFrame(data={'word': []})
df_train.shape

(525, 1)

In [31]:
# skip self
# Fetch 11 neighbours so that 10 remain once the word itself is removed. If the word
# is not among the 11 hits nothing is removed, so truncate to 10 to keep the column
# the same length as df_test['wn_top10'].
df_train['wn_top10'] = [
    [
        nouns_list[x]
        for x in get_top_k_similar(nouns_vectors, ftmodel.get_sentence_vector(word.lower()), k=11)
        if nouns_list[x] != word.lower()
    ][:10]
    for word in tqdm(df_train['word'])
]

100%|███████████████████████████████████████████████████████████████████████████████| 525/525 [00:02<00:00, 247.37it/s]


In [32]:
# Ten Wiktionary titles nearest to each word in fastText space, for both frames.
for df in [df_test, df_train]:
    nearest_titles = []
    for word in tqdm(df['word']):
        hits = get_top_k_similar(ltitle_vectors, ftmodel.get_sentence_vector(word.lower()), k=10)
        nearest_titles.append([ltitle_list[x] for x in hits])
    df['wikt_top10'] = nearest_titles

100%|████████████████████████████████████████████████████████████████████████████████| 525/525 [02:51<00:00,  3.05it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 525/525 [02:51<00:00,  3.05it/s]


In [33]:
def clean_markup(text):
    """Remove wiki-link brackets and the opening of an ``{{aslinks|`` template from `text`."""
    return text.replace("[[", "").replace("]]", "").replace("{{aslinks|", "")

def parse_item(text):
    """Split one ``# a, b; c`` list line of a Wiktionary section into cleaned items.

    Args:
        text: a single line of wiki markup.

    Returns:
        The cleaned entries of the line, or an empty list when the line is not
        a ``# `` list item or carries nothing after the marker.
    """
    items = []
    # The length check must look at `text`; it previously read an unrelated
    # notebook-level variable named `line`.
    if text.startswith("# ") and len(text) > 2:
        items.extend([
            clean_markup(x).replace("?", "").replace(";", "").replace("'", "").strip()
            for x in re.split(',|;', text[2:]) if x not in {'-', '?', '—', ''}
        ])
    return items

def parse_translation(trans):
    """Parse the ``|key=value`` lines of a Wiktionary translation block.

    Args:
        trans: the raw text of the block, one parameter per line.

    Returns:
        A dict mapping each key (text between the leading ``|`` and the first
        ``=``) to its value with ``[[``/``]]`` removed. Lines that do not start
        with ``|`` or contain no ``=`` are ignored.
    """
    res = {}
    for line in trans.split('\n'):
        if not line.startswith('|'):
            continue
        # Split on the first "=" only: values may contain "=", and lines such as
        # "|}" have none (a plain two-way unpack of split('=') raised on both).
        key, sep, value = line.partition('=')
        if not sep:
            continue
        res[key[1:]] = value.replace('[[', '').replace(']]', '')
    return res

def parse_wiktionary(text):
    """Extract hypernyms, synonyms, meanings and the translation block from an article.

    Only lines under the top-level heading ``= {{-ru-}} =`` are considered. That part
    is split into blank-line-separated paragraphs; a paragraph containing one of the
    known section headings fills the matching key, and a later paragraph with the
    same heading replaces the earlier value.

    Returns:
        dict with keys 'hypernym', 'synonym', 'meaning' (lists) and, when a
        translation section is present, 'translation' (raw text).
    """
    parsed = {'hypernym': [], 'synonym': [], 'meaning': []}
    list_sections = (('==== Гиперонимы ====', 'hypernym'), ('==== Синонимы ====', 'synonym'))

    # Keep only the lines that belong to the Russian top-level section.
    current_h1 = ""
    russian_lines = []
    for raw_line in text.split("\n"):
        if raw_line.startswith("= ") and raw_line.endswith(" ="):
            current_h1 = raw_line
        if current_h1 == '= {{-ru-}} =':
            russian_lines.append(raw_line)

    for paragraph in "\n".join(russian_lines).split("\n\n"):
        paragraph_lines = paragraph.split("\n")
        for heading, field in list_sections:
            if heading in paragraph:
                collected = []
                for paragraph_line in paragraph_lines:
                    collected.extend(parse_item(paragraph_line))
                parsed[field] = collected
        if '==== Значение ====' in paragraph:
            parsed['meaning'] = [
                clean_markup(entry[2:])
                for entry in paragraph_lines
                if entry.startswith('# ') and len(entry) > 2
            ]
        if '=== Перевод ===' in paragraph:
            parsed['translation'] = paragraph.replace('=== Перевод ===\n', '')
    return parsed

In [34]:
# Hypernym strings from the word's own Wiktionary article (empty list when there is none).
for df in [df_test, df_train]:
    own_hypernyms = []
    for word in tqdm(df['word']):
        article = ltitle2doc.get(word.lower())
        if article is None:
            own_hypernyms.append([])
        else:
            own_hypernyms.append(parse_wiktionary(article['text'])['hypernym'])
    df['wikt_hypernyms_text'] = own_hypernyms

100%|███████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 638.39it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 917.13it/s]


In [35]:
# Hypernym strings from the article of the nearest Wiktionary title (first entry of wikt_top10).
for df in [df_test, df_train]:
    top1_hypernyms = []
    for words in tqdm(df['wikt_top10']):
        article = ltitle2doc[words[0].lower()]
        top1_hypernyms.append(parse_wiktionary(article['text'])['hypernym'])
    df['wikt_top1_hypernyms_text'] = top1_hypernyms

100%|██████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 9571.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 8808.50it/s]


In [36]:
sum([len(x) for x in df_test['wikt_top1_hypernyms_text']]), sum([len(x) for x in df_train['wikt_top1_hypernyms_text']])

(176, 211)

In [37]:
# Same as wikt_top1_hypernyms_text, but pooled over every article sharing the lowercased title.
for df in [df_test, df_train]:
    df['wikt_top1_hypernyms_text_docs'] = [
        [
            hyp
            for article in ltitle2docs[words[0].lower()]
            for hyp in parse_wiktionary(article['text'])['hypernym']
        ]
        for words in tqdm(df['wikt_top10'])
    ]

100%|██████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 1789.57it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 1636.32it/s]


In [38]:
sum([len(x) for x in df_test['wikt_top1_hypernyms_text_docs']]), sum([len(x) for x in df_train['wikt_top1_hypernyms_text_docs']])

(176, 211)

In [39]:
# For each of the ten nearest RuWordNet texts, collect the hypernyms of all its synsets.
for df in [df_test, df_train]:
    for rank in range(10):
        column = []
        for words in tqdm(df['wn_top10']):
            neighbour_hypernyms = []
            for synset_id in nouns[words[rank]]:
                neighbour_hypernyms.extend(ruwordnet.get_hypernyms_by_id(synset_id))
            column.append(neighbour_hypernyms)
        df['wn_top%d_hypernyms' % (rank + 1)] = column

100%|███████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 903.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 6430.30it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 6011.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 8231.87it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 10154.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 11475.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 11697.57it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 525/525 [00:00<00:00, 8128.56it/s]
100%|███████████████████████████████████

In [40]:
import os
from collections import Counter
import pymorphy2
from nltk.tokenize import word_tokenize

In [41]:
morph = pymorphy2.MorphAnalyzer()

In [42]:
# Map the textual Wiktionary hypernyms of every word onto RuWordNet synset ids.
for df in [df_test, df_train]:
    cnt = 0  # hypernym strings that were not found verbatim in `nouns`
    lens = []
    wikt_top1_hypernyms = []
    for word, hypernyms_text in zip(df['word'], df['wikt_top1_hypernyms_text_docs']):
        res = []
        for hypernym in hypernyms_text:
            # Lowercase and fold "ё" to "е" before the dictionary lookup.
            lhypernym = hypernym.lower().replace('ё', 'е')
            if lhypernym in nouns:
                # Synset ids are appended in sorted order.
                res.extend(sorted(nouns[lhypernym]))
            else:
                # Fallback: when the first morphological analysis is tagged plural,
                # retry the lookup with its normal form.
                parsed = morph.parse(lhypernym)
                if 'plur' in parsed[0].tag and parsed[0].normal_form in nouns:
                    res.extend(sorted(nouns[parsed[0].normal_form]))
                # Incremented for every non-verbatim hypernym, even when the fallback matched.
                cnt += 1
        # NOTE(review): `lens` is filled here but not read in any visible cell.
        lens.append(len(res))
        wikt_top1_hypernyms.append(res)
    df['wikt_top1_hypernyms_docs'] = wikt_top1_hypernyms
    print(cnt)

16
21


In [43]:
df_test.head(6)

Unnamed: 0,word,private,wikt_in,ft_in,wn_top10,wikt_top10,wikt_hypernyms_text,wikt_top1_hypernyms_text,wikt_top1_hypernyms_text_docs,wn_top1_hypernyms,wn_top2_hypernyms,wn_top3_hypernyms,wn_top4_hypernyms,wn_top5_hypernyms,wn_top6_hypernyms,wn_top7_hypernyms,wn_top8_hypernyms,wn_top9_hypernyms,wn_top10_hypernyms,wikt_top1_hypernyms_docs
0,АБСОЛЮТИЗИРОВАТЬ,0,1,1,"[преувеличивать, переоценивать, идеализировать...","[абсолютизировать, абсолютизируя, абсолютизиро...",[],[],[],[116390-V],"[116115-V, 139096-V, 107602-V, 115033-V, 11503...",[116751-V],[116390-V],"[106882-V, 106517-V]",[116390-V],"[115032-V, 131407-V, 132922-V, 116112-V, 13909...",[116390-V],"[117017-V, 107137-V, 115247-V]","[117017-V, 106875-V, 118713-V]",[]
1,АКТИВИРОВАТЬ,0,1,1,"[разблокировать, дезактивировать, отключить, з...","[активировать, деактивировать, активироваться,...",[],[],[],"[153231-V, 121460-V]","[144157-V, 4661-V, 6444-V]","[107417-V, 111842-V]","[111022-V, 120174-V, 106473-V, 107441-V, 10744...","[111668-V, 110751-V, 111668-V]","[106631-V, 106638-V, 106531-V, 107417-V]","[149898-V, 106490-V, 106490-V, 106493-V, 13284...","[106585-V, 106698-V, 106709-V, 118698-V, 10649...","[106533-V, 106704-V, 106882-V, 116223-V, 11956...",[110474-V],[]
2,АМЕРИКАНИЗИРОВАТЬ,0,1,0,"[урбанизировать, гофрировать, индустриализиров...","[американизировать, американизироваться, амери...",[изменять],[изменять],[изменять],"[112075-V, 116640-V]",[124516-V],[923-V],[135851-V],"[106501-V, 7237-V]","[106501-V, 7237-V]","[106494-V, 145128-V]","[121336-V, 107410-V]",[116636-V],[107417-V],"[106631-V, 111281-V, 117315-V]"
3,АНОДИРОВАТЬ,0,1,0,"[газировать, гофрировать, оцинковать, наполиро...","[анодировать, анодироваться, анодировав, аноди...",[обрабатывать],[обрабатывать],[обрабатывать],[111435-V],[124516-V],[107325-V],"[111769-V, 111785-V, 146751-V]",[107325-V],"[106494-V, 145128-V]",[106950-V],[107325-V],[111109-V],[107325-V],"[106534-V, 106535-V]"
4,БИНТОВАТЬ,0,1,1,"[перевязывать, заматывать, обматывать, растира...","[бинтовать, перебинтовать, забинтовать, бинтов...","[перевязывать, обматывать, обёртывать]","[перевязывать, обматывать, обёртывать]","[перевязывать, обматывать, обёртывать]","[111427-V, 111467-V, 120969-V, 116334-V]","[114599-V, 118663-V, 112000-V]","[107325-V, 112000-V]","[111781-V, 115942-V, 118570-V]","[107325-V, 110813-V, 115187-V, 106888-V]","[107011-V, 114441-V]","[106472-V, 106529-V]","[111427-V, 111467-V, 120969-V, 116334-V]",[108856-V],[111427-V],"[111428-V, 117117-V, 127479-V, 115187-V, 11623..."
5,БОДРИТЬСЯ,0,1,1,"[бодрить, ободриться, приободриться, храбритьс...","[бодриться, бодрить, ободриться, приободриться...",[],[],[],[118574-V],"[107253-V, 125736-V]","[107253-V, 125736-V]",[115900-V],[118574-V],"[106714-V, 119928-V, 106484-V, 106531-V, 14940...","[106565-V, 123840-V, 148270-V]","[128286-V, 116244-V, 124452-V]",[120955-V],"[107253-V, 125736-V]",[]


In [44]:
df_train.head(6)

Unnamed: 0,word,wn_top10,wikt_top10,wikt_hypernyms_text,wikt_top1_hypernyms_text,wikt_top1_hypernyms_text_docs,wn_top1_hypernyms,wn_top2_hypernyms,wn_top3_hypernyms,wn_top4_hypernyms,wn_top5_hypernyms,wn_top6_hypernyms,wn_top7_hypernyms,wn_top8_hypernyms,wn_top9_hypernyms,wn_top10_hypernyms,wikt_top1_hypernyms_docs
0,ПРЕУВЕЛИЧИВАТЬ,"[преуменьшать, приуменьшать, недооценивать, пе...","[преувеличивать, преуменьшать, приуменьшать, н...",[],[],[],[116390-V],[116390-V],"[115032-V, 131407-V, 132922-V, 116112-V, 13909...","[116115-V, 139096-V, 107602-V, 115033-V, 11503...",[116390-V],[116390-V],"[116115-V, 141697-V, 106632-V, 111769-V]",[116390-V],[116751-V],[116390-V],[]
1,РАЗБЛОКИРОВАТЬ,"[заблокировать, блокировать, разблокировать ус...","[разблокировать, заблокировать, разблокировать...",[],[],[],"[106533-V, 106704-V, 106882-V, 116223-V, 11956...","[106533-V, 106704-V, 106882-V, 116223-V, 11956...",[121460-V],"[121458-V, 115580-V]","[107417-V, 111842-V]","[106710-V, 117028-V, 117371-V, 106938-V, 10647...","[106631-V, 129710-V, 110447-V, 117535-V]","[144157-V, 4661-V, 6444-V]","[111022-V, 120174-V, 106473-V, 107441-V, 10744...","[107409-V, 107410-V, 108870-V]",[]
2,УРБАНИЗИРОВАТЬ,"[гофрировать, милитаризировать, индустриализир...","[урбанизировать, урбанизироваться, урбанизиров...",[],[],[],[124516-V],"[106501-V, 7237-V]",[923-V],"[106501-V, 7237-V]",[3513-V],"[106501-V, 7237-V]",[923-V],"[121336-V, 5944-V]",[106501-V],"[106501-V, 147138-V]",[]
3,ГАЗИРОВАТЬ,"[гофрировать, минерализовать, дегазировать, ка...","[газировать, газироваться, разгазировать, гази...",[],[],[],[124516-V],[111435-V],"[110908-V, 128431-V]","[106494-V, 145128-V]",[107325-V],[107325-V],"[114041-V, 115570-V]","[112144-V, 924-V]","[111682-V, 112820-V]","[106674-V, 115814-V]",[]
4,ПЕРЕВЯЗЫВАТЬ,"[перевязать, зашивать, обвязывать, перевязыват...","[перевязывать, перевязать, зашивать, обвязыват...","[лечить, связывать]","[лечить, связывать]","[лечить, связывать]","[111427-V, 111467-V, 120969-V, 116334-V]","[106557-V, 113291-V, 111568-V]",[115187-V],[111427-V],"[107325-V, 112000-V]","[114599-V, 118663-V, 112000-V]","[116817-V, 127237-V]","[107412-V, 107416-V, 107410-V, 107417-V]",[106557-V],"[110790-V, 113291-V, 106483-V, 106548-V]","[1061-V, 118177-V, 111465-V, 125093-V, 137319-..."
5,БОДРИТЬ,"[взбадривать, взбодрить, тонизировать, отрезвл...","[бодрить, взбадривать, бодриться, взбодрить, т...",[],[],[],[118574-V],[118574-V],"[118574-V, 106632-V]","[128286-V, 116244-V, 124452-V]",[115104-V],[118574-V],[115104-V],"[106535-V, 120557-V, 120649-V]","[106817-V, 129710-V, 111493-V, 111942-V]",[114652-V],[]


In [45]:
import codecs

def save_to_file(words_with_hypernyms, output_path, ruwordnet):
    """Write predictions as UTF-8 TSV rows: word, synset id, synset name.

    Args:
        words_with_hypernyms: mapping word -> iterable of synset ids.
        output_path: file to (over)write.
        ruwordnet: object providing ``get_name_by_id(synset_id)``.
    """
    with codecs.open(output_path, 'w', encoding='utf-8') as out:
        for word in words_with_hypernyms:
            for synset_id in words_with_hypernyms[word]:
                synset_name = ruwordnet.get_name_by_id(synset_id)
                out.write("{}\t{}\t{}\n".format(word, synset_id, synset_name))

In [46]:
def get_top_hypernyms(l, sz=10):
    """Return up to `sz` distinct ids from `(score, id)` pairs, lowest score first.

    The pairs are sorted ascending as tuples; for an id that occurs several times
    only its first (best-ranked) occurrence counts.
    """
    ranked_ids = (pair[1] for pair in sorted(l))
    # dict.fromkeys keeps first-seen order while dropping repeats.
    return list(dict.fromkeys(ranked_ids))[:sz]

In [47]:
from itertools import chain

In [48]:
# in df_train some items added to hypernyms multiple times
# Two parallel structures are filled per word:
#   features[word][synset_id] -> dict of indicator features (value 1) and "*_pos" position features
#   hypernyms[word]           -> flat list of (priority, synset_id) candidate pairs
features = {word: {} for word in chain(df_test['word'], df_train['word'])}
hypernyms = {word: [] for word in chain(df_test['word'], df_train['word'])}

# Priority values stored with each candidate in `hypernyms`.
# Naming used below: l1 = the matched synset itself, l2 = its hypernyms, l3 = hypernyms of those.
# NOTE(review): lower values presumably rank first when these pairs are sorted — confirm against the consumer.
syn_priority_l1 = 4.
syn_priority_l2 = 2.
syn_priority_l3 = 3.
syntail_priority_l1 = 7.
syntail_priority_l2 = 5.
syntail_priority_l3 = 6.
wikhyp_priority2_l1 = 0.
wikhyp_priority2_l2 = 1.
wikhyp_priority3_l1 = 5.
wikhyp_priority3_l2 = 6.

for df in [df_test, df_train]:
    # --- Wiktionary hypernyms already mapped to synset ids ---
    for word, hs in zip(df['word'], df['wikt_top1_hypernyms_docs']):
        for j, hypernym in enumerate(hs):
            features[word].setdefault(hypernym, {})['wikhyp_priority_l1'] = 1
            features[word].setdefault(hypernym, {})['wikhyp_priority_l1_pos'] = j
            for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                features[word].setdefault(hyphyp, {})['wikhyp_priority_l2'] = 1
                features[word].setdefault(hyphyp, {})['wikhyp_priority_l2_pos'] = j
        # The first two Wiktionary hypernyms get the "priority2" values ...
        for j, hypernym in enumerate(hs[:2]):
            hypernyms[word].append((wikhyp_priority2_l1 + j*1e-3, hypernym))
            for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                hypernyms[word].append((wikhyp_priority2_l2 + j*1e-3, hyphyp))
        # ... and the remaining ones the "priority3" values (j restarts at 0 for hs[2:]).
        for j, hypernym in enumerate(hs[2:]):
            hypernyms[word].append((wikhyp_priority3_l1 + j*1e-3, hypernym))
            for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                hypernyms[word].append((wikhyp_priority3_l2 + j*1e-3, hyphyp))

    # --- Hypernyms of the 2nd..10th nearest RuWordNet neighbours ("syntail") ---
    for i in range(2, 11):
        for word, hs in zip(df['word'], df['wn_top%d_hypernyms' % i]):
            for j, hypernym in enumerate(hs):
                features[word].setdefault(hypernym, {})['syn%d_priority_l2'%i] = 1
                features[word].setdefault(hypernym, {})['syn%d_priority_l2_pos'%i] = j
                hypernyms[word].append((syntail_priority_l2 + (i-2)*1e-3, hypernym))
                for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                    features[word].setdefault(hyphyp, {})['syn%d_priority_l3'%i] = 1
                    features[word].setdefault(hyphyp, {})['syn%d_priority_l3_pos'%i] = j
                    hypernyms[word].append((syntail_priority_l3 + (i-2)*1e-3, hyphyp))

    # --- Hypernyms of the nearest RuWordNet neighbour ---
    for word, hs in zip(df['word'], df['wn_top1_hypernyms']):
        for j, hypernym in enumerate(hs):
            hypernyms[word].append((syn_priority_l2, hypernym))
            features[word].setdefault(hypernym, {})['syn1_priority_l2'] = 1
            features[word].setdefault(hypernym, {})['syn1_priority_l2_pos'] = j
            for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                features[word].setdefault(hyphyp, {})['syn1_priority_l3'] = 1
                features[word].setdefault(hyphyp, {})['syn1_priority_l3_pos'] = j
                hypernyms[word].append((syn_priority_l3, hyphyp))

    # --- The neighbours' own synsets (level l1) ---
    for word, words in zip(df['word'], df['wn_top10']):
        for synset_id in nouns[words[0]]:
            hypernyms[word].append((syn_priority_l1, synset_id))
            features[word].setdefault(synset_id, {})['syn1_priority_l1'] = 1
            features[word].setdefault(synset_id, {})['syn1_priority_l1_pos'] = 0
        # i is 0-based over words[1:], hence the (i+2) in the feature names.
        for i, word2 in enumerate(words[1:]):
            for j, synset_id in enumerate(nouns[word2]):
                hypernyms[word].append((syntail_priority_l1 + i*1e-3, synset_id))
                features[word].setdefault(synset_id, {})['syn%d_priority_l1'%(i+2)] = 1
                features[word].setdefault(synset_id, {})['syn%d_priority_l1_pos'%(i+2)] = j

In [49]:
from nltk.corpus import wordnet as wn
# Access an attribute of the corpus reader; if that raises LookupError,
# download the 'wordnet' resource.
# NOTE(review): presumably the attribute access triggers NLTK's lazy corpus loading — confirm.
try:
    wn.all_synsets
except LookupError as e:
    import nltk
    nltk.download('wordnet')

In [50]:
def drop_trailing_dot(s):
    """Return `s` without its final character when that character is a dot."""
    return s[:-1] if s.endswith('.') else s

In [51]:
# Russian -> English dictionary built from two line-aligned files.
# Reading stops at the shorter file or after 100500 line pairs, whichever comes first.
ru2en = {}
with open('data/ru.txt', 'r', encoding='utf-8') as f_ru, open('data/en_ya.txt', 'r', encoding='utf-8') as f_en_y:
    for _line_no, ru_line, en_line in zip(range(100500), f_ru, f_en_y):
        ru_key = drop_trailing_dot(ru_line.strip())
        ru2en[ru_key] = drop_trailing_dot(en_line.strip())
len(ru2en)

4697

In [52]:
# English -> Russian dictionary built from two line-aligned files.
# Reading stops at the shorter file or after 100500 line pairs, whichever comes first.
en2ru = {}
with open('data/hyp_en.txt', 'r', encoding="utf-8") as f_en, open('data/hyp_ru_ya.txt', 'r', encoding="utf-8") as f_ru_y:
    for _line_no, en_line, ru_line in zip(range(100500), f_en, f_ru_y):
        en_key = drop_trailing_dot(en_line.strip())
        en2ru[en_key] = drop_trailing_dot(ru_line.strip())
len(en2ru)

8481

In [53]:
# Project English WordNet hypernyms back onto RuWordNet:
# word -> English (ru2en) -> WordNet synsets -> hypernym lemmas -> Russian (en2ru) -> synset ids (nouns).
missing = set()  # English hypernym lemmas that have no entry in en2ru

hypernyms_en = {}      # word -> set of (0.0, synset_id) pairs
hypernyms_en_txt = {}  # word -> set of lowercased Russian hypernym texts that were matched
for df in [df_test, df_train]:
    cnt = 0  # words of this frame with at least one projected hypernym
    for word in df["word"]:
        # Both entries are reset for every word, including a word already processed earlier.
        hypernyms_en[word] = set()
        hypernyms_en_txt[word] = set()
        lword = word.lower()
        if lword in ru2en:
            synsets = wn.synsets(ru2en[lword])
            if synsets:
                # NOTE(review): `flag` is assigned but not read in the visible cells.
                flag = False
                for sense in synsets:
                    for hyp in sense.hypernyms():
                        for name in hyp.lemma_names():
                            # Lemma names use "_" between words; the dictionary keys use spaces.
                            name = name.replace('_', ' ')
                            if name in en2ru:
                                if en2ru[name].lower() in nouns:
                                    flag = True
                                    hypernyms_en_txt[word].add(en2ru[name].lower())
                                    for id_ in nouns[en2ru[name].lower()]:
                                        # Candidates from this source get the fixed value 0.0.
                                        hypernyms[word].append((0.0, id_))
                                        hypernyms_en[word].add((0.0, id_))
                            else:
                                missing.add(name)
            if hypernyms_en[word]:
                cnt += 1
    # Share of words with a projected hypernym; the 1e-6 guards against an empty frame.
    print(cnt / (len(df["word"])+1e-6))

0.6476190463854875
0.0


In [54]:
for word in public_test[:2] + private_test[:2]:
    print(word, hypernyms_en_txt[word], hypernyms_en[word])

АБСОЛЮТИЗИРОВАТЬ set() set()
АКТИВИРОВАТЬ {'модифицировать', 'изменять', 'менять', 'инициировать'} {(0.0, '106631-V'), (0.0, '111281-V'), (0.0, '116304-V'), (0.0, '124237-V'), (0.0, '106682-V'), (0.0, '117315-V'), (0.0, '106844-V')}
АДСОРБИРОВАТЬ {'принять'} {(0.0, '124852-V'), (0.0, '115487-V'), (0.0, '106884-V'), (0.0, '146793-V'), (0.0, '145869-V'), (0.0, '134035-V'), (0.0, '141708-V')}
АКАТЬ set() set()


In [55]:
# Mark synsets found via English WordNet (l1) and their RuWordNet hypernyms (l2).
for word, scored_ids in hypernyms_en.items():
    word_features = features[word]
    for _score, synset_id in scored_ids:
        word_features.setdefault(synset_id, {})['wordnet_en_l1'] = 1
        for parent_id in ruwordnet.get_hypernyms_by_id(synset_id):
            word_features.setdefault(parent_id, {})['wordnet_en_l2'] = 1

In [56]:
# Cache: token -> normal form, shared by all normalize() calls.
norm_words = {}
def normalize(s):
    """Lowercase and tokenize `s`, then join the normal forms of its tokens with spaces.

    The normal form is taken from the first morphological analysis of a token;
    tokens for which the analyzer returns nothing are dropped.
    """
    lemmas = []
    for token in word_tokenize(s.lower()):
        lemma = norm_words.get(token)
        if lemma is None:
            analyses = morph.parse(token)
            if not analyses:
                continue
            lemma = norm_words[token] = analyses[0].normal_form
        lemmas.append(lemma)
    return " ".join(lemmas)

In [59]:
def innertext(tag):
    """Return the text of `tag` and all its descendants, plus the tail text after `tag`."""
    return (tag.text or '') + ''.join(innertext(e) for e in tag) + (tag.tail or '')

def get_serp_texts(xml_path, k=5):
    """Concatenate title, passages and headline of the first `k` result groups of a SERP XML file.

    Args:
        xml_path: path to the XML file; a missing file yields an empty string.
        k: maximum number of ``<group>`` elements to read.

    Returns:
        The collected text pieces joined by single spaces; an empty string when
        the file does not exist or holds no ``response/results/grouping`` node.
    """
    res = []
    if os.path.exists(xml_path):
        # Walk root -> response -> results -> grouping, tolerating a missing node.
        grouping = ET.parse(xml_path).getroot()
        for child_name in ('response', 'results', 'grouping'):
            grouping = grouping.find(child_name)
            if grouping is None:
                break
        groups = grouping.findall('group')[:k] if grouping is not None else []
        for group in groups:
            doc = group.find('doc')
            if doc is None:
                continue
            title = doc.find('title')
            if title is not None:
                res.append(innertext(title))
            # Compare with None explicitly: an Element with no children is falsy,
            # so a truthiness test skips a <headline> that holds only text.
            passages = doc.find('passages')
            if passages is not None:
                for passage in passages:
                    res.append(innertext(passage))
            headline = doc.find('headline')
            if headline is not None:
                res.append(innertext(headline))
    return " ".join(res)

In [60]:
# Text-match features: for every candidate synset of a word, check whether any of the
# synset's texts occurs in the Google SERP text, the Yandex SERP text or the word's
# Wiktionary meaning lines, both raw and after normalize().
synset_norm_serp_ya_cnt = Counter()  # synset id -> number of normalized Yandex SERP matches
synset_norm_serp_g_cnt = Counter()   # synset id -> number of normalized Google SERP matches
hypernyms_wserp = {}                 # word -> list of (adjusted score, synset_id)
serp_priority = -4.
# NOTE(review): serp_hyp_priority is not referenced anywhere in this cell.
serp_hyp_priority = -4.
meaning_priority = -1.

for df in [
    df_test,
    df_train
]:
    for word in tqdm(df["word"]):
        # Google results: the second tab-separated column of every line is concatenated.
        word_file_path = 'data/google_it_all/' + word.lower() + '.tsv'
        total_g_serp = ""
        if os.path.exists(word_file_path):
            with open(word_file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    text = line.split('\t')[1]
                    total_g_serp += text + " "
        norm_total_g_serp = normalize(total_g_serp)
        # Yandex results: XML file named after the upper-cased word, first 10 groups.
        word_file_path = 'data/yandex_it_all/' + word.upper() + '.xml'
        total_ya_serp = get_serp_texts(word_file_path, k=10)
        norm_total_ya_serp = normalize(total_ya_serp)

        # Meaning lines of the word's own Wiktionary article, if it has one.
        total_meaning = ""
        if word.lower() in ltitle2doc:
            for meaning in parse_wiktionary(ltitle2doc[word.lower()]['text'])['meaning']:
                total_meaning += meaning + " "
        norm_total_meaning = normalize(total_meaning)

        res = []
        for score, hypernym in hypernyms[word]:
            # Every sense text of the synset plus the synset name is tried.
            hypernym_texts = synset2words[hypernym] + [ruwordnet.get_name_by_id(hypernym)]
            for hypernym_text in hypernym_texts:
                norm_hypernym_text = normalize(hypernym_text)
                # All checks below are plain substring tests.
                # NOTE(review): an empty normalized text would match every SERP — confirm this cannot occur.
                if norm_hypernym_text in norm_total_g_serp:
                    # Applied once per matching text, so several matches accumulate.
                    score += serp_priority
                    features[word][hypernym]['serp_g_norm'] = 1
                    for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                        # The l2 flag is only set for hypernyms that are already candidates.
                        if hyphyp in features[word]:
                            features[word][hyphyp]['serp_g_norm_l2'] = 1
                    synset_norm_serp_g_cnt[hypernym] += 1
                if hypernym_text in total_g_serp:
                    features[word][hypernym]['serp_g'] = 1

                # Yandex matches only set features; they do not change `score`.
                if norm_hypernym_text in norm_total_ya_serp:
                    features[word][hypernym]['serp_ya_norm'] = 1
                    synset_norm_serp_ya_cnt[hypernym] += 1
                    for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                        if hyphyp in features[word]:
                            features[word][hyphyp]['serp_ya_norm_l2'] = 1
                if hypernym_text in total_ya_serp:
                    features[word][hypernym]['serp_ya'] = 1

                if norm_hypernym_text in norm_total_meaning:
                    score += meaning_priority
                    features[word][hypernym]['meaning_norm'] = 1
                if hypernym_text in total_meaning:
                    features[word][hypernym]['meaning'] = 1
            res.append((score, hypernym))
        hypernyms_wserp[word] = res

100%|████████████████████████████████████████████████████████████████████████████████| 525/525 [00:30<00:00, 17.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 525/525 [00:52<00:00, 10.05it/s]


In [73]:
# feature_names = set()
# for word in features:
#     for synset_id in features[word]:
#         for key in features[word][synset_id]:
#             feature_names.add(key)
# feature_names = sorted(feature_names)
# len(feature_names)
feature_names = [
    "meaning", 
    "meaning_norm", 
#     "serp_g", 
#     "serp_g_norm", 
#     "serp_g_norm_l2", 
#     "serp_ya", 
#     "serp_ya_norm", 
#     "serp_ya_norm_l2", 
    "syn1_priority_l1", 
    "syn1_priority_l2", 
    "syn1_priority_l3", 
    "syn2_priority_l1", 
    "syn2_priority_l2", 
    "syn2_priority_l3", 
    "syn3_priority_l1", 
    "syn3_priority_l2", 
    "syn3_priority_l3", 
    "syn4_priority_l1", 
    "syn4_priority_l2", 
    "syn4_priority_l3", 
    "syn5_priority_l1", 
    "syn5_priority_l2", 
    "syn5_priority_l3", 
    "syn6_priority_l1", 
    "syn6_priority_l2", 
    "syn6_priority_l3", 
    "syn7_priority_l1", 
    "syn7_priority_l2", 
    "syn7_priority_l3", 
    "wikhyp_priority_l1", 
    "wikhyp_priority_l2", 
    "wordnet_en_l1", 
    "wordnet_en_l2", 
]

In [74]:
total = sum([len(features[x]) for x in df_train["word"]])
X = np.zeros( (total, len(feature_names)) )
y = np.zeros( total )
X.shape, y.shape

((20640, 27), (20640,))

In [75]:
# Fill X / y row by row: one row per (training word, candidate synset).
# The label is 1 when the candidate is a RuWordNet hypernym of one of the word's synsets.
pos = 0
for word in df_train["word"]:
    true_hypernyms = {
        hyp
        for own_synset in nouns[word.lower()]
        for hyp in ruwordnet.get_hypernyms_by_id(own_synset)
    }
    for synset_id, synset_features in features[word].items():
        y[pos] = int(synset_id in true_hypernyms)
        X[pos] = [synset_features.get(fn, 0) for fn in feature_names]
        pos += 1
pos

20640

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [77]:
# Fixed seed: without it the hold-out split, and with it the AUC figures of the
# C sweep, change on every kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train.shape, y_test.shape

((16512,), (4128,))

In [78]:
# Sweep the regularisation parameter C and report hold-out ROC AUC for each value.
for C in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3, 10, 30, 100, 300, 1000]:
    model = LogisticRegression(C=C).fit(X_train, y_train)
    holdout_scores = model.predict_proba(X_test)[:, 1]
    print(C, roc_auc_score(y_test, holdout_scores))

0.001 0.9487468445052916
0.003 0.9508602226071673
0.01 0.952534185460138
0.03 0.9496675240744257
0.1 0.949092099343717
0.3 0.9485525452455719
1.0 0.9476796931865228
3 0.9474360718070278
10 0.9474196311004363
30 0.9474136526616758
100 0.9474002011744642
300 0.9473987065647742
1000 0.9474002011744643


In [79]:
# Refit on all rows (train + hold-out) with the chosen regularisation strength.
# NOTE(review): the recorded sweep output peaks at C=0.01, while 0.03 is used here — confirm the choice is intentional.
model = LogisticRegression(C=0.03)
model.fit(X, y)

LogisticRegression(C=0.03, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [80]:
weights = {k:v for k,v in zip(feature_names, model.coef_[0])}

In [81]:
# Small fixed penalty per position unit for every syn<rank>_priority_l<level>_pos feature.
pos_weights = {
    'syn%d_priority_l%d_pos' % (rank, level): -0.0001
    for rank in range(1, 11)
    for level in range(1, 4)
}

def calc_score(d):
    """Return the weighted sum of the features in `d`.

    Every feature listed in the notebook-level `weights` and then in `pos_weights`
    contributes ``weight * d.get(feature, 0)``; features of `d` that appear in
    neither table are ignored.
    """
    score = 0.
    # Accumulate in a single running total, `weights` first, to keep the summation order fixed.
    for table in (weights, pos_weights):
        for feature in table:
            score += table[feature] * d.get(feature, 0)
    return score

In [82]:
# Rank the candidates of every test word with the learned weights and write both submission files.
prefix = 'subm/subm106_no_gya'
for split_name, split_words in (('public', public_test), ('private', private_test)):
    ranked = {}
    for k in split_words:
        # Negate the score: get_top_hypernyms sorts ascending.
        scored = [(-calc_score(features[k][x]), x) for x in features[k]]
        ranked[k] = get_top_hypernyms(scored)
    save_to_file(ranked, prefix + '_' + split_name + '_verbs.tsv', ruwordnet)

In [83]:
weights = {
    "meaning": 0.15,
    "meaning_norm": 0.15,
    "serp_g": 0.20,
    "serp_g_norm": 0.50,
    "serp_g_norm_l2": 0.20,
    "serp_ya": 0.15,
    "serp_ya_norm": 0.20,
    "serp_ya_norm_l2": 0.20,
    
    "syn1_priority_l1": 0.05,
    "syn1_priority_l2": 0.50,
    "syn1_priority_l3": 0.30,
    
    "syn2_priority_l1": 0.05,
    "syn2_priority_l2": 0.25,
    "syn2_priority_l3": 0.15,
    
    "syn3_priority_l1": 0.05,
    "syn3_priority_l2": 0.25,
    "syn3_priority_l3": 0.15,
    
    "syn4_priority_l1": 0.00,
    "syn4_priority_l2": 0.10,
    "syn4_priority_l3": 0.15,
    
    "syn5_priority_l1": 0.00,
    "syn5_priority_l2": 0.10,
    "syn5_priority_l3": 0.00,
    
    "syn6_priority_l1": 0.00,
    "syn6_priority_l2": 0.10,
    "syn6_priority_l3": 0.00,
    
    "syn7_priority_l1": 0.00,
    "syn7_priority_l2": 0.10,
    "syn7_priority_l3": 0.00,
    
    "wikhyp_priority_l1": 0.20,
    "wikhyp_priority_l2": 0.50,
    
    "wordnet_en_l1": 0.05,
    "wordnet_en_l2": 0.20,
}

In [84]:
# Rank the candidates of every test word with the hand-set weights and write both submission files.
prefix = 'subm/subm107_no_gya'
for split_name, split_words in (('public', public_test), ('private', private_test)):
    ranked = {}
    for k in split_words:
        # Negate the score: get_top_hypernyms sorts ascending.
        scored = [(-calc_score(features[k][x]), x) for x in features[k]]
        ranked[k] = get_top_hypernyms(scored)
    save_to_file(ranked, prefix + '_' + split_name + '_verbs.tsv', ruwordnet)