In [1]:
import numpy as np
import pandas as pd
import re
import json

In [2]:
# Input files; both are read with json.load further down despite the .txt suffix.
# NOTE(review): absolute, machine-specific location - adjust for your checkout.
wiktionary_data_dir = '/Users/maxim_rachinskiy/Develop/SE_Java/WiktionaryParser/data'
words_filename = f'{wiktionary_data_dir}/words.txt'
samples_filename = f'{wiktionary_data_dir}/samples.txt'

# Prefix put in front of every word / sense id and used in the output directory name.
id_prefix = 'wikt_ru'
# Glosses shorter than this many characters (after cleaning) are discarded.
min_gloss_len = 4
# Fraction of the tokenized samples held out as the dev set.
dev_samples_part = 0.1
# Seed for the train/dev split.
random_seed = 42

In [3]:
# All four output files live in one directory named after the id prefix.
output_dir = f'data/Wiki/{id_prefix}'
output_train_samples_filename = f'{output_dir}/train_samples.txt'
output_dev_samples_filename = f'{output_dir}/dev_samples.txt'
output_words_filename = f'{output_dir}/words.txt'
output_glosses_filename = f'{output_dir}/glosses.txt'

In [4]:
# Read both inputs explicitly as UTF-8: the data is Cyrillic text, and relying on
# the locale's default encoding breaks on platforms where that default is not
# UTF-8. The output files of this notebook are written as UTF-8 as well.
with open(words_filename, encoding='utf-8') as f:
    # Mapping word id -> list of gloss strings (iterated via .items() below).
    words = json.load(f)

with open(samples_filename, encoding='utf-8') as f:
    # List of dicts with 'quotation', 'senseLabel' and 'targetWordId' keys.
    samples = json.load(f)

len(words), len(samples)

(220385, 161809)

In [5]:
def clean_text(text):
    """Normalise a raw gloss or quotation string.

    The substitutions below are applied one after another, in the listed
    order: non-breaking spaces (both the character and the HTML entity)
    become plain spaces, HTML line breaks become newlines, and the
    ``{{-}}``, ``{{L}}`` and ``{{l}}`` templates become '-', '...' and '..'.
    Afterwards leading semicolons are dropped, and then spaces and commas
    are trimmed from both ends.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (u'\xa0', ' '),
        ('&nbsp;', ' '),
        ('<br/>', '\n'),
        ('<br />', '\n'),
        ('{{-}}', '-'),
        ('{{L}}', '...'),
        ('{{l}}', '..'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Order matters: only semicolons at the very start are removed, so a
    # string starting with ' ;' keeps its semicolon.
    return text.lstrip(';').strip(' ,')

In [6]:
# Sanity check: the leading ';' and space are stripped and {{L}} becomes '...'.
clean_text("; совершенно, абсолютно, {{L}} нисколько не уважать кого-либо; не счи")

'совершенно, абсолютно, ... нисколько не уважать кого-либо; не счи'

In [7]:
def get_word_id(word_id):
    """Return ``word_id`` prefixed with the module-level ``id_prefix`` and '_'.

    Args:
        word_id: word identifier string, e.g. 'эбонитовый+ADJECTIVE'.

    Returns:
        The string ``<id_prefix>_<word_id>``.
    """
    return '_'.join((id_prefix, word_id))

def get_sense_id(word_id, gloss_ind):
    """Return the id of one sense of a word.

    Args:
        word_id: word identifier string (without the prefix).
        gloss_ind: index of the gloss within the word's gloss list.

    Returns:
        The prefixed word id followed by '::' and ``gloss_ind``.
    """
    prefixed_word_id = get_word_id(word_id)
    return f'{prefixed_word_id}::{gloss_ind}'

In [8]:
# Lookup tables built from `words`:
#   word_to_ids     raw word id       -> set of sense ids
#   word_id_to_ids  prefixed word id  -> the same set of sense ids
#   id_to_gloss     sense id          -> gloss text
#   id_to_word      sense id          -> raw word id
word_to_ids, word_id_to_ids = {}, {}
id_to_gloss, id_to_word = {}, {}

for word, glosses in words.items():
    sense_ids = set()
    for gloss_ind, gloss_text in enumerate(glosses):
        sense_id = get_sense_id(word, gloss_ind)
        sense_ids.add(sense_id)
        id_to_gloss[sense_id] = gloss_text
        id_to_word[sense_id] = word
    # Both tables deliberately reference the SAME set object: the gloss
    # filtering step removes ids through word_to_ids and expects
    # word_id_to_ids to reflect that removal.
    word_to_ids[word] = sense_ids
    word_id_to_ids[get_word_id(word)] = sense_ids

In [9]:
# Give every sample the full sense id built from its word id and sense label
# (the sample dicts are updated in place).
for sample in samples:
    target_word_id, sense_label = sample['targetWordId'], sample['senseLabel']
    sample['senseId'] = get_sense_id(target_word_id, sense_label)

samples[0]

{'quotation': 'Для этого опыта нам потребовался {{выдел|эбонитовый}} стержень.',
 'senseLabel': 0,
 'targetWordId': 'эбонитовый+ADJECTIVE',
 'senseId': 'wikt_ru_эбонитовый+ADJECTIVE::0'}

In [10]:
from nltk.tokenize import word_tokenize

In [11]:
def tokenize_sample(sample_text):
    """Split a quotation into (token, is_target) pairs.

    Target words are looked up in ``{{выдел|...}}`` markup first; only if the
    text contains none of those is the ``'''...'''`` markup tried. The text
    between targets is tokenized with ``word_tokenize``; each target is kept
    as a single, untokenized token.

    Args:
        sample_text: quotation string with the target word(s) marked up.

    Returns:
        A list of ``(token, is_target)`` tuples in text order, or ``None``
        when neither markup occurs in ``sample_text``.
    """
    markup_patterns = (r"{{выдел\|([^}]+)}}", r"'''([^']+)'''")

    for pattern in markup_patterns:
        # With one capture group re.split returns
        # [context, target, context, target, ..., context].
        pieces = re.split(pattern, sample_text)
        if len(pieces) > 1:
            break
    else:
        # No pattern matched: nothing is marked as a target.
        return None

    result_tokens = []
    for position, piece in enumerate(pieces):
        if position % 2:
            # Odd positions hold the captured target words.
            result_tokens.append((piece, True))
        else:
            result_tokens.extend((token, False) for token in word_tokenize(piece))

    return result_tokens

In [12]:
# Look at one raw sample; its target word is wrapped in {{выдел|...}} markup.
samples[6676]

{'quotation': 'На крутом склоне горы в конце улицы с незапамятных времён высился огромный {{выдел|круглый}} камень.',
 'senseLabel': 0,
 'targetWordId': 'круглый+ADJECTIVE',
 'senseId': 'wikt_ru_круглый+ADJECTIVE::0'}

In [13]:
# Tokenize that quotation: only the marked-up target word is flagged True.
tokenize_sample(samples[6676]['quotation'])

[('На', False),
 ('крутом', False),
 ('склоне', False),
 ('горы', False),
 ('в', False),
 ('конце', False),
 ('улицы', False),
 ('с', False),
 ('незапамятных', False),
 ('времён', False),
 ('высился', False),
 ('огромный', False),
 ('круглый', True),
 ('камень', False),
 ('.', False)]

In [14]:
# Check the fallback path: target marked with '''...''' instead of {{выдел|...}}.
tokenize_sample("долгое время были только монастыри со своими '''скрипториями''', где иногда даже")

[('долгое', False),
 ('время', False),
 ('были', False),
 ('только', False),
 ('монастыри', False),
 ('со', False),
 ('своими', False),
 ('скрипториями', True),
 (',', False),
 ('где', False),
 ('иногда', False),
 ('даже', False)]

In [15]:
# Clean every gloss and collect the sense ids whose cleaned gloss is too short
# to be useful (shorter than min_gloss_len characters).
broken_glosses = []

for sense_id in id_to_gloss:
    id_to_gloss[sense_id] = clean_text(id_to_gloss[sense_id])
    if len(id_to_gloss[sense_id]) < min_gloss_len:
        broken_glosses.append(sense_id)

# Drop the broken senses. word_to_ids and word_id_to_ids were filled with the
# same set objects, so removing an id through word_to_ids removes it from
# word_id_to_ids as well.
for sense_id in broken_glosses:
    id_to_gloss.pop(sense_id)
    word_to_ids[id_to_word[sense_id]].remove(sense_id)


def _gloss_index(sense_id):
    """Return the integer gloss index that follows the last '::' in a sense id."""
    return int(sense_id.rsplit('::', 1)[1])


# Convert the sets to lists so they can be dumped as JSON. Iteration order of a
# set of strings changes between interpreter runs (hash randomization), so the
# ids are sorted by gloss index to make the exported file reproducible.
for word_id in word_id_to_ids:
    word_id_to_ids[word_id] = sorted(word_id_to_ids[word_id], key=_gloss_index)

len(broken_glosses)

3420

In [16]:
# Discard samples that point at a sense whose gloss was removed as broken.
broken_glosses = set(broken_glosses)  # set gives constant-time membership tests
cleaned_samples = [
    sample for sample in samples
    if sample['senseId'] not in broken_glosses
]

# Number of samples dropped by the filter.
print(len(samples) - len(cleaned_samples))

samples = cleaned_samples

1476


In [17]:
from tqdm.auto import tqdm

In [18]:
# Tokenize every remaining sample. A sample is kept only if a target word was
# found in its quotation and its word still has at least two senses.
tokenized_samples = []

for sample in tqdm(samples):
    word = sample['targetWordId']
    # Word ids look like '<lemma>+<POS>'. Split on the LAST '+' so that a lemma
    # which itself contains '+' does not break the two-value unpacking.
    target_lemma, pos = word.rsplit('+', 1)
    quotation = clean_text(sample['quotation'])
    tokens = tokenize_sample(quotation)
    if tokens is not None and len(word_to_ids[word]) >= 2:
        tokenized_samples.append({
            'lemma': target_lemma, 'pos': pos,
            'sense_id': sample['senseId'], 'tokens': tokens
        })

len(tokenized_samples)

  0%|          | 0/160333 [00:00<?, ?it/s]

68119

In [19]:
# Inspect one tokenized sample (lemma, POS, sense id and flagged tokens).
tokenized_samples[1]

{'lemma': 'эбонитовый',
 'pos': 'ADJECTIVE',
 'sense_id': 'wikt_ru_эбонитовый+ADJECTIVE::1',
 'tokens': [('У', False),
  ('неё', False),
  ('были', False),
  ('две', False),
  ('короткохвостые', False),
  ('шиншиллы', False),
  (':', False),
  ('одна', False),
  ('эбонитовая', True),
  (',', False),
  ('а', False),
  ('другая', False),
  ('—', False),
  ('бежевая', False),
  ('.', False)]}

In [20]:
# Sense ids kept for this word after the gloss filtering.
word_id_to_ids['wikt_ru_эбонитовый+ADJECTIVE']

['wikt_ru_эбонитовый+ADJECTIVE::0', 'wikt_ru_эбонитовый+ADJECTIVE::1']

In [21]:
# Cleaned gloss text of the sense with index 1 of that word.
id_to_gloss['wikt_ru_эбонитовый+ADJECTIVE::1']

'очень чёрный'

In [22]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out dev_samples_part of the tokenized samples as the dev set; the fixed
# random_state keeps the split reproducible.
# TODO: rewrite without sklearn
train_samples, dev_samples = train_test_split(
    tokenized_samples,
    test_size=dev_samples_part,
    random_state=random_seed,
)

len(train_samples), len(dev_samples)

(61307, 6812)

In [25]:
import os

# (destination file, object to serialise) pairs. All files hold JSON despite
# the .txt suffix.
outputs = (
    (output_train_samples_filename, train_samples),
    (output_dev_samples_filename, dev_samples),
    (output_words_filename, word_id_to_ids),
    (output_glosses_filename, id_to_gloss),
)

for output_filename, payload in outputs:
    # Create the destination directory first so that a fresh checkout does not
    # fail with FileNotFoundError at the very end of the pipeline.
    parent_dir = os.path.dirname(output_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # ensure_ascii=False keeps the Cyrillic text readable in the output files.
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4, ensure_ascii=False)