In [1]:
import os
import sys
import codecs

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.corpora import WikiCorpus
from tqdm import tqdm

In [2]:
%%time

# Convert the compressed Russian Wikipedia XML dump into a plain-text corpus:
# one article per line, tokens separated by single spaces.
f = 'ruwiki-latest-pages-articles.xml.bz2'
# NOTE: write to 'wiki.ru__preview.text' instead when doing a quick trial run.
with codecs.open('wiki.ru.text', 'w',
                 'utf-8', errors='ignore') as fout:
    # dictionary={} is passed so WikiCorpus does not build its own vocabulary
    # (NOTE(review): per gensim docs this skips an extra pass over the dump -- confirm
    # for the installed version); lemmatization is done in a later cell.
    wiki = WikiCorpus(f, lemmatize=False,
                      dictionary={}, processes=2)
    print('starting...')
    # get_texts() yields one token list per article; tqdm reports progress.
    for text in tqdm(wiki.get_texts()):
        fout.write(' '.join(text) + '\n')

0it [00:00, ?it/s]

starting...


1300455it [2:36:01, 138.91it/s]

CPU times: user 1h 32min 17s, sys: 9min 30s, total: 1h 41min 47s
Wall time: 5h 11min 43s





In [3]:
%%time
import pymorphy2

# Lemmatize the extracted corpus: every space-separated token of every line is
# replaced by the normal form of its first pymorphy2 parse. Each lemma is
# followed by a single space and each input line ends with a newline.
morph = pymorphy2.MorphAnalyzer()
with codecs.open('wiki.ru.text', 'r', 'utf-8') as fin, \
        codecs.open('wiki_data.txt', 'w', 'utf-8') as fout:
    for i, line in tqdm(enumerate(fin)):  # zip(range(100), fin):
        tokens = line.strip().split(' ')
        lemmas = (morph.parse(token)[0].normal_form for token in tokens)
        fout.write(u''.join(u'{} '.format(lemma) for lemma in lemmas))
        fout.write('\n')

50000it [1:04:16, 20.54it/s]

CPU times: user 1h 13s, sys: 29.2 s, total: 1h 43s
Wall time: 1h 4min 16s





In [4]:
%%time
# Train the baseline CBOW model (no `sg` argument is passed, so gensim's
# default training algorithm is used) on the lemmatized corpus,
# streamed line by line from disk.
model_cbow_base = Word2Vec(
    sentences=LineSentence('wiki_data.txt'),
    workers=2,     # training threads
    min_count=5,   # ignore words seen fewer than 5 times
    window=5,      # context window radius
    size=400,      # embedding dimensionality
)

CPU times: user 1h 5min 28s, sys: 25.1 s, total: 1h 5min 53s
Wall time: 40min 43s


In [5]:
%%time
# Persist the trained CBOW model to 'wiki_w2v.model' so it can be reloaded
# with Word2Vec.load() instead of being retrained.
model_cbow_base.save('wiki_w2v.model')

CPU times: user 2.25 s, sys: 1.89 s, total: 4.14 s
Wall time: 16.6 s


In [28]:
%%time
# Train the baseline skip-gram model (sg=1) on the same lemmatized corpus,
# with the same hyperparameters as the CBOW model for a like-for-like comparison.
model_sg_base = Word2Vec(
    sentences=LineSentence('wiki_data.txt'),
    workers=2,     # training threads
    min_count=5,   # ignore words seen fewer than 5 times
    window=5,      # context window radius
    size=400,      # embedding dimensionality
    sg=1,          # 1 selects skip-gram
)

CPU times: user 4h 24min 21s, sys: 1min 1s, total: 4h 25min 22s
Wall time: 2h 52min 34s


In [29]:
%%time
# Persist the trained skip-gram model to 'wiki_w2v_sg.model' so it can be
# reloaded with Word2Vec.load() instead of being retrained.
model_sg_base.save('wiki_w2v_sg.model')

CPU times: user 2.26 s, sys: 1.72 s, total: 3.98 s
Wall time: 14.7 s


In [37]:
# Reload both trained models from disk (CBOW first, then skip-gram) so the
# analysis below does not require retraining.
model_cbow_base, model_sg_base = (
    Word2Vec.load(model_path)
    for model_path in ('wiki_w2v.model', 'wiki_w2v_sg.model')
)

In [39]:
# Compare the nearest neighbours of one probe word in both models.
# The lookup goes through `.wv` (the KeyedVectors object), consistent with the
# other vector accesses in this notebook; the model-level `most_similar`
# shortcut is deprecated in gensim and removed in 4.0.
word = 'комендантский'
model_cbow_base.wv.most_similar(word), model_sg_base.wv.most_similar(word)

  


([('пополудни', 0.6086992025375366),
  ('гкал', 0.5136786699295044),
  ('микрорентген', 0.49235546588897705),
  ('урочный', 0.4689948260784149),
  ('киловатт', 0.46074092388153076),
  ('круглосуточный', 0.4566711187362671),
  ('буйнакск', 0.45602744817733765),
  ('час', 0.44416868686676025),
  ('дудаевский', 0.4425424039363861),
  ('полночный', 0.44177040457725525)],
 [('мпво', 0.5969606041908264),
  ('постовой', 0.5895310640335083),
  ('дудаевский', 0.5749117732048035),
  ('омсдон', 0.5741949677467346),
  ('комиссариатский', 0.5680011510848999),
  ('ковтюха', 0.566831111907959),
  ('патрулироваться', 0.5637158155441284),
  ('погранотряд', 0.5627739429473877),
  ('южурво', 0.5623371005058289),
  ('баррикадный', 0.5620620250701904)])

In [41]:
# Map every vocabulary word to its raw (un-normalized) embedding, per model.
vecs_cbow = dict(
    (token, model_cbow_base.wv.word_vec(token, use_norm=False))
    for token in model_cbow_base.wv.vocab
)
vecs_sg = dict(
    (token, model_sg_base.wv.word_vec(token, use_norm=False))
    for token in model_sg_base.wv.vocab
)

In [42]:
import numpy as np

# Norm of each word vector (np.linalg.norm default), converted to a plain
# Python float so the dictionaries are JSON-serializable.
norms_cbow = dict(
    (token, float(np.linalg.norm(vector))) for token, vector in vecs_cbow.items()
)
norms_sg = dict(
    (token, float(np.linalg.norm(vector))) for token, vector in vecs_sg.items()
)

In [24]:
import json

# Save the CBOW word -> norm mapping as JSON for later reuse.
with open('norms_cbow_base.json', 'w') as f:
    f.write(json.dumps(norms_cbow))

In [43]:
import json

# Save the skip-gram word -> norm mapping as JSON for later reuse.
with open('norms_sg_base.json', 'w') as f:
    f.write(json.dumps(norms_sg))

In [44]:
# Order the (word, norm) pairs from largest to smallest norm, per model.
norms_cbow_sorted = list(norms_cbow.items())
norms_cbow_sorted.sort(key=lambda pair: pair[1], reverse=True)

norms_sg_sorted = list(norms_sg.items())
norms_sg_sorted.sort(key=lambda pair: pair[1], reverse=True)

In [45]:
# Display the first ten and last ten (word, norm) pairs of the CBOW list,
# which is sorted by norm in descending order (largest norms first).
norms_cbow_sorted[:10], norms_cbow_sorted[-10:]

([('jpg', 30.95418357849121),
  ('ред', 29.2774658203125),
  ('деятель', 28.778352737426758),
  ('ул', 28.526844024658203),
  ('битва', 28.202451705932617),
  ('война', 28.119966506958008),
  ('династия', 28.02814483642578),
  ('престол', 27.755056381225586),
  ('училище', 27.752113342285156),
  ('зависеть', 27.596450805664062)],
 [('siebenberg', 0.2713257372379303),
  ('siebenbergjr', 0.2475903034210205),
  ('gsamoylov', 0.23046043515205383),
  ('leontiev', 0.2270047813653946),
  ('lguitar', 0.22493717074394226),
  ('hardred', 0.2056535929441452),
  ('я_атака', 0.1741270273923874),
  ('shumov', 0.1715807318687439),
  ('shurov', 0.16904737055301666),
  ('vsamoylov', 0.15682563185691833)])

In [46]:
# Display the first ten and last ten (word, norm) pairs of the skip-gram list,
# which is sorted by norm in descending order (largest norms first).
norms_sg_sorted[:10], norms_sg_sorted[-10:]

([('bar', 9.805545806884766),
  ('till', 9.550966262817383),
  ('text', 9.081475257873535),
  ('align', 9.057826042175293),
  ('color', 8.8042573928833),
  ('fontsize', 8.701995849609375),
  ('width', 8.572052955627441),
  ('center', 8.357748985290527),
  ('bgcolor', 8.21220588684082),
  ('colspan', 8.180337905883789)],
 [('арио', 1.2444965839385986),
  ('кадавр', 1.234136939048767),
  ('аксио', 1.2317339181900024),
  ('наполео', 1.2276191711425781),
  ('артакс', 1.2191438674926758),
  ('пья', 1.1911792755126953),
  ('евкли', 1.1821703910827637),
  ('жанда', 1.1652240753173828),
  ('отч', 1.1402981281280518),
  ('нахо', 1.1079518795013428)])

/\

Судя по всему, в корпус пробрались какие-то теги разметки (см. `align`, `bgcolor`, `colspan` и т. п. в выводе выше).