In [1]:
%matplotlib inline
import gensim
import en_core_web_md
import spacy
from gensim.utils import tokenize
from nltk.tokenize import sent_tokenize
import itertools
from tqdm import tqdm
import logging
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)

In [3]:
nlp = en_core_web_md.load(disable=['parser', 'ner'])

In [4]:
nlp.add_pipe(nlp.create_pipe('sentencizer'))

41168

In [5]:
class LOTR:
    def __iter__(self):
        text = None
        with open('lotr.txt') as f:
            text = f.read()
        sents = sent_tokenize(text)
        #doc = spacy.tokens.Doc(nlp.vocab, tokens)
        #nlp.tagger(doc)
        for sent in sents:
            tokens = nlp(sent)
            new_tokens = ['<S>']
            for token in tokens:
                if token.lemma_ == ' ':
                    continue
                else:
                    new_tok = token.lemma_ + '_' + token.tag_
                    new_tokens.append(new_tok)
            new_tokens.append('</S>')
            yield new_tokens

lotr = list(tqdm(LOTR(),total=41168))

100%|██████████| 41168/41168 [02:50<00:00, 241.72it/s]


In [37]:
m4 = gensim.models.Word2Vec(lotr, size=100, window=5, min_count=1, iter=40)
m4.save('lotr.gs')

In [38]:
len(m4.wv.vocab)

20709

In [39]:
m4.wv['ring_NN']

array([ 1.9555393 , -0.13288398,  0.7162461 , -1.7443082 ,  0.4357134 ,
        2.1088314 ,  1.7970675 ,  0.9730945 , -0.2817831 , -1.2967294 ,
        1.0671456 ,  1.1517249 ,  0.256114  ,  1.8122553 , -3.0562263 ,
        0.75673985,  0.23464973,  1.7259942 ,  0.3920171 ,  3.9448795 ,
       -1.1873442 ,  0.33956924,  1.0164793 ,  0.17521821,  1.4563706 ,
        0.05073483,  1.3544884 ,  1.666326  ,  2.5911598 , -0.37733594,
       -0.12244103,  1.1340721 , -2.833015  , -2.2428408 ,  1.6615325 ,
        3.817434  ,  0.14090115, -0.6368559 , -1.139593  ,  0.42020988,
       -0.25808644,  3.5547032 , -0.56135553, -2.1688087 , -0.53243524,
        0.77664167,  0.29440054,  0.10226014, -0.5761085 , -0.5382233 ,
       -1.324501  , -0.01425888,  0.78402346,  1.2903308 , -1.0995082 ,
        1.305997  ,  0.50497484,  1.5924135 , -1.0156242 , -1.4779359 ,
       -2.8891919 , -0.82433075,  0.07610452, -0.79525024, -0.51849157,
       -0.18065079, -0.29567546,  1.8603171 , -1.2751254 , -1.30

In [40]:
m4.wv.most_similar(positive=['frodo_NNP', 'ring_NNP'])

[('bane_NNP', 0.5592973232269287),
 ('gollum_NNP', 0.5343833565711975),
 ('farmer_NN', 0.5263173580169678),
 ('master_NN', 0.5262237787246704),
 ('faramir_NNP', 0.5124294757843018),
 ('pippin_NNP', 0.4891310930252075),
 ('baggins_NNP', 0.4859563708305359),
 ('isildur_NNP', 0.4845581650733948),
 ('company_NNP', 0.48206382989883423),
 ('bilbo_NNP', 0.4737789034843445)]

In [41]:
m4.wv.most_similar(positive=['gandalf_NNP', 'ring_NNP'])

[('saruman_NNP', 0.6206921339035034),
 ('boromir_NNP', 0.5949528217315674),
 ('faramir_NNP', 0.5849675536155701),
 ('bane_NNP', 0.5618189573287964),
 ('company_NNP', 0.5549007654190063),
 ('isildur_NNP', 0.5471694469451904),
 ('map_NN', 0.540136992931366),
 ('beregond_NNP', 0.5380115509033203),
 ('wizard_NN', 0.5357128381729126),
 ('belladonna_NNP', 0.5289272665977478)]

In [42]:
m4.wv.most_similar(positive=['arathorn_NNP'], negative=['aragorn_NNP'])

[('willow_NNP', 0.48530858755111694),
 ('heir_NNP', 0.47873497009277344),
 ('compose_VBD', 0.47868460416793823),
 ('relative_NN', 0.47860491275787354),
 ('grimbeorn_NNP', 0.4658237099647522),
 ('jolly_RB', 0.4637915790081024),
 ('contents_NNPS', 0.459810733795166),
 ('departure_NNP', 0.45532581210136414),
 ('gram_NNP', 0.45282283425331116),
 ('\n \n\n __SP', 0.44886893033981323)]

In [43]:
m4.wv.most_similar(positive=['king_NN', 'aragorn_NNP'], negative=['queen_NN'])

[('jomer_NNP', 0.6487311720848083),
 ('faramir_NNP', 0.616490364074707),
 ('gandalf_NNP', 0.5973088145256042),
 ('thjoden_NNP', 0.5944362878799438),
 ('beregond_NNP', 0.5769267082214355),
 ('gimli_NNP', 0.566455602645874),
 ('imrahil_NNP', 0.5574600100517273),
 ('elrond_NNP', 0.5420892834663391),
 ('glorfindel_NNP', 0.5154005289077759),
 ('denethor_NNP', 0.5064072608947754)]

In [135]:
tsne = TSNE()
X = tsne.fit_transform(m4[m4.wv.vocab])

  


In [141]:
df = pd.DataFrame(X, index=m4.wv.vocab, columns=['x', 'y'])

In [None]:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

ax.scatter(df['x'], df['y'])
for word, pos in df.iterrows():
    ax.annotate(word, pos)
ax.

2018-04-28 16:35:01,461 : ERROR : Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/Users/serhiinechyporhuk/.local/share/virtualenvs/SerhiiNechyporchuk-Ib9yWLjX/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/Users/serhiinechyporhuk/.local/share/virtualenvs/SerhiiNechyporchuk-Ib9yWLjX/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/serhiinechyporhuk/.local/share/virtualenvs/SerhiiNechyporchuk-Ib9yWLjX/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/serhiinechyporhuk/.local/share/virtualenvs/SerhiiNechyporchuk-Ib9yWLjX/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/serhiinechyporhuk/.local/share/virtualenvs/SerhiiNechyporchuk-Ib9yWL