## Word2Vec

We will use this notebook to display examples of word2vec and doc2vec.

In [1]:
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils
import gensim
import multiprocessing
import os
from time import time
cores = multiprocessing.cpu_count()

In [2]:
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
 ['this', 'is', 'the', 'second', 'sentence'],
 ['yet', 'another', 'sentence'],
 ['one', 'more', 'sentence'],
 ['and', 'the', 'final', 'sentence']]

In [3]:
# train model
model = Word2Vec(sentences, min_count=1)

In [4]:
# summarize the loaded model
print(model)

Word2Vec(vocab=14, vector_size=100, alpha=0.025)


In [5]:
# access vector for one word
model.wv['sentence']

array([-5.3622725e-04,  2.3643016e-04,  5.1033497e-03,  9.0092728e-03,
       -9.3029495e-03, -7.1168090e-03,  6.4588715e-03,  8.9729885e-03,
       -5.0154282e-03, -3.7633730e-03,  7.3805046e-03, -1.5334726e-03,
       -4.5366143e-03,  6.5540504e-03, -4.8601604e-03, -1.8160177e-03,
        2.8765798e-03,  9.9187379e-04, -8.2852151e-03, -9.4488189e-03,
        7.3117660e-03,  5.0702621e-03,  6.7576934e-03,  7.6286553e-04,
        6.3508893e-03, -3.4053659e-03, -9.4640255e-04,  5.7685734e-03,
       -7.5216386e-03, -3.9361049e-03, -7.5115822e-03, -9.3004224e-04,
        9.5381187e-03, -7.3191668e-03, -2.3337698e-03, -1.9377422e-03,
        8.0774352e-03, -5.9308959e-03,  4.5161247e-05, -4.7537349e-03,
       -9.6035507e-03,  5.0072931e-03, -8.7595871e-03, -4.3918253e-03,
       -3.5099984e-05, -2.9618264e-04, -7.6612402e-03,  9.6147414e-03,
        4.9820566e-03,  9.2331432e-03, -8.1579182e-03,  4.4957972e-03,
       -4.1370774e-03,  8.2453492e-04,  8.4986184e-03, -4.4621779e-03,
      

In [6]:
# save model
model.save('model.bin')

In [7]:
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec<vocab=14, vector_size=100, alpha=0.025>


In [8]:
del model

In [9]:
class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        for line in open(corpus_path):
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

In [10]:
sentences = MyCorpus()

In [11]:
corpus_path = datapath('lee_background.cor')
for line in open(corpus_path):
    # assume there's one document per line, tokens separated by whitespace
    print(utils.simple_preprocess(line))
    break

['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', 'caused', 'the', 'fire'

In [12]:
model = Word2Vec(sentences, vector_size=200, min_count=2, workers=cores-1)

In [13]:
print(model)

Word2Vec<vocab=3955, vector_size=200, alpha=0.025>


In [14]:
model.wv.most_similar(positive=['wales'], topn=5)

[('new', 0.99937903881073),
 ('south', 0.9993640780448914),
 ('africa', 0.9992729425430298),
 ('australia', 0.9992663264274597),
 ('test', 0.9992376565933228)]

In [17]:
model.wv['new']

array([-0.14338367,  0.03998487, -0.20943362,  0.5973699 ,  0.30079585,
       -0.23764928,  0.08761957,  1.1943921 , -0.31954172,  0.18249029,
       -0.02307011, -0.44148406,  0.47445065,  0.3503205 , -0.09545527,
       -0.3976932 , -0.10317948,  0.28344774, -0.02775254, -1.0466515 ,
        0.59434086, -0.36467063, -0.23369451, -0.13927028,  0.53105485,
       -0.5877652 , -0.04237099, -0.48288912, -0.85628843, -0.46268696,
        0.39186865,  0.31432793,  0.26629812, -0.19758067, -0.17839251,
        0.50475913,  0.5608563 , -0.07763929, -0.13094828, -0.7425225 ,
       -0.5374132 , -0.09287325, -0.0956854 ,  0.23006085,  0.5630922 ,
        0.07078621, -0.43105054, -0.0108513 ,  0.22206661,  0.2765979 ,
       -0.16630223, -0.3688862 , -0.49419984, -0.32656154,  0.29817665,
       -0.54484844,  0.00660767, -0.74903667, -0.64437777, -0.06086994,
       -0.08329268, -0.13262145,  0.33610827, -0.16957305, -0.7150176 ,
        0.36537975, -0.16831899,  0.9082919 , -0.9343931 ,  0.65

In [18]:
del model

In [19]:
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
lemmatizer = WordNetLemmatizer()

def process_text(doc):
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', doc)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    output = [word.lower() for word in lem]
    return output

In [20]:
ids = reuters.fileids()
corpus = [reuters.raw(i) for i in ids]

In [21]:
t = time()
# word tokenize all sentences
sentences = [process_text(story) for story in corpus]
print('Time to preprocess: {} mins'.format(round((time() - t) / 60, 2)))

Time to preprocess: 0.18 mins


In [22]:
model = Word2Vec(min_count=5,
                     window=5,
                     vector_size=300,  
                     workers=cores-1)

In [23]:
t = time()

model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.01 mins


In [24]:
t = time()

model.train(sentences, total_examples=model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.31 mins


In [25]:
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('siddeley', 0.9943416714668274),
 ('andover', 0.9761882424354553),
 ('ltsterling', 0.9651047587394714),
 ('ltms', 0.9506111741065979),
 ('ltpwj', 0.9304921627044678),
 ('ltrev', 0.9234672784805298),
 ('ps', 0.9175193309783936),
 ('caledonian', 0.9125973582267761),
 ('wpp', 0.9030499458312988),
 ('dark', 0.8927291035652161)]

In [26]:
model.wv.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [27]:
model.wv.similarity('woman', 'man')

-0.05434244

In [28]:
model.wv.similarity('woman', 'cereal')

-0.029030088

In [29]:
model.wv.distance('man', 'woman')

1.0543424412608147

Probability of a text under the model:

In [30]:
del model

Bigrams

In [23]:
bigram_transformer = gensim.models.Phrases(sentences)
model_bi = word2vec.Word2Vec(bigram_transformer[sentences], size=100)



### Doc2Vec



In [31]:
import gensim
import os
import collections
import smart_open
import random

In [32]:
# Set file names for train and test data
test_data_dir = '{}'.format(os.sep).join([gensim.__path__[0], 'test', 'test_data'])
lee_train_file = test_data_dir + os.sep + 'lee_background.cor'
lee_test_file = test_data_dir + os.sep + 'lee.cor'

In [33]:
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [34]:
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [35]:
train_corpus[:2]

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [36]:
test_corpus[:2]

[['the',
  'national',
  'executive',
  'of',
  'the',
  'strife',
  'torn',
  'democrats',
  'last',
  'night',
  'appointed',
  'little',
  'known',
  'west',
  'australian',
  'senator',
  'brian',
  'greig',
  'as',
  'interim',
  'leader',
  'shock',
  'move',
  'likely',
  'to',
  'provoke',
  'further',
  'conflict',
  'between',
  'the',
  'party',
  'senators',
  'and',
  'its',
  'organisation',
  'in',
  'move',
  'to',
  'reassert',
  'control',
  'over',
  'the',
  'party',
  'seven',
  'senators',
  'the',
  'national',
  'executive',
  'last',
  'night',
  'rejected',
  'aden',
  'ridgeway',
  'bid',
  'to',
  'become',
  'interim',
  'leader',
  'in',
  'favour',
  'of',
  'senator',
  'greig',
  'supporter',
  'of',
  'deposed',
  'leader',
  'natasha',
  'stott',
  'despoja',
  'and',
  'an',
  'outspoken',
  'gay',
  'rights',
  'activist'],
 ['cash',
  'strapped',
  'financial',
  'services',
  'group',
  'amp',
  'has',
  'shelved',
  'million',
  'plan',
  'to',
 

In [37]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [38]:
model.build_vocab(train_corpus)

In [39]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [40]:
model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])

array([-0.04791664, -0.26631793, -0.20385428,  0.04493998,  0.07566991,
       -0.04488162,  0.08599745, -0.01018131, -0.11621599, -0.04643453,
        0.15109375, -0.0326075 ,  0.00045545, -0.05237348, -0.0528719 ,
       -0.20590305, -0.08528878,  0.12680979,  0.14977431, -0.08923152,
       -0.07557649, -0.08774199,  0.07382328, -0.02180372, -0.13142008,
       -0.11524298, -0.15427268, -0.11294831,  0.01444572, -0.1715813 ,
        0.26346606, -0.06099621,  0.17754175,  0.12617412,  0.18923643,
       -0.03582897,  0.01700234, -0.20325042, -0.20874994, -0.07782171,
       -0.06087074, -0.11536571,  0.0212279 , -0.02624841,  0.11088474,
        0.03155903, -0.06317703, -0.16629872,  0.09989914,  0.02870445],
      dtype=float32)

Assessing Model

In [41]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])

In [42]:
inferred_vector = model.infer_vector(train_corpus[0].words)
sims = model.dv.most_similar([inferred_vector])

In [43]:
sims

[(0, 0.9627946019172668),
 (48, 0.8895305395126343),
 (255, 0.8523130416870117),
 (40, 0.8485395908355713),
 (272, 0.8371813297271729),
 (33, 0.8371266722679138),
 (8, 0.8319430351257324),
 (264, 0.7350155115127563),
 (9, 0.7048150897026062),
 (105, 0.7012432217597961)]

In [44]:
' '.join(train_corpus[0].words)

'hundreds of people have been forced to vacate their homes in the southern highlands of new south wales as strong winds today pushed huge bushfire towards the town of hill top new blaze near goulburn south west of sydney has forced the closure of the hume highway at about pm aedt marked deterioration in the weather as storm cell moved east across the blue mountains forced authorities to make decision to evacuate people from homes in outlying streets at hill top in the new south wales southern highlands an estimated residents have left their homes for nearby mittagong the new south wales rural fire service says the weather conditions which caused the fire to burn in finger formation have now eased and about fire units in and around hill top are optimistic of defending all properties as more than blazes burn on new year eve in new south wales fire crews have been called to new fire at gunning south of goulburn while few details are available at this stage fire authorities says it has clo

In [45]:
collections.Counter(ranks) 

Counter({0: 292, 1: 8})

In [46]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [47]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the most/median/least similar documents from the train corpus
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (103): «the hih royal commission has heard evidence that there were doubts about the company ability to pay all of its creditors three months before its collapse partner for accountancy firm ernst and young john gibbons says he and his colleague kim smith attended meeting with hih on november mr gibbons has told the commission hih chairman ray williams and finance director dominic federa were at that meeting mr gibbons said mr smith noted that if hih was wound up on that date there would be clear shortage of assets to pay creditors he says the directors were told it was highly likely all creditors would not receive per cent returns the commission has also heard that the accountancy firm told the directors that even with hih restructuring plans there was potential for insolvency»

Similar Document (247, 0.7992265820503235): «the royal commission into hih has been adjourned until monday after interviewing of the first witness ended abruptly lawyers acting on behalf of seve