In [7]:
# Repearing the Training and Test data
import os
import gensim
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [8]:
# Define a Function to Read and Preprocess Text
import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

train_corpus[:2]
test_corpus[:2]

[['the',
  'national',
  'executive',
  'of',
  'the',
  'strife',
  'torn',
  'democrats',
  'last',
  'night',
  'appointed',
  'little',
  'known',
  'west',
  'australian',
  'senator',
  'brian',
  'greig',
  'as',
  'interim',
  'leader',
  'shock',
  'move',
  'likely',
  'to',
  'provoke',
  'further',
  'conflict',
  'between',
  'the',
  'party',
  'senators',
  'and',
  'its',
  'organisation',
  'in',
  'move',
  'to',
  'reassert',
  'control',
  'over',
  'the',
  'party',
  'seven',
  'senators',
  'the',
  'national',
  'executive',
  'last',
  'night',
  'rejected',
  'aden',
  'ridgeway',
  'bid',
  'to',
  'become',
  'interim',
  'leader',
  'in',
  'favour',
  'of',
  'senator',
  'greig',
  'supporter',
  'of',
  'deposed',
  'leader',
  'natasha',
  'stott',
  'despoja',
  'and',
  'an',
  'outspoken',
  'gay',
  'rights',
  'activist'],
 ['cash',
  'strapped',
  'financial',
  'services',
  'group',
  'amp',
  'has',
  'shelved',
  'million',
  'plan',
  'to',
 

In [18]:
# Training the Model
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_corpus)
print(f"Word 'penalty' appeared {model.wv.get_vecattr('penalty', 'count')} times in the training corpus.")
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)


Word 'penalty' appeared 4 times in the training corpus.


In [10]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
vector

array([-0.1675882 , -0.27366742, -0.10139403,  0.2121064 , -0.03583408,
       -0.09157623,  0.04313027,  0.08175007, -0.26609933, -0.11163501,
        0.12789114, -0.08119049, -0.02012281, -0.08292892, -0.15383506,
       -0.18261328,  0.10000724,  0.20067796,  0.18077599, -0.02304729,
       -0.09908886, -0.05827311,  0.20005772,  0.08254921, -0.0438288 ,
       -0.03906713, -0.21980903, -0.04157836, -0.10934214,  0.01641025,
        0.36733457, -0.07394706,  0.22100867,  0.15016866,  0.18227561,
        0.1807672 , -0.03406857, -0.20708029, -0.15636607,  0.06444842,
       -0.04698168, -0.03685278,  0.10404624, -0.13286287,  0.02096705,
        0.17298155, -0.14990675, -0.02450923,  0.09067685, -0.05523528],
      dtype=float32)

In [11]:
# Assessing the Model
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [12]:
import collections

counter = collections.Counter(ranks)
counter

Counter({0: 293, 1: 7})

In [13]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [14]:
# Pick a random document from the corpus and infer a vector from the model
import random
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (140): «osama bin laden admitted planning the september terrorist attacks on the united states in videotape released by the pentagon today in the videotape lasting roughly one hour bin laden explains planning aspects of the operation and his own calculations in advance concerning the scale of the damage to the world trade center in new york and the number of casualties he said he expected the fire and gas from the attacks on the world trade center to topple the floors above the points where hijacked planes struck not the entire structure we calculated in advance the number of casualties from the enemy who would be killed based on the position of the tower he said according to transcript translated into english from the arabic due to my experience in this field was thinking that the fire from the gas in the plane would melt the iron structure of the building and collapse the area where the plane hit and all the floors above it only he said that is all that we had hoped fo

In [15]:
# Testing the Model
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (3): «radical armed islamist group with ties to tehran and baghdad has helped al qaida establish an international terrorist training camp in northern iraq kurdish officials say intelligence officers in the autonomous kurdish region of iraq told the guardian that the ansar al islam supporters of islam group is harbouring up to al qaida members in string of villages it controls along the iraq iran border most of them fled afghanistan after the us led offensive but officials from the patriotic union of kurdistan puk which controls part of north east iraq claim an abnormal number of recruits are making their way to the area from jordan syria and egypt»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (141, 0.7734178304672241): «united states air strikes on al qaeda fighters have intensified following the collapse of surrender talks with the northern alliance the battle for tora bora appears to be heading towards bloody climax northern alliance c