In [39]:
import pandas as pd
import nltk.data
from bs4 import BeautifulSoup  
import re
from nltk.corpus import stopwords
import numpy as np
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

import random

In [40]:
train = pd.read_csv( "stories.tsv", header=0, delimiter="\t", quoting=3 )
train = train[train['summary'].notnull()]

In [41]:
def pre_process_review(review, remove_stopwords=False):
    text = BeautifulSoup(review, "lxml").get_text() # remove html
    no_punc = re.sub("[^a-zA-Z]", " ", text) # remove punctuation
    words = no_punc.lower().split() # lower case and split into words
    if remove_stopwords:
        stops = set(stopwords.words("english"))  
        words = [w for w in words if not w in stops]
    return words


def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append( pre_process_review( raw_sentence, remove_stopwords ))
    return sentences

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


sentences = []  # Initialize an empty list of sentences
map_uid = {}
i = 0
print("Parsing sentences from training set")
for idx, review in train.iterrows():

    sentence = review_to_sentences(review[1] + ' ' + review[3], tokenizer)
    key = "SENT_{}".format(review[0])
    td = TaggedDocument(words=sentence[0], tags=[key])
    sentences.append(td)
    map_uid[key] = i
    i += 1

Parsing sentences from training set


In [42]:
def sentences_perm(sentences):
        shuffled = list(sentences)
        random.shuffle(shuffled)
        return(shuffled)

In [43]:
model = Doc2Vec(sentences, alpha=0.025, min_alpha=0.025)  # use fixed learning rate
model.train_words = False
for epoch in range(10):
    model.train(sentences_perm(sentences), total_examples=model.corpus_count)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

In [44]:
model.save('stories_sim.doc2vec')
# load the model back
model_loaded = Doc2Vec.load('stories_sim.doc2vec')

In [49]:
doc_id = 3667 #np.random.randint(model_loaded.docvecs.count)
sentences[doc_id]

TaggedDocument(words=['this', 'couple', 'have', 'a', 'stranger', 'things', 'contract', 'to', 'ensure', 'no', 'spoilers', 'shall', 'ruin', 'season', 'the', 'time', 'has', 'almost', 'come'], tags=['SENT_185311'])

In [46]:
sims = model_loaded.docvecs.most_similar(doc_id, topn=model_loaded.docvecs.count) 

In [50]:
print(u'TARGET (%d) (%s): «%s»\n' % (doc_id,sentences[doc_id].tags, ' '.join(sentences[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
#     idx = map_uid[sims[index][0]]
#     print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(sentences[idx].words)))

for index in range(100):
    idx = map_uid[sims[index][0]]
    print(u'%s %s: «%s»\n' % (index, sims[index], ' '.join(sentences[idx].words)))

TARGET (3667) (['SENT_185311']): «this couple have a stranger things contract to ensure no spoilers shall ruin season the time has almost come»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w5,mc5,s0.001,t3):

0 ('SENT_187952', 0.5728790760040283): «lost teddy bear gets special treatment on flight back to its owner a young girl was reunited with her lost teddy bear thanks to a flight attendant who brought the toy from edinburgh to orkney on november»

1 ('SENT_188098', 0.5676577687263489): «family of lynx visit anchorage alaska home alaska resident cathy newton captured footage of a family of lynx that visited her home in anchorage on october»

2 ('SENT_185162', 0.5658762454986572): «cars drive through flooded streets in boone north carolina cleanup efforts continued in boone north carolina on october after flash flooding hit the area the previous day local media reported»

3 ('SENT_185659', 0.5549824237823486): «polish plane aborts crosswind landing in austria returns to fra