In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

import json

In [None]:
# Read the paper records from the JSON dump.
with open('./data/papers.json', mode='r', encoding='utf-8') as f:
  data = json.load(f)

# Module-level containers: one tag -> original-text lookup per field,
# plus a list for tagged documents.
title_tag_obj = {}
abstract_tag_obj = {}
tagged_data = list()

def tag_docs(data, tag_obj, field):
  """Tokenize one text field of every record into TaggedDocument objects.

  Args:
    data: Sequence of dict-like records; each must hold a string under `field`.
    tag_obj: Dict filled in place, mapping each document tag (the record's
      position in `data`) to the original, un-tokenized text of `field`.
    field: Key of the text to tokenize, e.g. 'abstract' or 'title'.

  Returns:
    A new list with one TaggedDocument per record, tagged with its index.
  """
  # Build a fresh list on every call. Appending to a shared module-level list
  # makes successive calls return the very same object, which mixes the fields
  # together and repeats the integer tags across corpora.
  tagged = []
  for i, d in enumerate(data):
    text = d[field]
    # Lower-case before tokenizing so the vocabulary is case-insensitive.
    tagged.append(TaggedDocument(words=word_tokenize(text.lower()), tags=[i]))
    # Keep the raw text so a tag can be mapped back to something readable.
    tag_obj[i] = text

  return tagged

# Tag every abstract and every title; each call also fills the matching
# tag -> original-text lookup dict.
tagged_abstracts = tag_docs(data=data, tag_obj=abstract_tag_obj, field='abstract')
tagged_titles = tag_docs(data=data, tag_obj=title_tag_obj, field='title')

In [None]:
def create_and_train(tagged_data, model_name, max_epochs=100, vec_size=20, alpha=0.025):
  """Build, train and save a Doc2Vec model.

  Args:
    tagged_data: Re-iterable collection (e.g. a list) of TaggedDocument
      objects; it is scanned once for the vocabulary and again for training.
    model_name: Path the trained model is saved to.
    max_epochs: Number of passes over the corpus.
    vec_size: Dimensionality of the feature vectors.
    alpha: Initial learning rate.

  Returns:
    The trained Doc2Vec model (also persisted at `model_name`).
  """
  model = Doc2Vec(vector_size=vec_size,  # The constructor parameter is 'vector_size'; 'vec_size' is not recognised.
                  alpha=alpha,
                  min_alpha=0.00025,  # Learning rate will linearly drop to 'min_alpha' as training progresses.
                  min_count=1,  # Ignores all words with total frequency lower than this.
                  dm=1,  # The training algorithm.  If dm=1, 'distributed memory' (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW)
                  epochs=max_epochs,  # Number of passes train() makes over the corpus.
                  workers=10,  # Threads
                  )

  model.build_vocab(tagged_data)

  # Train exactly once and let gensim decay the learning rate from 'alpha' to
  # 'min_alpha' itself. Calling train() in a loop while decrementing alpha by
  # hand multiplies the number of passes (loop count x model epochs) and can
  # push the learning rate below zero for large loop counts.
  model.train(tagged_data,
              total_examples=model.corpus_count,  # Count of documents seen by build_vocab
              epochs=model.epochs,
              )

  model.save(model_name)

  return model

In [None]:
def find_most_similar_vectors(tagged_data, tag_object, similar_to_tag, model_name):
  """Train a model on `tagged_data` and list the documents closest to one tag.

  Args:
    tagged_data: Tagged documents the model is trained on.
    tag_object: Dict mapping each document tag to its original text.
    similar_to_tag: Tag of the reference document (number between 0 and the
      number of titles/abstracts).
    model_name: Path the trained model is saved to.

  Returns:
    List of (original_text, similarity) pairs for the most similar documents.
  """
  # Train on the corpus that was passed in, not on a fixed module-level one,
  # so the model always matches `tag_object`. The trained model comes back
  # directly (and is saved by create_and_train), so no reload from disk is needed.
  model = create_and_train(tagged_data, model_name)

  # find the most similar documents using tags
  similar_doc = model.docvecs.most_similar(similar_to_tag)

  # Swap each tag for the text it stands for, keeping the similarity score.
  return [(tag_object[tag], value) for tag, value in similar_doc]

In [None]:
# Train one model per field and query the documents most similar to tag 1.
# Being the cell's last expression, only the second call's result is displayed.
find_most_similar_vectors(
  tagged_data=tagged_abstracts,
  tag_object=abstract_tag_obj,
  similar_to_tag=1,
  model_name='abstracts.model',
)
find_most_similar_vectors(
  tagged_data=tagged_titles,
  tag_object=title_tag_obj,
  similar_to_tag=1,
  model_name='title.model',
)