<a href="https://colab.research.google.com/github/ajurberg/article-parser/blob/main/07_doc2vec_gensim_gdf11_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
############################# 
#@title Import of libraries 
#############################
# This cell only imports modules and configures logging; nothing is installed here.
import os
from time import time
import re, string, unicodedata
from string import punctuation
from tqdm import tqdm
import pandas as pd
import numpy as np
import logging
# Show INFO-level log records as "<timestamp> : <level> : <message>" in the cell outputs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
#############################
#@title Mounting Google Drive
#############################
from google.colab import drive

# Use the public mount() entry point; _mount() is a private helper of
# google.colab.drive and is not part of the supported API.
drive.mount('/content/drive')

# Every later cell opens its files (JSON dictionaries, "d2v.model") by relative
# path, so switch the working directory to the project folder on Drive.
workdir_path = '/content/drive/My Drive/papers'
os.chdir(workdir_path)

Mounted at /content/drive


In [None]:
#############################
#@title Read json file to dictionary
#############################
import json

# Read the dictionary from the working directory into gdf11_dict. The encoding
# is given explicitly so the result does not depend on the platform's locale
# (the printed documents contain non-ASCII characters).
with open("gdf11-dictionary-word-tokens.json", encoding="utf-8") as json_file:
  gdf11_dict = json.load(json_file)

#@markdown Files to evaluate
#@markdown - `gdf11-dictionary.json` (no preprocessing)
#@markdown - `gdf11-dictionary-sent-tokens.json` (tokenized sentences)
#@markdown - `gdf11-dictionary-sent-tokens-stop.json` (tokenized sentences with removal of stopwords)
#@markdown - `gdf11-dictionary-word-tokens.json` (tokenized words)
#@markdown - `gdf11-dictionary-word-tokens-stop.json` (tokenized words with removal of stopwords)

In [None]:
############################# 
#@title Doc2Vec Gensim
#############################
#@markdown https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5
import nltk
# Download the 'punkt' tokenizer data (presumably what word_tokenize relies on — confirm).
nltk.download('punkt')
# word_tokenize is used later to tokenize query sentences before inference.
from nltk.tokenize import word_tokenize
# Doc2Vec is the model class; TaggedDocument pairs a token list with its tag(s).
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


2022-01-15 02:11:13,484 : INFO : 'pattern' package not found; tag filters are not available for English


In [None]:
############################# 
#@title Preparation of training data
#############################
#@markdown For non-tokenized dictionaries - that is `gdf11-dictionary.json`

# tagged_data = []
# for key, text in tqdm(gdf11_dict.items()):
#   t = TaggedDocument(words=word_tokenize(text), tags=[str(key)])
#   tagged_data.append(t)

In [None]:
#@markdown For tokenized dictionaries
# One TaggedDocument per dictionary entry: the value is used as the word list
# as-is and the key (as a string) becomes the document's only tag.
tagged_data = [
  TaggedDocument(words=tokens, tags=[str(paper_id)])
  for paper_id, tokens in tqdm(gdf11_dict.items())
]

100%|██████████| 334/334 [00:00<00:00, 236558.18it/s]


In [None]:
#############################
#@title (*If model is already trained, skip to next step*) Model training
#############################
import gensim

MAX_EPOCHS = 100 #@param {type:"integer"}
VEC_SIZE = 20 #@param {type:"integer"}
ALPHA = 0.025 #@param {type:"number"}
DM = 1 #@param [0, 1]
#@markdown * `0` = 'distributed bag of words' (PV-DBOW): does not preserve the word order in the document
#@markdown * `1` = 'distributed memory' (PV-DM): preserves the word order in the document

# The number of passes is given to the model itself. gensim then decays the
# learning rate from `alpha` down to `min_alpha` across those passes, so no
# manual alpha bookkeeping is needed.
model = gensim.models.doc2vec.Doc2Vec(vector_size=VEC_SIZE, # Doc2Vec
                                      alpha=ALPHA,
                                      min_alpha=0.00025,
                                      min_count=1,
                                      dm=DM,
                                      epochs=MAX_EPOCHS)

# Build a vocabulary
model.build_vocab(tagged_data)

# Train exactly once. Calling train() repeatedly inside a Python loop (with
# epochs=model.epochs each time) would run MAX_EPOCHS * model.epochs passes and
# override the alpha -> min_alpha schedule configured above.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# Persist the trained model in the working directory for the loading cell.
model.save("d2v.model")
print("Model Saved")

In [None]:
############################# 
#@title (Alternative) Load the model
#############################
import gensim
from gensim.models.doc2vec import Doc2Vec

# Restore a previously saved model from "d2v.model" in the current working
# directory and bind it to `model`, the name the cells below expect.
model = Doc2Vec.load("d2v.model")

2022-01-15 02:44:32,890 : INFO : loading Doc2Vec object from d2v.model
2022-01-15 02:44:33,367 : INFO : loading vocabulary recursively from d2v.model.vocabulary.* with mmap=None
2022-01-15 02:44:33,368 : INFO : loading trainables recursively from d2v.model.trainables.* with mmap=None
2022-01-15 02:44:33,371 : INFO : loading wv recursively from d2v.model.wv.* with mmap=None
2022-01-15 02:44:33,374 : INFO : loading docvecs recursively from d2v.model.docvecs.* with mmap=None
2022-01-15 02:44:33,376 : INFO : loaded d2v.model


In [None]:
#@markdown Identify papers with higher similarity to a given `test_sentence`
test_sentence = "Loss of myostatin increases muscle mass." #@param {type: 'string'}
# Lower-case and word-tokenize the query before inference.
test_data = word_tokenize(test_sentence.lower()) # word tokenization before inference
# Infer a vector for the unseen sentence, then look up the closest training documents.
vector = model.infer_vector(test_data)
# No topn is passed, so the library default number of results is returned
# (the printed output lists ten (tag, similarity) pairs).
sims = model.docvecs.most_similar([vector])
print(sims)
#print(f"Vector: {vector}")
# Inference is an iterative approximation that uses internal randomization, so
# repeated inferences of the same text may return slightly different vectors.

2022-01-15 02:47:46,534 : INFO : precomputing L2-norms of doc weight vectors


[('j.pharmthera.2017.02.032', 0.46597936749458313), ('j.oftal.2014.05.012', 0.4462655782699585), ('science.344.6184.570', 0.44061872363090515), ('j.pep.2008.09.014', 0.4315495491027832), ('science.1251141', 0.41867461800575256), ('cpr.12631', 0.4175918400287628), ('phy2.12663', 0.4146520495414734), ('MCB.24.12.5106-5118.2004', 0.4131613075733185), ('s12033-019-00154-w', 0.41275060176849365), ('etm.2018.5861', 0.4123450517654419)]


In [None]:
############################# 
#@title Assessing the model
#############################
#@markdown To assess the model, we first infer new vectors for each document of the training corpus,
#@markdown compare the inferred vectors with the training corpus, and 
#@markdown then return the rank of the document based on self-similarity.

#@markdown The expectation is that we likely overfit our model 
#@markdown (i.e., all of the ranks will be less than 2) and so 
#@markdown we should be able to find similar documents very easily.
#@markdown Additionally, we keep track of the second ranks for a comparison of less similar documents.

ranks = []
second_ranks = []
for doc_id in range(len(tagged_data)):
  inferred_vector = model.infer_vector(tagged_data[doc_id].words)
  sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
  rank = [docid for docid, sim in sims].index(sim[0])
  ranks.append(rank)

  second_ranks.append(sims[1])

print(sims)

[('j.pharmthera.2017.02.032', 0.8469412922859192), ('S0960-9822(00)80013-5', 0.7190202474594116), ('gad.1021802', 0.6707805395126343), ('mrd.21252', 0.6480540037155151), ('pnas.1916034117', 0.642542839050293), ('science.344.6184.570', 0.64143306016922), ('j.bbrc.2012.09.098', 0.6308866739273071), ('j.surg.2018.03.008', 0.621944010257721), ('j.bbrc.2011.11.019', 0.6211661696434021), ('jcp.28904', 0.6205242872238159), ('s-0031-1273691', 0.6169726252555847), ('j.cbpb.2014.07.006', 0.6164159774780273), ('ncomms12794', 0.6136460304260254), ('jomp.2015.40.3.110', 0.6128988265991211), ('1756-0500-7-766', 0.6074994802474976), ('j.cytogfr.2016.06.003', 0.6071280837059021), ('fonc.2019.01039', 0.6061106324195862), ('s12276-020-00516-4', 0.6024479269981384), ('j.1742-4658.2010.07909.x', 0.601850152015686), ('ijdb.072276mm', 0.6015409231185913), ('10320', 0.6009638905525208), ('s11357-019-00054-6', 0.5989878177642822), ('s00441-002-0668-y', 0.5983281135559082), ('s41436-021-01216-8', 0.59619355201

In [None]:
# Tally how often each self-similarity rank occurs across the training corpus.
# Results vary between runs due to random seeding and the small corpus.
import collections

counter = collections.Counter()
counter.update(ranks)
print(counter)
# A model that behaves consistently is expected to rank most documents as most
# similar to themselves (rank 0). Check the printed Counter to see how many
# actually do, rather than assuming a fixed percentage.
# Comparing an inferred vector against the training vector is a sanity check on
# model consistency, not a real 'accuracy' value.

Counter({92: 5, 323: 5, 153: 5, 333: 4, 314: 4, 134: 4, 329: 4, 126: 4, 97: 4, 133: 4, 180: 4, 42: 4, 322: 3, 62: 3, 137: 3, 99: 3, 198: 3, 244: 3, 71: 3, 325: 3, 138: 3, 76: 3, 94: 3, 112: 3, 189: 3, 253: 3, 263: 3, 66: 3, 125: 3, 300: 3, 84: 3, 149: 3, 68: 3, 103: 3, 326: 2, 108: 2, 159: 2, 63: 2, 298: 2, 141: 2, 246: 2, 206: 2, 293: 2, 80: 2, 216: 2, 310: 2, 28: 2, 129: 2, 18: 2, 135: 2, 35: 2, 124: 2, 247: 2, 13: 2, 331: 2, 30: 2, 17: 2, 89: 2, 215: 2, 48: 2, 116: 2, 72: 2, 83: 2, 105: 2, 296: 2, 106: 2, 70: 2, 131: 2, 119: 2, 230: 2, 302: 2, 286: 2, 53: 2, 41: 2, 46: 2, 224: 2, 1: 2, 223: 2, 33: 2, 145: 2, 2: 2, 332: 2, 82: 2, 320: 2, 179: 2, 114: 1, 160: 1, 102: 1, 162: 1, 101: 1, 22: 1, 178: 1, 157: 1, 220: 1, 312: 1, 197: 1, 290: 1, 77: 1, 65: 1, 273: 1, 32: 1, 330: 1, 309: 1, 25: 1, 307: 1, 56: 1, 181: 1, 21: 1, 140: 1, 172: 1, 231: 1, 321: 1, 96: 1, 123: 1, 281: 1, 295: 1, 69: 1, 148: 1, 227: 1, 239: 1, 236: 1, 81: 1, 38: 1, 297: 1, 308: 1, 73: 1, 209: 1, 34: 1, 317: 1, 49: 1

In [None]:
#@markdown Get the most similar document
DOC_ID = 92 #@param {type: 'number'}
print(f"Document ({DOC_ID}): «{' '.join(tagged_data[DOC_ID].words)}»\n")

# Compute the ranking for DOC_ID itself. Without this, `sims` would be whatever
# an earlier cell left behind (the ranking of a different document).
inferred_vector = model.infer_vector(tagged_data[DOC_ID].words)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Map each tag back to its tokens so a short excerpt can be shown per result.
words_by_tag = {doc.tags[0]: doc.words for doc in tagged_data}
PREVIEW_TOKENS = 30  # full papers are far too long to print in one line

print(f"SIMILAR/DISSIMILAR DOCS PER MODEL {model}:")
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims)-1)]:
  paper_id, similarity = sims[index]
  preview = ' '.join(words_by_tag[paper_id][:PREVIEW_TOKENS])
  # The second element of each entry is the similarity score, not an index.
  print(f"{label}) paper_id={paper_id}; similarity={similarity}: «{preview}»")

Document (92): «plasma growth differentiation factors 8 and 11 levels in cats with congestive heart failure secondary to hypertrophic cardiomyopathy a unique a detailing of pr downloading is view the mater manuscript . correspond e-mail addr 1760-2734/ª 2 journal of veterinary cardiology 25 , 41e51 www.elsevier.com/locate/jvc plasma growth differentiation factors 8 and 11 levels in cats with congestive heart failure secondary to hypertrophic cardiomyopathy v.k . yang , dvm , phda , , j.e . rush , dvm , msa , s. bhasin , mbbsb , a.j . wagers , phdc , d , e , r.t. lee , mdc adepartment of clinical sciences , cummings school of veterinary medicine at tufts university , 200 westboro rd , north grafton , ma , 01536 , usa bdepartment of medicine , brigham and women ’ s hospital , 221 longwood ave , boston , ma , 02115 , usa cdepartment of stem cell and regenerative biology and harvard stem cell institute , harvard university , 7 divinity ave , cambridge , ma , 02138 , usa d section on islet 

In [None]:
from gensim.models.doc2vec import Doc2Vec

# Reload the saved model from disk; this rebinds `model` in the kernel.
model = Doc2Vec.load("d2v.model")

# Infer a vector for a sentence that is not in the training data.
test_sentence = "Loss of myostatin increases muscle mass." #@param {type: 'string'}
test_data = word_tokenize(test_sentence.lower())
v1 = model.infer_vector(test_data)
print(f"V1_infer:\n {v1}")

# Find the documents most similar to one of the trained documents.
# NOTE(review): the argument is the integer 2, while the documents were tagged
# with strings; presumably it is treated as a positional index into the trained
# document vectors — confirm. `v1` above is not used in this lookup.
similar_doc = model.docvecs.most_similar(2)
print(f"Most similar docs:\n {similar_doc}")

# Print the stored (trained) vector looked up with the integer key 1
# (presumably the second training document — confirm).
print(f"Vector of doc:\n {model.docvecs[1]}")

2022-01-15 03:51:53,232 : INFO : loading Doc2Vec object from d2v.model
2022-01-15 03:51:53,711 : INFO : loading vocabulary recursively from d2v.model.vocabulary.* with mmap=None
2022-01-15 03:51:53,712 : INFO : loading trainables recursively from d2v.model.trainables.* with mmap=None
2022-01-15 03:51:53,716 : INFO : loading wv recursively from d2v.model.wv.* with mmap=None
2022-01-15 03:51:53,720 : INFO : loading docvecs recursively from d2v.model.docvecs.* with mmap=None
2022-01-15 03:51:53,722 : INFO : loaded d2v.model
2022-01-15 03:51:53,933 : INFO : precomputing L2-norms of doc weight vectors


V1_infer:
 [-0.02856265  0.01224875 -0.04325694 -0.07388379  0.04772583 -0.04615019
  0.01310947 -0.2315653   0.02839004  0.12721492 -0.03334247  0.2719835
  0.17415883  0.04105621 -0.03831164  0.003886   -0.08989064  0.00315566
  0.01621302 -0.05157117]
Most similar docs:
 [('dbio.1998.9191', 0.9510872960090637), ('j.mod.2010.08.004', 0.949094831943512), ('10320', 0.930795431137085), ('gad.1041203', 0.919755220413208), ('jcp.28904', 0.9068231582641602), ('j.ydbio.2010.08.022', 0.9037100076675415), ('sj.embor.7400752', 0.895709216594696), ('dbio.2000.9981', 0.87835294008255), ('j.ydbio.2013.08.025', 0.8719825744628906), ('dvg.20238', 0.8592759370803833)]
Vector of doc:
 [ 6.3090034   3.3144815  -8.01659    -1.853893    5.4467397  -0.20899546
 -7.966937   -8.896505   -2.6183863   3.6486418   8.662718    2.2587907
  1.3619238  -1.3263054   0.8414642   5.8401866  -2.3254719  -4.807281
 -8.100371    2.5491276 ]


In [None]:
# Draw a random position in the training corpus
import random
doc_id = random.randrange(len(tagged_data))

# Show the sampled training document ...
train_words = tagged_data[doc_id].words
print(f"Train Document ({doc_id}): «{' '.join(train_words)}»\n")

# ... followed by the tag and similarity score stored as its second-ranked match
sim_id = second_ranks[doc_id]
similar_tag, similarity = sim_id[0], sim_id[1]
print(f"Similar Document ({similar_tag}): {similarity}")

Train Document (296): «10.1128/mcb.25.14.5846-5858.2005 . 2005 , 25:5846. doi : mol . cell . biol . greenspan gaoxiang ge , delana r. hopkins , wen-bin ho and daniel s. differentiation of pc12 cells can modulate nerve growth factor-induced protein 1-activated latent complex that gdf11 forms a bone morphogenetic updated information and services can be found at : these include :»

Similar Document (ncomms12794): 0.6725701689720154
