<a href="https://colab.research.google.com/github/shashank-m/covid_mining_papers/blob/master/Doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and setting up files

In [0]:
import pandas as pd
import numpy as np
import codecs
from tqdm import tqdm
import spacy
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
import re
import json
import os

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
torch.manual_seed(1)

<torch._C.Generator at 0x7fab888ee490>

In [4]:

is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [5]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
filenames=os.listdir("/content/drive/My Drive/covid_nlp/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv")
biorxiv="/content/drive/My Drive/covid_nlp/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv/"

## ***Preprocessing of abstract***

In [0]:
# all_files=[json.load(open(biorxiv+filename, 'rb')) for filename in filenames]
all_files=[]
for filename in filenames:
  all_files.append(json.load(open(biorxiv+filename, 'rb')))

In [0]:
def preprocess(abstract):
  abstract=abstract.lower()
  abstract=re.sub(r'[,-?<>:!_+=#$()%^{}|~@;\'"*&[\]]',' ',abstract) #remove punctuation.
  abstract=re.sub(r'\d+',' ',abstract) # remove digits.
  abstract=re.sub(r'\s+',' ',abstract).strip() # replace multiple white spaces with single whiite space.
  return abstract

In [0]:
cleaned_abstracts=[]
for i,f in enumerate(all_files):
  if f['abstract']: # checks if abstract exists or not.
    abstract=''

    for dic in f['abstract']:
      abstract+=dic['text']
      abstract+='\n'
    abstract=preprocess(abstract)

    if abstract!='' and len(re.findall(r'\S+',abstract))>=5: # rejects abstracts which are empty or less than 5 words. 
      cleaned_abstracts.append((abstract,i)) # appending a tuple of abstract and paper number. 
no_docs=len(cleaned_abstracts)     

### **Create train and test data for Doc2Vec model**

In [0]:
v=''
for doc,i in cleaned_abstracts:
  v+=doc
v=v.split()
vocab=set(v)
vocab_size=len(vocab)
word_to_idx={}
for i,word in enumerate(vocab):
  word_to_idx[word]=i
word_to_idx['<UNK>']=vocab_size  
vocab_size+=1

In [0]:

def get_train_tensor(cleaned_abstracts,c):
  """
  Cleaned abstracts is a list of tuples where each tuple contains the paper number(from alll_files list)
  and the cleaned abstract.
  """
  total_input=[]
  total_lable=[]
  i=0
  test_word_loc=int((c-1)/2)
  for doc,file_num in (cleaned_abstracts):
    if doc:
      train_data=[]
      all_tokens=re.findall(r'\S+',doc)
      penta_gram=ngrams(all_tokens,c) # c is window size for ngram. c=n.
      
      for window in penta_gram:
        window=list(window)
        test_word=window[test_word_loc]
        train_words=window[:test_word_loc]+window[(test_word_loc+1):]
        train_words.append(i)
        train_data.append((train_words,test_word)) 

      context_data=[]
      target_data=[]

      for context,target in train_data:

        inputs=[]
        for word in context[:-1]:
          try:
            inputs.append(word_to_idx[word])
          except KeyError:
            inputs.append(word_to_idx['<UNK>'])
        inputs.append(i)   # i is the document number.   
        inputs.append(file_num) # file_num is the file in biorxiv folder. They may not be same as some files were removed due to lack of abstract.
        inputs=torch.LongTensor(inputs).view(1,-1)
        target_vector=torch.LongTensor([word_to_idx[target]])
        context_data.append(inputs)
        target_data.append(target_vector)

      try:
        X_train=torch.cat(context_data,0)
        X_train_label=torch.cat(target_data,0)
        total_input.append(X_train)
        total_lable.append(X_train_label)
        i+=1
      except RuntimeError:
        print(i,file_num)
        pass
  return total_input,total_lable


In [0]:
window=5
total_input,total_lable=get_train_tensor(cleaned_abstracts,window)

**Last column of X_train contains file_num and last but one contains document no.**

In [0]:
X_train=torch.cat(total_input,0) # put all training_data into a single tensor so that it can be batched by Dataloader.

In [0]:
X_train_label=torch.cat(total_lable,0)

In [0]:
batch_size=64
training=TensorDataset(X_train,X_train_label)
train_loader=DataLoader(training,batch_size=batch_size,drop_last=False)

In [17]:
X_train.shape

torch.Size([212592, 6])

### Doc2Vec

In [0]:
class doc2vec(nn.Module):
  def __init__(self,vocab_size,embed_dim,context,hidden_dim,batch_size,no_docs):
    super(doc2vec,self).__init__()
    self.ndocs=no_docs
    self.bs=batch_size
    self.c=context

    self.w_embeddings=nn.Embedding(vocab_size,embed_dim)
    self.d_embeddings=nn.Embedding(no_docs,embed_dim)

    self.linear1=nn.Linear(embed_dim,hidden_dim)
    self.linear2=nn.Linear(hidden_dim,vocab_size)
    
  def forward(self,inputs):
      
      w_embeds=self.w_embeddings(inputs[:,:4])
      d_embeds=self.d_embeddings(inputs[:,4]).unsqueeze(1)
      combined=torch.cat((w_embeds,d_embeds),1)
      a_1=F.relu(self.linear1(combined).sum(axis=1)/(self.c+1))
      out=self.linear2(a_1)
      return out

embed_dim=300
context=window-1
hidden_dim=50
model=doc2vec(vocab_size,embed_dim,context,hidden_dim,batch_size,no_docs)
criterion=nn.CrossEntropyLoss()
lr=1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model.state_dict
if is_cuda:
  model.cuda()

In [51]:
model.state_dict()['d_embeddings.weight'].shape


torch.Size([693, 300])

**Note that here training data is generated from all abstracts\.We run the Doc2Vec model over 3 epochs and it looks like the model is learning something as the loss decreases every epoch. If we can overfit on this small data that means out training is not an issue.**

In [0]:

def train_infer_loop(epochs,loader,model):
  count=0
  for j in range(epochs):
    # train_loss=[]
    for i,(x,y) in enumerate(loader):
      count+=1
      model.zero_grad()
      if(is_cuda):
        x, y = x.cuda(), y.cuda()
      out=model(x)

      loss=criterion(out,y)
      loss.backward()

      # train_loss.append(loss.item())

      optimizer.step()
      # if i%400==0:
      #   print('epoch {},batch={},running train loss={}'.format(j+1,i,np.average(train_loss))) 
    print("epoch {} , loss= {}".format(j+1,loss.item()))   

In [53]:
train_infer_loop(10,train_loader,model)

epoch 1 , loss= 6.309681415557861
epoch 2 , loss= 5.852118015289307
epoch 3 , loss= 5.512955188751221
epoch 4 , loss= 5.181860446929932
epoch 5 , loss= 4.9008259773254395
epoch 6 , loss= 4.63656759262085
epoch 7 , loss= 4.324553966522217
epoch 8 , loss= 4.156632900238037
epoch 9 , loss= 3.985248565673828
epoch 10 , loss= 3.8501224517822266


**Inference stage.**

In [54]:
model.linear1.weight

Parameter containing:
tensor([[ 0.2298,  0.1238,  0.3842,  ...,  0.1489,  0.1802,  0.0185],
        [-0.1405,  0.0601,  0.0624,  ...,  0.1533, -0.1157, -0.0396],
        [ 0.3770, -0.4081, -0.3381,  ...,  0.4333,  0.0632, -0.2612],
        ...,
        [-0.2157,  0.0762,  0.1830,  ..., -0.1227, -0.2109, -0.3166],
        [-0.2073,  0.2350,  0.1430,  ..., -0.0952, -0.2026, -0.0797],
        [-0.0725, -0.0123,  0.0772,  ...,  0.0566,  0.0515,  0.4032]],
       device='cuda:0', requires_grad=True)

In [55]:
for key in model.state_dict():
  print(key)

w_embeddings.weight
d_embeddings.weight
linear1.weight
linear1.bias
linear2.weight
linear2.bias


In [56]:
model.state_dict()['d_embeddings.weight'][0]

tensor([ 7.6642e-01, -5.8539e-01, -1.5511e+00,  3.9555e-01, -1.4272e+00,
        -1.9348e+00,  2.9453e-01, -6.1008e-01, -3.7993e-01, -2.5643e+00,
        -2.7140e+00,  3.5350e-01, -8.2012e-01,  1.8017e+00,  3.0188e-01,
        -6.7503e-01, -9.3303e-01, -5.0716e-01, -3.6577e+00,  1.5420e-02,
        -6.3998e-01,  2.0770e+00, -2.9984e-01, -8.8807e-01,  3.5561e-01,
        -1.4543e+00, -7.8796e-01,  1.0569e-01, -1.0248e+00, -5.9145e-01,
        -6.3116e-01, -4.5057e-02,  3.8976e-01,  2.0394e-01, -1.1785e+00,
         1.0208e+00,  1.2712e+00,  6.4129e-01, -1.4899e+00,  1.1558e+00,
         6.4827e-01, -8.0238e-02,  7.5821e-02, -6.7544e-01,  2.1301e+00,
        -7.1728e-01, -9.9774e-01,  7.2381e-01, -2.3320e+00,  6.0348e-01,
        -1.2715e+00, -9.0010e-01, -3.4766e-01,  6.5404e-02,  6.1175e-01,
        -8.3246e-01,  1.5995e-02,  8.3488e-01, -1.3093e+00,  4.7996e-01,
         5.2170e-01,  2.2406e-01, -1.8375e+00,  1.6658e+00, -6.7962e-01,
         2.5200e-01,  3.5381e-01,  8.0617e-01, -2.2

In [0]:
query='Efforts targeted at a universal coronavirus vaccine.'
query=preprocess(query)

In [0]:
infer_data=[(query,0)]
infer_window=3
data,label=get_train_tensor(infer_data,infer_window)

In [0]:
x_input=torch.cat(data,0)
x_label=torch.cat(label,0)

batch_size=1
inference=TensorDataset(x_input,x_label)
infer_loader=DataLoader(inference,batch_size=batch_size,drop_last=False)

In [30]:
for x,y in infer_loader:
  print(x[0][2]+693)
  break

tensor(693)


In [0]:
class infer(nn.Module):
  def __init__(self,vocab_size,embed_dim,context,hidden_dim,batch_size,no_docs,dic):
    super(infer,self).__init__()
    self.ndocs=no_docs
    self.bs=batch_size
    self.c=context

    self.w_embeddings=nn.Embedding(vocab_size,embed_dim)

    self.w_embeddings.weight=nn.Parameter(dic['w_embeddings.weight'])
    self.w_embeddings.weight.requires_grad=False


    self.linear1=nn.Linear(embed_dim,hidden_dim)
    self.linear2=nn.Linear(hidden_dim,vocab_size)

    self.linear1.weight=nn.Parameter(dic['linear1.weight'])
    self.linear1.bias=nn.Parameter(dic['linear1.bias'])
    self.linear2.weight=nn.Parameter(dic['linear2.weight'])
    self.linear2.bias=nn.Parameter(dic['linear2.bias'])

    l=[self.linear1,self.linear2]
    for layer in l:
      layer.weight.requires_grad=False
      layer.bias.requires_grad=False
    # old=dic['d_embeddings.weight']
    # add=torch.randn(1,300).cuda()

    self.d_embeddings=nn.Embedding(no_docs,embed_dim)
    # self.d_embeddings.weight=nn.Parameter(torch.cat((old,add),0))
    # self.d_embeddings.weight=nn.Parameter(dic['d_embeddings.weight'])
  def forward(self,inputs):
    w_embeds=self.w_embeddings(inputs[0,:self.c])
    d_embeds=self.d_embeddings(inputs[0,self.c]).unsqueeze(0)
    combined=torch.cat((w_embeds,d_embeds),0)
    a_1=F.relu(self.linear1(combined).sum(axis=0)/(self.c+1))
    out=self.linear2(a_1).unsqueeze(0)

    return out
    



In [0]:

hidden_dim=50
infer_context=infer_window-1
batch_size=1
no_docs=1
embed_dim=300
model_infer=infer(vocab_size,embed_dim,infer_context,hidden_dim,batch_size,no_docs,model.state_dict())
criterion=nn.CrossEntropyLoss()
lr=1e-3
optimizer = torch.optim.Adam(model_infer.parameters(), lr=lr)

In [106]:
model_infer

infer(
  (w_embeddings): Embedding(13807, 300)
  (linear1): Linear(in_features=300, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=13807, bias=True)
  (d_embeddings): Embedding(1, 300)
)

In [107]:
if(is_cuda):
  model_infer.cuda()
count=0
for j in range(300):
  # train_loss=[]
  for i,(x,y) in enumerate(infer_loader):
    count+=1
    model_infer.zero_grad()
    if(is_cuda):
      x, y = x.cuda(), y.cuda()
    out=model_infer(x)

    loss=criterion(out,y)
    loss.backward()

    # train_loss.append(loss.item())
    
    optimizer.step()
    # if i%400==0:
    #   print('epoch {},batch={},running train loss={}'.format(j+1,i,np.average(train_loss))) 
  if j==0:
    print("epoch {} , loss= {}".format(j+1,loss.item())) 

  if (j+1)%100==0:
    print("epoch {} , loss= {}".format(j+1,loss.item())) 

epoch 1 , loss= 14.112100601196289
epoch 100 , loss= 8.480064392089844
epoch 200 , loss= 6.30994987487793
epoch 300 , loss= 5.495576858520508


In [108]:
model_infer.linear1.weight

Parameter containing:
tensor([[ 0.2298,  0.1238,  0.3842,  ...,  0.1489,  0.1802,  0.0185],
        [-0.1405,  0.0601,  0.0624,  ...,  0.1533, -0.1157, -0.0396],
        [ 0.3770, -0.4081, -0.3381,  ...,  0.4333,  0.0632, -0.2612],
        ...,
        [-0.2157,  0.0762,  0.1830,  ..., -0.1227, -0.2109, -0.3166],
        [-0.2073,  0.2350,  0.1430,  ..., -0.0952, -0.2026, -0.0797],
        [-0.0725, -0.0123,  0.0772,  ...,  0.0566,  0.0515,  0.4032]],
       device='cuda:0')

In [0]:
model_infer.state_dict()['d_embeddings.weight']

In [0]:
model.state_dict()['d_embeddings.weight'][0]