<a href="https://colab.research.google.com/github/shashank-m/covid_mining_papers/blob/master/Doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and setting up files

In [0]:
import pandas as pd
import numpy as np
import codecs
from tqdm import tqdm
import spacy
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
import re
import json
import os

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
torch.manual_seed(1)

<torch._C.Generator at 0x7fe62fd77bd0>

In [4]:

is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [5]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
filenames=os.listdir("/content/drive/My Drive/covid_nlp/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv")
biorxiv="/content/drive/My Drive/covid_nlp/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv/"

## ***Preprocessing of abstract***

In [0]:
all_files=[json.load(open(biorxiv+filename, 'rb')) for filename in filenames]

In [0]:
def preprocess(abstract):
  abstract=re.sub(r'[,-?<>:!_+=#$()%^{}|~@;\'"*&[\]]',' ',abstract) #remove punctuation.
  abstract=re.sub(r'\d+',' ',abstract) # remove digits.
  abstract=re.sub(r'\s+',' ',abstract).strip() # replace multiple white spaces with single whiite space.
  return abstract

In [0]:
cleaned_abstracts=[]
for i,f in enumerate(all_files):
  if f['abstract']: # checks if abstract exists or not.
    abstract=''

    for dic in f['abstract']:
      abstract+=dic['text']
      abstract+='\n'
    abstract=abstract.lower() # converts all alphabets to lower case.
    abstract=preprocess(abstract)

    if abstract!='' and len(re.findall(r'\S+',abstract))>=5: # rejects abstracts which are empty or less than 5 words. 
      cleaned_abstracts.append((abstract,i)) # appending a tuple of abstract and paper number. 
no_docs=len(cleaned_abstracts)     

### **Create train and test data for Doc2Vec model**

In [0]:
v=''
for doc,i in cleaned_abstracts:
  v+=doc
v=v.split()
vocab=set(v)
vocab_size=len(vocab)
word_to_idx={}
for i,word in enumerate(vocab):
  word_to_idx[word]=i
word_to_idx['<UNK>']=vocab_size  
vocab_size+=1

In [0]:

total_input=[]
total_lable=[]
i=0
for doc,file_num in (cleaned_abstracts):
  if doc:
    train_data=[]
    all_tokens=re.findall(r'\S+',doc)
    penta_gram=ngrams(all_tokens,5)

    for window in penta_gram:
      window=list(window)
      test_word=window[2]
      train_words=window[:2]+window[3:]
      train_words.append(i)
      train_data.append((train_words,test_word)) 

    context_data=[]
    target_data=[]

    for context,target in train_data:

      inputs=[]
      for word in context[:-1]:
        try:
          inputs.append(word_to_idx[word])
        except KeyError:
          inputs.append(word_to_idx['<UNK>'])
      inputs.append(i)   # i is the document number.   
      inputs.append(file_num) # file_num is the file in biorxiv folder. They may not be same as some files were removed due to lack of abstract.
      inputs=torch.LongTensor(inputs).view(1,-1)
      target_vector=torch.LongTensor([word_to_idx[target]])
      context_data.append(inputs)
      target_data.append(target_vector)

    try:
      X_train=torch.cat(context_data,0)
      X_train_label=torch.cat(target_data,0)
      total_input.append(X_train)
      total_lable.append(X_train_label)
      i+=1
    except RuntimeError:
      print(i,file_num)
      pass


**Last column of X_train contains file_num and last but one contains document no.**

In [0]:
X_train=torch.cat(total_input,0) # put all training_data into a single tensor so that it can be batched by Dataloader.

In [0]:
X_train_label=torch.cat(total_lable,0)

In [0]:
batch_size=64
training=TensorDataset(X_train,X_train_label)
train_loader=DataLoader(training,batch_size=batch_size,drop_last=False)

In [15]:
X_train.shape

torch.Size([212592, 6])

### Doc2Vec

In [0]:
class cbow(nn.Module):
  def __init__(self,vocab_size,embed_dim,context,hidden_dim,batch_size,no_docs):
    super(cbow,self).__init__()
    self.ndocs=no_docs
    self.bs=batch_size
    self.c=context

    self.w_embeddings=nn.Embedding(vocab_size,embed_dim)
    self.d_embeddings=nn.Embedding(no_docs,embed_dim)

    self.linear1=nn.Linear(embed_dim,hidden_dim)
    self.linear2=nn.Linear(hidden_dim,vocab_size)
    
  def forward(self,inputs):
      
      w_embeds=self.w_embeddings(inputs[:,:4])
      d_embeds=self.d_embeddings(inputs[:,4]).unsqueeze(1)
      combined=torch.cat((w_embeds,d_embeds),1)
      a_1=F.relu(self.linear1(combined).sum(axis=1)/(self.c+1))
      out=self.linear2(a_1)
      return out

embed_dim=300
model=cbow(vocab_size,embed_dim,4,50,batch_size,no_docs)
criterion=nn.CrossEntropyLoss()
lr=1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model.state_dict
if(is_cuda):
  model.cuda()

**Note that here training data is generated from all abstracts\.We run the Doc2Vec model over 3 epochs and it looks like the model is learning something as the loss decreases every epoch. If we can overfit on this small data that means out training is not an issue.**

In [23]:
epochs=3
for j in range(epochs):
  train_loss=[]
  for i,(x,y) in enumerate(train_loader):
    count+=1
    model.zero_grad()
    if(is_cuda):
      x, y = x.cuda(), y.cuda()
    out=model(x)

    loss=criterion(out,y)
    loss.backward()

    train_loss.append(loss.item())

    optimizer.step()
    if i%200==0:
      print('epoch {},batch={},running train loss={}'.format(j+1,i,np.average(train_loss)))    

epoch 1,batch=0,running train loss=9.55390453338623
epoch 1,batch=200,running train loss=8.496730541115376
epoch 1,batch=400,running train loss=7.944065985834212
epoch 1,batch=600,running train loss=7.73745178343254
epoch 1,batch=800,running train loss=7.603608276662458
epoch 1,batch=1000,running train loss=7.521674966955042
epoch 1,batch=1200,running train loss=7.457838540470273
epoch 1,batch=1400,running train loss=7.395984579204748
epoch 1,batch=1600,running train loss=7.338700695085496
epoch 1,batch=1800,running train loss=7.2907657408833435
epoch 1,batch=2000,running train loss=7.262255778972772
epoch 1,batch=2200,running train loss=7.240111396833747
epoch 1,batch=2400,running train loss=7.222505417628767
epoch 1,batch=2600,running train loss=7.193835469586535
epoch 1,batch=2800,running train loss=7.170622033844757
epoch 1,batch=3000,running train loss=7.153190809978878
epoch 1,batch=3200,running train loss=7.137829578656474
epoch 2,batch=0,running train loss=6.237630844116211
epo

**As we can see above, our tarin loss almost comes down from 9.5 to around 5.8 in 3 epochs. Our model is learning. Val set can be added to check generalising ability and hyperparam tuning**
