<a href="https://colab.research.google.com/github/shashank-m/-ML-QSTP-19-/blob/master/word_doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and setting up files

In [0]:
import pandas as pd
import numpy as np
import codecs
from tqdm import tqdm
import spacy
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
import re

In [179]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [180]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
torch.manual_seed(1)

<torch._C.Generator at 0x7f37274e3ef0>

In [181]:

is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


In [182]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
import json
import os

filenames=os.listdir("/content/drive/My Drive/covid_nlp/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv")
biorxiv="/content/drive/My Drive/covid_nlp/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv/"

## ***Preprocessing of abstract***

In [0]:
all_files=[json.load(open(biorxiv+filename, 'rb')) for filename in filenames]

In [0]:
f=all_files[32]
abstract=''
for dic in f['abstract']:
  abstract+=dic['text']
  abstract+='\n'
abstract=abstract.lower() # converts all alphabets to lower case.

In [0]:
def preprocess(abstract):
  abstract=re.sub(r'[,-?<>:!_+=#$()%^{}|~@;\'."*&[\]]',' ',abstract) #remove punctuation.
  abstract=re.sub(r'\d+',' ',abstract) # remove digits.
  abstract=re.sub(r'\s+',' ',abstract).strip() # replace multiple white spaces with single whiite space.
  return abstract

In [173]:
abstract=preprocess(abstract)
len(abstract)

5602

### **Create train and test data for CBOW model**

In [0]:
train_data=[]

In [0]:
all_tokens=re.findall(r'\S+',abstract)

In [0]:
penta_gram=ngrams(all_tokens,5)
for window in penta_gram:
  window=list(window)
  test_word=window[2]
  train_words=window[:2]+window[3:]
  train_data.append((train_words,test_word))

In [177]:

vocab=set(all_tokens)
vocab_size=len(set(all_tokens))
word_to_idx={}
for i,word in enumerate(vocab):
  word_to_idx[word]=i
vocab_size  

380

In [183]:
context_data=[]
target_data=[]
for context,target in train_data:
  inputs=torch.LongTensor([word_to_idx[word] for word in context]).view(1,-1) 
  target_vector=torch.LongTensor([word_to_idx[target]])
  context_data.append(inputs)
  target_data.append(target_vector)
len(target_data)

834

In [0]:
X_train=torch.cat(context_data,0) # put all training_data into a single tensor so that it can be batched by Dataloader.

In [0]:
X_train_label=torch.cat(target_data,0)

In [0]:
batch_size=32
training=TensorDataset(X_train,X_train_label)
train_loader=DataLoader(training,batch_size=batch_size,drop_last=True)

### CBOW model

In [192]:
class cbow(nn.Module):
  def __init__(self,vocab_size,embed_dim,context,hidden_dim,batch_size):
    super(cbow,self).__init__()
    self.bs=batch_size
    self.c=context
    self.embeddings=nn.Embedding(vocab_size,embed_dim)
    self.linear1=nn.Linear(embed_dim,hidden_dim)
    self.linear2=nn.Linear(hidden_dim,vocab_size)
    
  def forward(self,inputs):
    # if inputs.shape[0]==self.bs:
      embeds=self.embeddings(inputs)
      a_1=F.relu(self.linear1(embeds).sum(axis=1)/self.c)
      out=self.linear2(a_1)
      return out

embed_dim=300

model=cbow(vocab_size,embed_dim,4,50,batch_size)
criterion=nn.CrossEntropyLoss()
lr=1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model.state_dict

<bound method Module.state_dict of cbow(
  (embeddings): Embedding(380, 300)
  (linear1): Linear(in_features=300, out_features=50, bias=True)
  (linear2): Linear(in_features=50, out_features=380, bias=True)
)>

**Note that here training data is generated from only one abstract.We run the cbow model over 50 epochs and it looks like the model is learning something as the loss decreases every epoch. If we can overfit on this small data that means out training process works. We just need to collect more training data**

In [193]:
epochs=50
for j in range(epochs):
  for i,(x,y) in enumerate(train_loader):
    model.zero_grad()
    out=model(x)

    loss=criterion(out,y)
    loss.backward()

    optimizer.step()
  print('After epoch {},loss={}'.format(j+1,loss))    

After epoch 1,loss=5.885824203491211
After epoch 2,loss=5.616886138916016
After epoch 3,loss=5.2316460609436035
After epoch 4,loss=4.883759498596191
After epoch 5,loss=4.592529296875
After epoch 6,loss=4.324159622192383
After epoch 7,loss=4.058540344238281
After epoch 8,loss=3.7785441875457764
After epoch 9,loss=3.4769744873046875
After epoch 10,loss=3.1627917289733887
After epoch 11,loss=2.826920986175537
After epoch 12,loss=2.493203639984131
After epoch 13,loss=2.1635921001434326
After epoch 14,loss=1.8549734354019165
After epoch 15,loss=1.5731874704360962
After epoch 16,loss=1.3291620016098022
After epoch 17,loss=1.1199700832366943
After epoch 18,loss=0.9454092383384705
After epoch 19,loss=0.7982237339019775
After epoch 20,loss=0.6755102276802063
After epoch 21,loss=0.5713686347007751
After epoch 22,loss=0.48508062958717346
After epoch 23,loss=0.41218945384025574
After epoch 24,loss=0.3532406985759735
After epoch 25,loss=0.30278918147087097
After epoch 26,loss=0.2626253068447113
Aft

**As we can see above, our tarin loss almost comes down to zero. This shows the learning process is not an issue. Time to collect more trainig data.**