In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the zipped dataset is read from "My Drive" in the next cell.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
#Using zipfile to extract JSON files from the .zip file
import zipfile
#The context manager closes the archive even if extraction fails part-way through
with zipfile.ZipFile("/content/drive/My Drive/pdf_json.zip", 'r') as zip_ref:
  zip_ref.extractall("/content/files/")

In [None]:
#Using the small english language model for preprocessing
import re
import spacy
# NOTE(review): `nlp` is loaded here but is not referenced by any other cell visible in this
# notebook (preprocessing() only uses `re`) - confirm whether this load is still needed.
nlp=spacy.load("en_core_web_sm")



In [None]:
#Function to extract the text from the JSON files
import json
def extract_body_text(filename):
  """Return the lower-cased body text of one paper stored as a JSON file.

  The file is parsed as JSON and the "text" field of every entry under the
  top-level "body_text" key is concatenated in order, without a separator.

  Args:
    filename: Path of the JSON file to read (decoded as UTF-8).

  Returns:
    The concatenated text followed by a single newline character, all in
    lower case. If the document has no "body_text" key, only the newline
    is returned.

  Raises:
    UnicodeDecodeError: If the file content is not valid UTF-8.
  """
  # json.load() no longer accepts an `encoding` keyword (it raises TypeError on
  # Python 3.9+), so the encoding is given to open() instead. The `with` block
  # also guarantees the file handle is closed.
  with open(filename, encoding='utf-8') as file1:
    paper_content = json.load(file1)

  paragraphs = paper_content["body_text"] if "body_text" in paper_content else []

  # str.join avoids the quadratic cost of growing a string inside a loop
  body_text = "".join(bt["text"] for bt in paragraphs)
  return (body_text + '\n').lower()

In [None]:
#Function for performing text preprocessing
#Preprocessing includes case folding, removing bracketed reference markers, removing selected punctuation and collapsing whitespace (no lemmatization is performed)
def preprocessing(text):
  """Normalise raw text before n-gram counting.

  Steps, in order: lower-case the text, delete bracketed numeric reference
  markers such as "[12]", delete the characters $ - # ^ ( ), replace full
  stops and commas with a space, and collapse every run of whitespace into
  a single space.

  Args:
    text: The string to clean.

  Returns:
    The cleaned string. Leading or trailing whitespace is reduced to one
    space but is not stripped.
  """
  #Converting corpus to lower case
  text=text.lower()

  #Removing references
  #(raw strings keep the regex escapes from being read as invalid string escapes)
  text=re.sub(r"\[\d+\]","",text)

  #Removing punctuations
  #NOTE(review): the previous pattern ended with the alternative -#@' which could never
  #match because the single "-" alternative always matched first, so "@" and "'" were
  #never removed. That behaviour is kept here; add them to the class if they should go.
  text=re.sub(r"[$#^()\-]","",text)

  #Replacing full stops and commas with a space so neighbouring words stay separated
  text=re.sub(r"[.,]"," ",text)

  #Replacing multiple occurrences of whitespace with a single space
  text=re.sub(r"\s+"," ",text)

  return text


In [None]:
import os 
import nltk 
nltk.download("punkt")
from nltk import ngrams
from collections import Counter
import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


<b>Building the corpus from the JSON files and forming the bigrams and trigrams</b>

In [None]:
!pip install fasttext-langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext-langdetect
  Downloading fasttext-langdetect-1.0.5.tar.gz (6.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fasttext>=0.9.1
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.10.3-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext-langdetect, fasttext
  Building wheel for fasttext-langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext-langdetect: filename=fasttext_langdetect-1.0.5-py3-none-any.whl size=7522 sha256=b21b8f27c96b1995b3e7bac84067e74f855cdaca7b2829e843d80b0c35bd10e4
  Stored in directory: /root/.cache/pip/wheels/ba/0c/91/bd32e760105a77b98238686ace395c1e8cc1602b4b1e2be852
  Bu

In [None]:
from ftlangdetect import detect

In [None]:
#To store the number of words in the corpus
len_corpus=0

#Counter object to store the unigrams
unigrams=Counter()

#The corpus file is (re)created by this cell, so it has to be opened for writing:
#mode "r" fails when the file does not exist yet and rejects every f.write() call
with open("nlp_corpus.txt","w",encoding="utf-8") as f:
  for j in tqdm.tqdm(os.listdir("files/pdf_json/")):
    try:
      text=preprocessing(extract_body_text(f'files/pdf_json/{j}'))

      #Finding the language of the text
      lang=detect(text,low_memory=False)

      #Verifying if the text is in English
      if lang['lang']=='en':

        #Tokenising once and reusing the tokens for both the length and the counts
        tokens=text.split(" ")

        #Finding length of corpus
        len_corpus+=len(tokens)

        #Making the corpus
        f.write(text+ "\n")

        #Counter.update() counts every 1-tuple yielded by ngrams() directly
        unigrams.update(ngrams(tokens,1))
    except UnicodeDecodeError:
      #Files that cannot be decoded are skipped
      pass

100%|██████████| 56529/56529 [19:52<00:00, 47.40it/s]


In [None]:
print("Vocabulary size =",len(unigrams))

Vocabulary size = 2394602


In [None]:
#Length of the entire corpus
print(len_corpus)

113181943


In [None]:
#NOTE: len_corpus is intentionally NOT reset in this cell. It holds the corpus size
#computed while building the unigrams, and the model functions further down divide by it;
#resetting it to 0 here discarded that value and made 1/len_corpus fail.

#Counter object to store the bigrams
bigrams=Counter()


#Used all the files to build corpus, but used 30000 files to form the bigram counter object
for j in tqdm.tqdm(os.listdir("files/pdf_json/")[:30000]):
  try:
    text=preprocessing(extract_body_text(f'files/pdf_json/{j}'))

    #Finding the language of the text
    lang=detect(text,low_memory=False)

    #Verifying if the text is in English
    if lang['lang']=='en':

      #Counter.update() counts every 2-tuple yielded by ngrams() directly
      bigrams.update(ngrams(text.split(" "),2))
  except UnicodeDecodeError:
    #Files that cannot be decoded are skipped
    pass


100%|██████████| 30000/30000 [07:35<00:00, 65.92it/s]


In [None]:
bigrams.most_common(20)

[(('of', 'the'), 818933),
 (('in', 'the'), 629189),
 (('to', 'the'), 302799),
 (('et', 'al'), 298868),
 (('and', 'the'), 212646),
 (('on', 'the'), 177564),
 (('for', 'the'), 175473),
 (('with', 'the'), 156164),
 (('to', 'be'), 147456),
 (('that', 'the'), 125972),
 (('from', 'the'), 125093),
 (('it', 'is'), 124090),
 (('by', 'the'), 120847),
 (('in', 'a'), 117200),
 (('such', 'as'), 115129),
 (('can', 'be'), 109918),
 (('as', 'a'), 103364),
 (('number', 'of'), 102963),
 (('of', 'a'), 102018),
 (('has', 'been'), 97863)]

In [None]:
#Trigram counts are collected from the first 10000 listed files only

trigrams=Counter()
for fname in tqdm.tqdm(os.listdir("files/pdf_json/")[:10000]):
  try:
    text=preprocessing(extract_body_text(f'files/pdf_json/{fname}'))

    #Detecting the language of the text
    lang=detect(text,low_memory=False)
  except UnicodeDecodeError:
    #Files that cannot be decoded are skipped
    continue

  #Only English documents contribute to the counts
  if lang['lang']!='en':
    continue

  #Counting every 3-tuple of consecutive tokens
  trigrams.update(ngrams(text.split(" "),3))


100%|██████████| 10000/10000 [04:27<00:00, 37.40it/s]


In [None]:
trigrams.most_common(20)

[(('as', 'well', 'as'), 17351),
 (('the', 'number', 'of'), 14828),
 (('the', 'presence', 'of'), 11034),
 (('the', 'use', 'of'), 10309),
 (('in', 'order', 'to'), 8837),
 (('due', 'to', 'the'), 8447),
 (('one', 'of', 'the'), 8111),
 (('based', 'on', 'the'), 8020),
 (('in', 'patients', 'with'), 7832),
 (('in', 'this', 'study'), 7416),
 (('copyright', 'holder', 'for'), 6601),
 (('holder', 'for', 'this'), 6601),
 (('to', 'display', 'the'), 6587),
 (('a', 'license', 'to'), 6569),
 (('author/funder', 'who', 'has'), 6563),
 (('who', 'has', 'granted'), 6563),
 (('display', 'the', 'preprint'), 6560),
 (('license', 'to', 'display'), 6553),
 (('has', 'granted', 'medrxiv'), 6534),
 (('granted', 'medrxiv', 'a'), 6534)]

In [None]:
%cp nlp_corpus.txt '/content/drive/My Drive/'

<h3><b>Building the Bigram model</b></h3>

In [None]:
#counts_bigrams maps every previous word to a Counter of the words that follow it, i.e.,
#counts_bigrams[prev_word][next_word]=n means next_word appears after prev_word n times
counts_bigrams=Counter()

for (prev_word,next_word),freq in bigrams.items():
  #setdefault creates the inner Counter the first time prev_word is seen
  counts_bigrams.setdefault(prev_word,Counter())[next_word]+=freq
    

In [None]:
#Printing some objects from the counts_bigrams Counter variable
from itertools import islice

def take(n, iterable):
    """Return the first n items of iterable as a list (fewer if it is exhausted earlier)."""
    return list(islice(iterable, n))
# Keep only the first three (previous word, follower Counter) pairs for inspection
n_items=take(3,counts_bigrams.items())

In [None]:
n_items

[('to',
  Counter({'the': 302799,
           'balance': 644,
           'be': 147456,
           'have': 24138,
           'most': 797,
           'overcome': 2894,
           'reach': 3388,
           'achieve': 7306,
           'prevent': 12639,
           'patients': 4064,
           'minimize': 3946,
           'note': 2803,
           'suspect': 115,
           'define': 3163,
           'exclude': 1217,
           'decide': 855,
           'avoid': 9026,
           'undergo': 1049,
           'admit': 221,
           'date': 7604,
           'implantation': 14,
           'perform': 5947,
           'conception': 58,
           'acquisition': 64,
           'restrict': 644,
           'police': 42,
           'invade': 293,
           'early': 759,
           'inhibit': 4715,
           'lysosomes': 131,
           'restricted': 40,
           'membrane': 219,
           'require': 774,
           'shed': 419,
           'identify': 19619,
           'differentiate': 1760,
      

In [None]:
#probs_bigrams maps every previous word to a Counter of smoothed probabilities, i.e.,
#probs_bigrams[prev_word][next_word]=p means P(next_word|prev_word)=p.
probs_bigrams=Counter()

#First pass: collect the raw follower counts per previous word
for (prev_word,next_word),freq in bigrams.items():
  probs_bigrams.setdefault(prev_word,Counter())[next_word]+=freq

#Second pass: turn the counts of each previous word into probabilities
for followers in probs_bigrams.values():

  #Laplace smoothing: add one to every count and the vocabulary size
  #(len(unigrams)) to the total number of followers
  denom=sum(followers.values())+len(unigrams)
  for next_word,freq in followers.items():
    followers[next_word]=(freq+1)/denom

In [None]:
#Saving the Counter object
import pickle
# Writes the nested probs_bigrams Counter to bigram_model.pkl in the working directory.
# NOTE(review): pickle can execute arbitrary code on load - only unpickle this file from a trusted source.
with open("bigram_model.pkl","wb") as f:
  pickle.dump(probs_bigrams,f)           

In [None]:
#Function to predict the next word using the Bigram model

#P(W)=P(w1).P(w2|w1)....P(wn|wn-1)

def bigram_model(sent):
  """Print the 10 most probable next words after `sent` under the bigram model.

  The sentence probability is P(w1) multiplied by P(w_k|w_k-1) for every
  consecutive word pair; each candidate is printed together with that
  probability multiplied by the candidate's own conditional probability.

  The global model tables are only read. Unknown words and unseen word pairs
  are scored with 1/len(unigrams) without being written into probs_bigrams,
  so querying the model never changes it.

  Args:
    sent: The context, with words separated by single spaces.

  Returns:
    None. The candidates are printed, or "No such words." when the last
    word has no recorded followers.
  """
  tokens=sent.split(" ")
  vocab_size=len(unigrams)

  #Probability used for bigrams that are missing from the model
  unseen_prob=1/vocab_size

  #Storing the smoothed probability of the first word appearing in the corpus
  #.get() is used because indexing a Counter with a missing key returns 0,
  #which has no .values()
  first_followers=counts_bigrams.get(tokens[0])
  first_count=sum(first_followers.values()) if first_followers else 0
  val=(first_count+1)/(vocab_size+len_corpus)

  for a,b in zip(tokens,tokens[1:]):
    followers=probs_bigrams.get(a)
    prob=followers.get(b,0) if followers else 0

    #Unknown word or unseen pair: fall back to the smoothed probability
    val*=prob if prob else unseen_prob

  #Predict last word based on the previous word
  candidates=probs_bigrams.get(tokens[-1])
  if not candidates:
    print("No such words.")
    return

  #Display the 10 most probable next words
  for i,j in candidates.most_common(10):
    print(f"{i:{20}}{val*j}")

In [None]:
#Function to calculate the Perplexity of the bigram language model
def perplexity_bigram_model(line):
  """Return the perplexity of `line` under the bigram model.

  Perplexity is P(W)**(-1/N), where
  P(W)=P(w1).P(w2|w1)....P(wn|wn-1) and N is the number of words scored.

  The earlier version computed P(W) but then derived the result from the
  highest next-word probability after the last word, which is unrelated to
  the sentence; the perplexity is now taken from P(W) itself. P(W) is
  accumulated in log space so long sentences do not underflow to zero.
  The global model tables are only read, never modified.

  Args:
    line: The raw test sentence; it is passed through preprocessing().

  Returns:
    The perplexity as a float.

  Raises:
    ValueError: If the sentence contains no words after preprocessing.
  """
  import math

  #Preprocess the test sentence and split it into its non-empty words
  tokens=preprocessing(line).split()

  #To store the number of words in the test sentence
  num=len(tokens)
  if num==0:
    raise ValueError("cannot compute the perplexity of an empty sentence")

  vocab_size=len(unigrams)

  #Probability used for bigrams that are missing from the model
  unseen_prob=1/vocab_size

  #Smoothed P(w1)
  first_followers=counts_bigrams.get(tokens[0])
  first_count=sum(first_followers.values()) if first_followers else 0
  log_prob=math.log((first_count+1)/(vocab_size+len_corpus))

  for a,b in zip(tokens,tokens[1:]):
    followers=probs_bigrams.get(a)
    prob=followers.get(b,0) if followers else 0
    log_prob+=math.log(prob if prob else unseen_prob)

  #log_prob holds log P(W), so this is P(W)**(-1/num)
  return math.exp(-log_prob/num)

<h3><b>Building the Trigram model</b></h3>

In [None]:
#counts_trigrams maps every (second_last_word,last_word) pair to a Counter of the words that
#follow it, i.e., counts_trigrams[second_last_word,last_word][next_word]=n means next_word
#follows second_last_word and last_word n times

counts_trigrams=Counter()
for (word1,word2,word3),freq in trigrams.items():
  #setdefault creates the inner Counter the first time the word pair is seen
  counts_trigrams.setdefault((word1,word2),Counter())[word3]+=freq
    


In [None]:
n_items = take(3, counts_trigrams.items())
n_items

[(('to', 'the'),
  Counter({'excellent': 8,
           'accuracy': 31,
           'cd225/pfam04505': 1,
           'plasma': 182,
           'cytoplasm': 174,
           'overall': 186,
           'cell': 435,
           'wt': 44,
           'acyl': 1,
           'second': 113,
           'total': 276,
           'ha': 26,
           'single': 33,
           'chiropteran': 2,
           'antiviral': 58,
           "manufacturer's": 973,
           'understanding': 63,
           'clinical': 242,
           'european': 57,
           'corresponding': 201,
           'method': 89,
           'time': 200,
           'cells': 172,
           'subsequent': 31,
           'synergistic': 7,
           'action': 40,
           'ed': 210,
           'intensive': 236,
           'virus': 627,
           'worldwide': 15,
           'brazilian': 4,
           'dispersion': 4,
           'alignment': 13,
           'final': 136,
           'following': 282,
           'importance': 78,
           '

In [None]:
#probs_trigrams maps every (second_last_word,last_word) pair to a Counter of smoothed
#probabilities, i.e., probs_trigrams[second_last_word,last_word][next_word]=p means
#P(next_word|second_last_word,last_word)=p.

probs_trigrams=Counter()

#First pass: collect the raw follower counts per word pair
for (word1,word2,word3),freq in trigrams.items():
  probs_trigrams.setdefault((word1,word2),Counter())[word3]+=freq

#Second pass: turn the counts of each word pair into probabilities
for followers in probs_trigrams.values():

  #Laplace smoothing: add one to every count and the vocabulary size
  #(len(unigrams)) to the total number of followers
  denom=sum(followers.values())+len(unigrams)
  for next_word,freq in followers.items():
    followers[next_word]=(freq+1)/denom

In [None]:
#Saving the Counter object
import pickle
# Writes the nested probs_trigrams Counter to trigram_model.pkl in the working directory.
# NOTE(review): pickle can execute arbitrary code on load - only unpickle this file from a trusted source.
with open("trigram_model.pkl","wb") as f:
  pickle.dump(probs_trigrams,f)           

In [None]:
#Function to predict the next word using the Trigram model

#P(W)=P(w1,w2).P(w3|w1,w2)....P(wn|wn-2,wn-1)


def trigram_model(sent):
  """Print the 10 most probable next words after `sent` under the trigram model.

  The sentence probability is the smoothed P(w1,w2) multiplied by
  P(w_k|w_k-2,w_k-1) for every consecutive word triple; each candidate is
  printed together with that probability multiplied by the candidate's own
  conditional probability.

  The global model tables are only read. Unseen contexts and unseen triples
  are scored with 1/len(unigrams) without being written into probs_trigrams.
  An unseen first word pair is scored with the same add-one formula as a
  seen one, using a count of 0, instead of 1/len_corpus.

  Args:
    sent: The context, with words separated by single spaces.

  Returns:
    None. The candidates are printed, or "No such words." when the sentence
    has fewer than two words or the last two words have no recorded followers.
  """
  tokens=sent.split(" ")

  #The model conditions on two words, so shorter input cannot be used
  if len(tokens)<2:
    print("No such words.")
    return

  vocab_size=len(unigrams)

  #Probability used for trigrams that are missing from the model
  unseen_prob=1/vocab_size

  #P(w1,w2)
  #.get() is used because indexing a Counter with a missing key returns 0,
  #which has no .values()
  pair_followers=counts_trigrams.get((tokens[0],tokens[1]))
  pair_count=sum(pair_followers.values()) if pair_followers else 0
  val=(pair_count+1)/(vocab_size+len_corpus)

  for a,b,c in zip(tokens,tokens[1:],tokens[2:]):
    followers=probs_trigrams.get((a,b))
    prob=followers.get(c,0) if followers else 0

    #Smoothing the probabilities of unseen contexts and triples
    val*=prob if prob else unseen_prob

  #Using the last two words to predict the next word
  candidates=probs_trigrams.get((tokens[-2],tokens[-1]))
  if not candidates:
    print("No such words.")
    return

  for i,j in candidates.most_common(10):
    print(f"{i:{20}}{val*j}")

In [None]:
#Function to calculate the Perplexity of the trigram language model
def perplexity_trigram_model(line):
  """Return the perplexity of `line` under the trigram model.

  Perplexity is P(W)**(-1/N), where
  P(W)=P(w1,w2).P(w3|w1,w2)....P(wn|wn-2,wn-1) and N is the number of words.

  The earlier version computed P(W) but normally derived the result from
  the highest next-word probability after the last two words, which is
  unrelated to the sentence; the perplexity is now always taken from P(W).
  P(W) is accumulated in log space so long sentences do not underflow to
  zero. The global model tables are only read, never modified.

  Args:
    line: The raw test sentence; it is passed through preprocessing().

  Returns:
    The perplexity as a float.

  Raises:
    ValueError: If the sentence has fewer than two words after preprocessing.
  """
  import math

  #Preprocessing the test sentence and splitting it into its non-empty words
  tokens=preprocessing(line).split()

  num=len(tokens)
  if num<2:
    raise ValueError("the trigram model needs a sentence with at least two words")

  vocab_size=len(unigrams)

  #Probability used for trigrams that are missing from the model
  unseen_prob=1/vocab_size

  #Smoothed P(w1,w2); an unseen pair simply has a count of 0
  pair_followers=counts_trigrams.get((tokens[0],tokens[1]))
  pair_count=sum(pair_followers.values()) if pair_followers else 0
  log_prob=math.log((pair_count+1)/(vocab_size+len_corpus))

  for a,b,c in zip(tokens,tokens[1:],tokens[2:]):
    followers=probs_trigrams.get((a,b))
    prob=followers.get(c,0) if followers else 0
    log_prob+=math.log(prob if prob else unseen_prob)

  #log_prob holds log P(W), so this is P(W)**(-1/num)
  return math.exp(-log_prob/num)

<h3><b>Acid Test</b></h3>

In [None]:
perp_sents=["it appears that the overall code stroke volume has decreased since the covid- pandemic",
            "half a century ago hypertension was not treatable",
            "sarah s tv is broadcasting an advert for private healthcare"]

In [None]:
def acid_bigram_test(sentence):
  """Run the bigram predictor on the part of `sentence` before "<space>".

  Everything from the first "<space>" marker onwards is ignored; the
  remaining context is stripped of surrounding whitespace and handed to
  bigram_model(), which prints its own predictions.

  Args:
    sentence: A test sentence, optionally containing a "<space>" marker at
      the position of the word to predict.

  Returns:
    None.
  """
  context=sentence.split("<space>")[0].strip()

  #bigram_model() prints the candidates itself and returns None, so wrapping
  #the call in print() only added a stray "None" line to the output
  bigram_model(context)

In [None]:
def acid_trigram_test(sentence):
  """Run the trigram predictor on the part of `sentence` before "<space>".

  Everything from the first "<space>" marker onwards is ignored; the
  remaining context is stripped of surrounding whitespace and handed to
  trigram_model(), which prints its own predictions.

  Args:
    sentence: A test sentence, optionally containing a "<space>" marker at
      the position of the word to predict.

  Returns:
    None.
  """
  context=sentence.split("<space>")[0].strip()

  #trigram_model() prints the candidates itself and returns None, so wrapping
  #the call in print() only added a stray "None" line to the output
  trigram_model(context)

In [None]:
acid_bigram_test("all houses were <space> ventilated")

not                 1.2755924544624491e-14
used                1.1992239632691873e-14
also                9.482187017736875e-15
performed           9.148074868766356e-15
found               7.973067017134737e-15
collected           7.848125919410468e-15
obtained            6.521504151439286e-15
observed            5.9564615465626714e-15
identified          5.776069062769765e-15
detected            5.2440165230225925e-15
None


In [None]:
acid_trigram_test("all houses were <space> ventilated")

foreclosed          1.207125736318781e-20
temperature         1.207125736318781e-20
observed            1.207125736318781e-20
reduced             1.207125736318781e-20
all                 1.207125736318781e-20
destroyed           1.207125736318781e-20
None


In [None]:
acid_bigram_test("""it aims to develop an integrated <space> to reach mmps exposed
to malaria with prevention diagnosis and treatment""")

into                4.47502740295565e-22
with                2.424192282917221e-22
in                  1.3724819649487955e-22
and                 1.1831741077144788e-22
dna                 9.307636314020566e-23
approach            8.098169448356877e-23
the                 6.099919844216868e-23
moving              4.4171833354673876e-23
to                  4.4171833354673876e-23
care                3.9964992082800175e-23
None


In [None]:
acid_trigram_test("""it aims to develop an integrated <space> to reach mmps exposed
to malaria with prevention diagnosis and treatment""")

approach            1.0301768160097833e-31
and                 3.746097512762848e-32
system              2.809573134572136e-32
knowledge           2.1852235491116615e-32
analysis            2.1852235491116615e-32
view                1.873048756381424e-32
model               1.560873963651187e-32
health              1.560873963651187e-32
one                 1.560873963651187e-32
framework           1.2486991709209496e-32
None


In [None]:
acid_bigram_test("""it aims to develop an integrated dna to reach mmps exposed to malaria with prevention diagnosis and treatment <space> by involving""")

of                  2.7421933416214513e-70
with                1.0608700943382156e-70
and                 7.545398239366032e-71
for                 6.3993468329950864e-71
in                  4.695720592340506e-71
is                  3.728361843703835e-71
was                 3.0297138585773505e-71
the                 2.943726414254091e-71
group               1.7949879002480446e-71
options             1.5410562287309185e-71
None


In [None]:
acid_trigram_test("""it aims to develop an integrated dna to reach mmps exposed to malaria with prevention diagnosis and treatment <space> by involving""")

of                  7.442176628863933e-100
with                8.854396892774861e-101
for                 5.828210612965731e-101
and                 5.828210612965731e-101
in                  5.267805746334411e-101
options             4.819481853029354e-101
is                  4.1469960130717705e-101
strategies          3.922834066419242e-101
the                 3.6986721197667135e-101
are                 2.689943359830337e-101
None


In [None]:
acid_bigram_test("""it aims to develop an integrated dna to reach mmps exposed to malaria with prevention diagnosis and treatment options by involving stakeholders from provincial to community level""")

of                  4.5475636484475573e-113
and                 5.440579501559133e-114
in                  4.642100543905595e-114
was                 3.320090571564123e-114
the                 3.0922227558186556e-114
is                  2.024445470465929e-114
for                 1.4896152252451625e-114
to                  1.3973381758936922e-114
3                   1.1977184364803076e-114
as                  1.1619375397930029e-114
None


In [None]:
acid_trigram_test("""it aims to develop an integrated dna to reach mmps exposed to malaria with prevention diagnosis and treatment options by involving stakeholders from provincial to community level""")

and                 1.072207137599176e-155
in                  1.072207137599176e-155
as                  1.072207137599176e-155
the                 8.935059479993132e-156
this                7.148047583994506e-156
to                  7.148047583994506e-156
of                  5.36103568799588e-156
can                 5.36103568799588e-156
institutions        5.36103568799588e-156
based               5.36103568799588e-156
None


In [None]:
acid_bigram_test("this is because engineers do not work in <space> but rather as a team")

the                 1.1077731393866657e-27
a                   2.058890094119431e-28
this                1.4964749896376743e-28
addition            8.437734002154638e-29
patients            5.555873109830185e-29
our                 5.193379345897188e-29
order               4.8069439683971895e-29
which               4.612928225861623e-29
vitro               4.0596109345353277e-29
an                  3.8058298307250944e-29
None


In [None]:
acid_trigram_test("this is because engineers do not work in <space> but rather as a team")

the                 3.4535619383605927e-41
this                1.4800979735831113e-41
a                   1.2929591493369707e-41
ensuring            7.825805377565876e-42
our                 3.5726502810626823e-42
an                  2.8921454656221714e-42
addition            2.3817668540417882e-42
progress            2.3817668540417882e-42
concert             2.2116406501816605e-42
these               1.871388242461405e-42
None


<b>To find perplexities of the Bigram and Trigram models</b>

In [None]:
def perp_test(sent):
  """Return the perplexity that the bigram model assigns to `sent`."""
  score=perplexity_bigram_model(sent)
  return score

In [None]:
# Print every test sentence followed by its bigram-model perplexity
for sentence in perp_sents:
  print(sentence," Perplexity Score=",perp_test(sentence))

it appears that the overall code stroke volume has decreased since the covid- pandemic  Perplexity Score= 1.59929959356214
half a century ago hypertension was not treatable  Perplexity Score= 4.066929809204416
sarah s tv is broadcasting an advert for private healthcare  Perplexity Score= 1.8622067658735522


In [None]:
def perp_trigram_test(sent):
  """Return the perplexity that the trigram model assigns to `sent`."""
  score=perplexity_trigram_model(sent)
  return score

In [None]:
# Print every test sentence followed by its trigram-model perplexity
for sentence in perp_sents:
  # NOTE(review): perplexity_trigram_model() applies preprocessing() to its input as well,
  # so the text is cleaned twice; this call mainly determines the sentence that is printed.
  sentence=preprocessing(sentence)
  print(sentence,": Perplexity Score=",perp_trigram_test(sentence))

it appears that the overall code stroke volume has decreased since the covid pandemic : Perplexity Score= 2.484828547704294
half a century ago hypertension was not treatable : Perplexity Score= 5.751425635388893
sarah s tv is broadcasting an advert for private healthcare : Perplexity Score= 3.7819657270890557
