# Importing Libraries

In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import gc
import gzip
import json
import math
import re
from collections import Counter, defaultdict
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

import contractions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import nltk
nltk.download("punkt")
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus.reader.nombank import NombankTreePointer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Reading Data

In [None]:
def unzip_read_and_extract(filename:str, count=None):
    """Read JSON papers from a zip archive and return their lower-cased body text.

    Each archive member is parsed as JSON. The ``text`` field of every entry
    under ``body_text`` is collected, the paragraphs are joined with a single
    space, a trailing newline is appended and the result is lower-cased.
    Members without a ``body_text`` key contribute just ``"\\n"``.

    Args:
        filename: Path (or file-like object) accepted by ``ZipFile``.
        count: Maximum number of documents to return. ``0`` returns an empty
            list; ``None`` or a negative value means "no limit".

    Returns:
        A list of strings, one per successfully parsed member, in archive order.

    Members that are not valid JSON are reported with ``print`` and skipped.
    """
    limit = count if count is not None and count >= 0 else None
    data = []
    with ZipFile(filename, "r") as z:
        # A separate loop name keeps the `filename` argument intact.
        for member in z.namelist():
            # Checking before reading makes count=0 return nothing instead of
            # never matching the limit and reading the whole archive.
            if limit is not None and len(data) >= limit:
                break
            with z.open(member) as f:
                try:
                    paper_content = json.load(f)
                except ValueError:
                    print("Not a valid JSON")
                    continue
            paragraphs = []
            if 'body_text' in paper_content:
                paragraphs = [bt['text'] for bt in paper_content['body_text']]
            # Join with a space so the last word of one paragraph does not fuse
            # with the first word of the next once punctuation is stripped.
            data.append((" ".join(paragraphs) + '\n').lower())
    return data

In [None]:
data = unzip_read_and_extract("/content/drive/MyDrive/pdf_json.zip",50000)

Not a valid JSON
Not a valid JSON


In [None]:
data[5]

'coronavirus", "coronavirus disease 2019", "sars-cov-2", "severe acute respiratory syndrome coronavirus 2", "autoimmune disease", "clinical characteristic", "clinical feature", "risk factor", and "comorbidities". in addition, we searched the reference lists of eligible studies and relevant reviews to find potentially eligible studies (see search strategy of pubmed in appendix table 1 ). study inclusion criteria: (1) patient was diagnosed as covid-19 by the laboratory;(2) provided data of autoimmune disease with severe or non-severe or between death and survivors. study exclusion criteria: (1) studies did not provide the prevalence of autoimmune disease; (2) studies without comparisons (severe versus non-severe patients, death versus survival); (3) studies sample size is less than 10 patients; (4) abstracts, news, comments, editorials and review articles. according to the published studies [2] , the severity of disease was defined mainly on the basis of the symptoms present at diagnosis

# Preprocessing

---

- Expanded contractions, e.g., "wouldn't" → "would not"
- Removed newline characters (replaced them with an empty string)
- Identified citation patterns using regex and removed them
- Removed URLs and emails
- Removed punctuation marks
- Discarded symbols and numbers, i.e., kept only purely alphabetic words


In [None]:
def preprocessing(text):
  """Clean a raw document and return it as space-separated alphabetic words.

  Steps, in order: expand contractions, delete newline characters, strip
  bracketed numeric citations, URLs and e-mail addresses, drop every
  character that is neither a word character nor whitespace, then keep only
  the tokens whose encoded bytes are entirely alphabetic.

  NOTE(review): newlines are deleted rather than replaced by a space, so
  "foo\\nbar" becomes "foobar".
  """
  cleaned = contractions.fix(text).replace("\n", "")
  removal_patterns = (
      # bracketed numeric citations such as [2] or [3, 14]
      r'\[\d+(,[ \t]*\d+)*\]',
      # URLs
      r'https?://\S+|www\.\S+',
      # e-mail addresses
      r'''([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|"([]!#-[^-~ \t]|(\\[\t -~]))+")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])''',
      # anything that is not a word character or whitespace
      r'[^\w\s]',
  )
  for pattern in removal_patterns:
    cleaned = re.sub(pattern, r'', cleaned)
  # bytes.isalpha() only accepts ASCII letters, so tokens containing digits,
  # underscores or non-ASCII characters are dropped here.
  kept = [token for token in cleaned.split() if token.encode().isalpha()]
  return " ".join(kept)

In [None]:
text = list(map(preprocessing,data))

In [None]:
text[1]

'in the last decade two viral pandemics have had a significant impact on worldwide health resulting mainly in severe acute respiratory failure arf the first was the severe acute respiratory syndrome or sars in a virulent clinical entity with a high mortality rate the second was influenza a in which according to the who report caused more than deaths within the first seasonthe use of noninvasive ventilation niv in adults has proved effective in treating chronic obstructive pulmonary disease exacerbation cardiogenic pulmonary oedema and arf in immunocompromised patients in these patients niv has achieved significant reductions in the rate of endotracheal intubation and ventilatorassociated complications and has improved survival rates nevertheless some metaanalyses argue against the use of niv in arf because it offers no advantages over conventional ventilation moreover delaying intubation in hypoxaemic intubated patients with pneumonia may increase the risk of complications early use of

## Saving Corpus in Drive

In [None]:
corpora = "\n".join(text)
with open("/content/drive/MyDrive/corpora.txt", "w") as f:
    f.write(corpora)

In [None]:
import gc
del corpora, text, data
gc.collect()

0

Because the runtime repeatedly crashed from running out of memory, variables that are no longer required are deleted to free memory.

In [None]:
with open("/content/drive/MyDrive/corpora.txt", "r") as f:
    content = f.readlines()

In [None]:
content[1]

'in the last decade two viral pandemics have had a significant impact on worldwide health resulting mainly in severe acute respiratory failure arf the first was the severe acute respiratory syndrome or sars in a virulent clinical entity with a high mortality rate the second was influenza a in which according to the who report caused more than deaths within the first seasonthe use of noninvasive ventilation niv in adults has proved effective in treating chronic obstructive pulmonary disease exacerbation cardiogenic pulmonary oedema and arf in immunocompromised patients in these patients niv has achieved significant reductions in the rate of endotracheal intubation and ventilatorassociated complications and has improved survival rates nevertheless some metaanalyses argue against the use of niv in arf because it offers no advantages over conventional ventilation moreover delaying intubation in hypoxaemic intubated patients with pneumonia may increase the risk of complications early use of

In [None]:
content = list(map(lambda text: text.strip(), content)) # removing \n from corpus 

In [None]:
content[1]

'in the last decade two viral pandemics have had a significant impact on worldwide health resulting mainly in severe acute respiratory failure arf the first was the severe acute respiratory syndrome or sars in a virulent clinical entity with a high mortality rate the second was influenza a in which according to the who report caused more than deaths within the first seasonthe use of noninvasive ventilation niv in adults has proved effective in treating chronic obstructive pulmonary disease exacerbation cardiogenic pulmonary oedema and arf in immunocompromised patients in these patients niv has achieved significant reductions in the rate of endotracheal intubation and ventilatorassociated complications and has improved survival rates nevertheless some metaanalyses argue against the use of niv in arf because it offers no advantages over conventional ventilation moreover delaying intubation in hypoxaemic intubated patients with pneumonia may increase the risk of complications early use of

## Vocabulary

Each sentence is padded with a start tag (`<s>`) and an end tag (`</s>`).

In [None]:
def padding(text):
    """Return `text` (converted with ``str``) wrapped as ``'<s> ' + text + ' </s>'``."""
    return f"<s> {text!s} </s>"

In [None]:
def vocab_count(corp):
  """Count token frequencies over a corpus of documents.

  Every document is split on whitespace and framed by the start tag ``<s>``
  and the end tag ``</s>`` (the same tags `padding` adds), so both tags are
  counted once per document.

  Args:
    corp: Iterable of documents (each converted with ``str``).

  Returns:
    A tuple ``(counts, size)`` where ``counts`` is a plain ``dict`` mapping
    token -> frequency (in first-seen order) and ``size`` is ``len(counts)``.
  """
  count = Counter()
  for doc in corp:
    # Whitespace splitting (rather than split(' ') on the padded string) keeps
    # an empty document from adding a bogus '' token to the vocabulary.
    count.update(['<s>', *str(doc).split(), '</s>'])
  # Return a plain dict, as before, so callers get the same type.
  return dict(count), len(count)

In [None]:
vocab_50,vocab_50_count=vocab_count(content)

In [None]:
vocab_50_count

1653831

The vocabulary size for corpus of size 50000 is 1653831. 

In [None]:
vocab_10,vocab_10_count=vocab_count(content[:10000])

In [None]:
vocab_10_count

547503

The vocabulary size for corpus of size 10000 is 547503

In [None]:
with open("/content/drive/MyDrive/vocab_50.txt", "w") as f:
     f.write(json.dumps(vocab_50))

In [None]:
with open("/content/drive/MyDrive/vocab_10.txt", "w") as f:
     f.write(json.dumps(vocab_10))

In [None]:
import gc
del vocab_50_count,vocab_10_count,vocab_50,vocab_10
gc.collect()

0

# Bigram & Trigram Model

## Bigram Model

In [None]:
def bigram_model (text):
    """Count adjacent word pairs over a corpus of documents.

    Every document is split on whitespace and framed by the start tag ``<s>``
    and the end tag ``</s>`` (the same tags `padding` adds) before its
    consecutive token pairs are counted.

    Args:
        text: Iterable of documents (each converted with ``str``).

    Returns:
        A ``Counter`` mapping ``(word, next_word)`` -> frequency.
    """
    bigram_freq = Counter()
    for doc in text:
        # Whitespace splitting keeps an empty document from producing
        # ('<s>', '') and ('', '</s>') pairs; it contributes ('<s>', '</s>').
        tokens = ['<s>', *str(doc).split(), '</s>']
        bigram_freq.update(zip(tokens, tokens[1:]))
    return bigram_freq

In [None]:
bi_model=bigram_model(content)

In [None]:
bi_model.most_common(10)

[(('of', 'the'), 1384639),
 (('in', 'the'), 1054897),
 (('to', 'the'), 514503),
 (('et', 'al'), 505727),
 (('and', 'the'), 370868),
 (('on', 'the'), 296200),
 (('for', 'the'), 293120),
 (('with', 'the'), 264445),
 (('to', 'be'), 245346),
 (('that', 'the'), 211801)]

Saving bigram model in Drive

In [None]:
# Writes the textual repr of the whole bigram Counter to Drive.
# NOTE(review): the pickle cell that follows opens this same path in 'wb' mode,
# so this text dump is overwritten before anything reads it.
with open(r"/content/drive/MyDrive/Bigram_Model_50k.txt",'w+') as f:
  f.write(str(bi_model))

In [None]:
import pickle
with open('/content/drive/MyDrive/Bigram_Model_50k.txt', 'wb') as f:
            pickle.dump(dict(bi_model), f)

In [None]:
bi_model_10k=bigram_model(content[:10000])

In [None]:
bi_model_10k.most_common(10)

[(('of', 'the'), 282592),
 (('in', 'the'), 214487),
 (('to', 'the'), 104874),
 (('et', 'al'), 99407),
 (('and', 'the'), 75389),
 (('on', 'the'), 60266),
 (('for', 'the'), 60214),
 (('with', 'the'), 54370),
 (('to', 'be'), 50162),
 (('that', 'the'), 43271)]

In [None]:
import gc
del bi_model
gc.collect()

719

In [None]:
import pickle
with open('/content/drive/MyDrive/Bigram_Model_10k.txt', 'wb') as f:
            pickle.dump(dict(bi_model_10k), f)

In [None]:
import gc
del content,bi_model_10k
gc.collect()

0

### Computing Probability for next word using bigram model



In [None]:
import pickle
with open("/content/drive/MyDrive/Bigram_Model_50k.txt" , 'rb') as bi:
    bigram = pickle.load(bi)

In [None]:
from pathlib import Path
vocab_50 = json.loads(Path("/content/drive/MyDrive/vocab_50.txt").read_text())

In [None]:
def predict_bigram_next_word(first_word, bigram_counts=None, vocab=None):
  """Return add-one smoothed probabilities of the words seen after `first_word`.

  For every bigram ``(first_word, w)`` the probability is
  ``(count(first_word, w) + 1) / (count(first_word) + V)`` with ``V`` the
  vocabulary size.

  Args:
    first_word: The preceding word.
    bigram_counts: Mapping ``(word, next_word)`` -> frequency. Defaults to the
      notebook-level ``bigram``.
    vocab: Mapping word -> frequency. Defaults to the notebook-level
      ``vocab_50``.

  Returns:
    A ``Counter`` keyed by the matching bigram tuples; empty when no bigram
    starts with `first_word`.
  """
  if bigram_counts is None:
    bigram_counts = bigram
  if vocab is None:
    vocab = vocab_50
  followers = {pair: freq for pair, freq in bigram_counts.items()
               if pair[0] == first_word}
  if not followers:
    return Counter()
  # The denominator is the same for every candidate, so compute it once.
  denominator = vocab.get(first_word, 0) + len(vocab)
  return Counter({pair: (freq + 1) / denominator
                  for pair, freq in followers.items()})

In [None]:
predict_bigram_next_word('were').most_common(10)

[(('were', 'not'), 0.011101136090804252),
 (('were', 'used'), 0.010686431280836737),
 (('were', 'also'), 0.008229332117724641),
 (('were', 'performed'), 0.007927338278542086),
 (('were', 'found'), 0.0069923739399354295),
 (('were', 'collected'), 0.0066971785402605375),
 (('were', 'obtained'), 0.005596546965230276),
 (('were', 'observed'), 0.005123518700539116),
 (('were', 'identified'), 0.005063406182787138),
 (('were', 'detected'), 0.004509512269215342)]

In [None]:
predict_bigram_next_word('integrated').most_common(10)

[(('integrated', 'into'), 0.0009600247841295115),
 (('integrated', 'with'), 0.0005007258723977565),
 (('integrated', 'in'), 0.00027738051924192266),
 (('integrated', 'and'), 0.00022454613462441356),
 (('integrated', 'dna'), 0.0001735129222097741),
 (('integrated', 'approach'), 0.00015850315385252723),
 (('integrated', 'the'), 0.00010987150437504728),
 (('integrated', 'to'), 0.00010086564336069914),
 (('integrated', 'care'), 9.065900087777126e-05),
 (('integrated', 'moving'), 7.805079545768387e-05)]

In [None]:
predict_bigram_next_word('treatment').most_common(10)

[(('treatment', 'of'), 0.019140245173176886),
 (('treatment', 'with'), 0.007537260522482551),
 (('treatment', 'and'), 0.005373807747620392),
 (('treatment', 'for'), 0.0044400841657130825),
 (('treatment', 'in'), 0.0032489880580236293),
 (('treatment', 'is'), 0.0025345480445945513),
 (('treatment', 'was'), 0.0021204666659048873),
 (('treatment', 'the'), 0.001947978102114318),
 (('treatment', 'group'), 0.0011296096480417107),
 (('treatment', 'or'), 0.0010757409798547504)]

In [None]:
predict_bigram_next_word('health').most_common(10)

[(('health', 'care'), 0.012729269916687777),
 (('health', 'and'), 0.009255150755828497),
 (('health', 'organization'), 0.004969222657243689),
 (('health', 'system'), 0.0026834648534327884),
 (('health', 'services'), 0.0024311958871167773),
 (('health', 'of'), 0.002156185693229302),
 (('health', 'systems'), 0.002101183654451807),
 (('health', 'emergency'), 0.0016770333169561238),
 (('health', 'professionals'), 0.001601934379394544),
 (('health', 'authorities'), 0.001557509655766567)]

In [None]:
predict_bigram_next_word('in').most_common(10)

[(('in', 'the'), 0.16794224902623878),
 (('in', 'a'), 0.03144294330772192),
 (('in', 'this'), 0.021148282031434824),
 (('in', 'addition'), 0.011959917940736604),
 (('in', 'patients'), 0.01172557206979304),
 (('in', 'our'), 0.007692339532779288),
 (('in', 'which'), 0.007223170183818226),
 (('in', 'order'), 0.006825801098305227),
 (('in', 'vitro'), 0.00599810803917779),
 (('in', 'an'), 0.005768060631899631)]

The blanks are filled in using the bigram model: each blank gets the most probable next word given the word preceding it.

all houses were *not* ventilated \
it aims to develop an integrated *into* to reach mmps exposed \
to malaria with prevention diagnosis and treatment *of* by \
involving non-health *care* stakeholders from provincial to  \
community level this is because engineers do not work in *the* but rather
as a team


### Computing Perplexity Score for Bigram Model

In [None]:
bigram

{('<s>', 'introduction'): 303,
 ('introduction', 'severe'): 34,
 ('severe', 'acute'): 17364,
 ('acute', 'respiratory'): 27750,
 ('respiratory', 'syndrome'): 20392,
 ('syndrome', 'coronavirus'): 8981,
 ('coronavirus', 'is'): 1768,
 ('is', 'responsible'): 5273,
 ('responsible', 'for'): 22676,
 ('for', 'the'): 293120,
 ('the', 'outbreak'): 21148,
 ('outbreak', 'of'): 9638,
 ('of', 'coronavirus'): 6004,
 ('coronavirus', 'disease'): 8667,
 ('disease', 'the'): 6625,
 ('the', 'numbers'): 4372,
 ('numbers', 'are'): 1246,
 ('are', 'continuously'): 365,
 ('continuously', 'evolving'): 78,
 ('evolving', 'worldwide'): 6,
 ('worldwide', 'about'): 32,
 ('about', 'million'): 950,
 ('million', 'confirmed'): 621,
 ('confirmed', 'cases'): 13595,
 ('cases', 'and'): 17990,
 ('and', 'deaths'): 6480,
 ('deaths', 'as'): 826,
 ('as', 'of'): 11324,
 ('of', 'th'): 476,
 ('th', 'may'): 308,
 ('may', 'and'): 1614,
 ('and', 'italy'): 1113,
 ('italy', 'has'): 305,
 ('has', 'registered'): 29,
 ('registered', 'a'): 84

In [None]:
def preplexity_bigram(sentence):
  """Perplexity of `sentence` under the add-one smoothed 50k bigram model.

  The sentence is cleaned with `preprocessing`, framed by ``<s>`` / ``</s>``
  and scored pair by pair with
  ``(count(w1, w2) + 1) / (count(w1) + V)``; unseen pairs and unseen first
  words fall back to a count of zero. Reads the notebook-level ``bigram``
  and ``vocab_50``.

  Args:
    sentence: Raw sentence text.

  Returns:
    The perplexity as a float.
  """
  tokens = ['<s>', *preprocessing(sentence).split(), '</s>']
  vocab_size = len(vocab_50)
  # Accumulate log-probabilities: the plain product underflows to 0.0 for long
  # sentences, and 0.0 raised to a negative power raises ZeroDivisionError.
  log_p = 0.0
  for pair in zip(tokens, tokens[1:]):
    history_count = vocab_50.get(pair[0], 0)
    log_p += math.log((bigram.get(pair, 0) + 1) / (history_count + vocab_size))
  # NOTE(review): the exponent uses the token count including both tags, i.e.
  # one more than the number of bigram factors; kept so that previously
  # reported scores stay comparable.
  return math.exp(-log_p / len(tokens))

In [None]:
preplexity_bigram('it appears that the overall code stroke volume has decreased since the covid- pandemic.')

1945.4569333654485

In [None]:
preplexity_bigram('half a century ago hypertension was not treatable.')

9711.97972815161

In [None]:
preplexity_bigram('sarahs tv is broadcasting an advert for private healthcare.')

73773.3133350428

## Trigram Model

In [None]:
def trigram_model(text):
    """Count consecutive word triples over a corpus of documents.

    Every document is split on whitespace and framed by the start tag ``<s>``
    and the end tag ``</s>`` (the same tags `padding` adds) before its
    consecutive token triples are counted.

    Args:
        text: Iterable of documents (each converted with ``str``).

    Returns:
        A ``Counter`` mapping ``(w1, w2, w3)`` -> frequency.
    """
    trigram_freq = Counter()
    for doc in text:
        # Whitespace splitting keeps an empty document from producing a
        # ('<s>', '', '</s>') triple; it now contributes nothing.
        tokens = ['<s>', *str(doc).split(), '</s>']
        trigram_freq.update(zip(tokens, tokens[1:], tokens[2:]))
    return trigram_freq

In [None]:
# `content` was deleted after the 10k bigram model was pickled, so this cell
# fails with NameError on a fresh Restart & Run All. Reload the cleaned corpus
# from Drive first; only the first 10,000 lines are read to keep memory low.
with open("/content/drive/MyDrive/corpora.txt", "r") as f:
    content = [line.strip() for _, line in zip(range(10000), f)]
tri_model = trigram_model(content[:10000])

In [None]:
import gc
del content,f
gc.collect()

0

In [None]:
tri_model.most_common(10)

[(('as', 'well', 'as'), 17346),
 (('the', 'number', 'of'), 14808),
 (('the', 'presence', 'of'), 11073),
 (('the', 'use', 'of'), 10199),
 (('testing', 'testing', 'testing'), 9873),
 (('due', 'to', 'the'), 8672),
 (('in', 'order', 'to'), 8537),
 (('one', 'of', 'the'), 8006),
 (('in', 'patients', 'with'), 7877),
 (('based', 'on', 'the'), 7841)]

Saving trigram model in Drive

In [None]:
import pickle
with open('/content/drive/MyDrive/Trigram_Model_10k.txt', 'wb') as f:
            pickle.dump(dict(tri_model), f)

### Computing Probability for next word using trigram model

In [None]:
import pickle
with open("/content/drive/MyDrive/Trigram_Model_10k.txt" , 'rb') as tri:
    tri_model = pickle.load(tri)

In [None]:
import pickle
with open("/content/drive/MyDrive/Bigram_Model_10k.txt" , 'rb') as bi:
    bigram_10k = pickle.load(bi)

In [None]:
from pathlib import Path
vocab_10 = json.loads(Path("/content/drive/MyDrive/vocab_10.txt").read_text())

In [None]:
def predict_trigram_next_word(first_second_word, trigram_counts=None, bigram_counts=None, vocab=None):
  """Return add-one smoothed probabilities of the words seen after a word pair.

  For every trigram ``(w1, w2, w)`` whose first two words equal
  `first_second_word` the probability is
  ``(count(w1, w2, w) + 1) / (count(w1, w2) + V)`` with ``V`` the vocabulary
  size.

  Args:
    first_second_word: Tuple ``(w1, w2)`` of the two preceding words.
    trigram_counts: Mapping ``(w1, w2, w3)`` -> frequency. Defaults to the
      notebook-level ``tri_model``.
    bigram_counts: Mapping ``(w1, w2)`` -> frequency. Defaults to the
      notebook-level ``bigram_10k``.
    vocab: Mapping word -> frequency. Defaults to the notebook-level
      ``vocab_10``.

  Returns:
    A ``Counter`` keyed by the matching trigram tuples; empty when no trigram
    starts with the given pair.
  """
  if trigram_counts is None:
    trigram_counts = tri_model
  if bigram_counts is None:
    bigram_counts = bigram_10k
  if vocab is None:
    vocab = vocab_10
  followers = {tri: freq for tri, freq in trigram_counts.items()
               if (tri[0], tri[1]) == first_second_word}
  if not followers:
    return Counter()
  # The denominator is the same for every candidate, so compute it once.
  denominator = bigram_counts.get(first_second_word, 0) + len(vocab)
  return Counter({tri: (freq + 1) / denominator
                  for tri, freq in followers.items()})

In [None]:
predict_trigram_next_word(('houses','were')).most_common(10)

[(('houses', 'were', 'built'), 3.6529146606076987e-06),
 (('houses', 'were', 'present'), 3.6529146606076987e-06),
 (('houses', 'were', 'malaria'), 3.6529146606076987e-06),
 (('houses', 'were', 'contacted'), 3.6529146606076987e-06),
 (('houses', 'were', 'foreclosed'), 3.6529146606076987e-06)]

In [None]:
predict_trigram_next_word(('an','integrated')).most_common(10)

[(('an', 'integrated', 'approach'), 5.29371927589222e-05),
 (('an', 'integrated', 'and'), 2.1905045279554013e-05),
 (('an', 'integrated', 'system'), 1.8254204399628345e-05),
 (('an', 'integrated', 'one'), 1.8254204399628345e-05),
 (('an', 'integrated', 'health'), 1.8254204399628345e-05),
 (('an', 'integrated', 'analysis'), 1.642878395966551e-05),
 (('an', 'integrated', 'model'), 1.4603363519702676e-05),
 (('an', 'integrated', 'view'), 1.4603363519702676e-05),
 (('an', 'integrated', 'practice'), 1.277794307973984e-05),
 (('an', 'integrated', 'part'), 1.0952522639777006e-05)]

In [None]:
predict_trigram_next_word(('and','treatment')).most_common(10)

[(('and', 'treatment', 'of'), 0.0014530073068627772),
 (('and', 'treatment', 'with'), 0.00018730882679207265),
 (('and', 'treatment', 'for'), 0.0001091119379371297),
 (('and', 'treatment', 'in'), 0.0001091119379371297),
 (('and', 'treatment', 'strategies'), 9.092661494760808e-05),
 (('and', 'treatment', 'and'), 8.365248575179944e-05),
 (('and', 'treatment', 'is'), 5.8193033566469174e-05),
 (('and', 'treatment', 'are'), 5.091890437066053e-05),
 (('and', 'treatment', 'options'), 4.546330747380404e-05),
 (('and', 'treatment', 'plan'), 4.364477517485188e-05)]

In [None]:
predict_trigram_next_word(('non','health')).most_common(10)

[]

In [None]:
predict_trigram_next_word(('work','in')).most_common(10)

[(('work', 'in', 'the'), 0.0003154073768862911),
 (('work', 'in', 'a'), 0.00013491413809008982),
 (('work', 'in', 'this'), 0.00011850566183588971),
 (('work', 'in', 'an'), 4.1932772649622516e-05),
 (('work', 'in', 'our'), 3.82864445931336e-05),
 (('work', 'in', 'addition'), 3.281695250840023e-05),
 (('work', 'in', 'ensuring'), 3.281695250840023e-05),
 (('work', 'in', 'concert'), 2.9170624451911316e-05),
 (('work', 'in', 'progress'), 2.1877968338933487e-05),
 (('work', 'in', 'these'), 2.005480431068903e-05)]

The blanks are filled in using the trigram model: each blank gets the most probable next word given the two words preceding it.

all houses were *built* ventilated \
it aims to develop an integrated *approach* to reach mmps exposed \
to malaria with prevention diagnosis and treatment *of* by \
involving non-health *____* stakeholders from provincial to  \
community level this is because engineers do not work in *the* but rather
as a team

Note that the trigram model produced no prediction for "non health", so that blank was left unfilled.


### Computing Perplexity Score for Trigram Model

In [None]:
tri_model

Counter({('<s>', 'introduction', 'severe'): 2,
         ('introduction', 'severe', 'acute'): 3,
         ('severe', 'acute', 'respiratory'): 3330,
         ('acute', 'respiratory', 'syndrome'): 3072,
         ('respiratory', 'syndrome', 'coronavirus'): 1837,
         ('syndrome', 'coronavirus', 'is'): 88,
         ('coronavirus', 'is', 'responsible'): 4,
         ('is', 'responsible', 'for'): 1068,
         ('responsible', 'for', 'the'): 1523,
         ('for', 'the', 'outbreak'): 71,
         ('the', 'outbreak', 'of'): 737,
         ('outbreak', 'of', 'coronavirus'): 74,
         ('of', 'coronavirus', 'disease'): 251,
         ('coronavirus', 'disease', 'the'): 53,
         ('disease', 'the', 'numbers'): 1,
         ('the', 'numbers', 'are'): 22,
         ('numbers', 'are', 'continuously'): 1,
         ('are', 'continuously', 'evolving'): 3,
         ('continuously', 'evolving', 'worldwide'): 1,
         ('evolving', 'worldwide', 'about'): 1,
         ('worldwide', 'about', 'million'):

In [None]:
def preplexity_trigram(sentence):
  """Perplexity of `sentence` under the add-one smoothed 10k trigram model.

  The sentence is cleaned with `preprocessing` and framed by ``<s>`` /
  ``</s>``. The score starts from ``(count('<s>') + 1) / V`` and is then
  multiplied, triple by triple, by
  ``(count(w1, w2, w3) + 1) / (count(w1, w2) + V)``; unseen triples and
  unseen contexts fall back to a count of zero. Reads the notebook-level
  ``tri_model``, ``bigram_10k`` and ``vocab_10``.

  Args:
    sentence: Raw sentence text.

  Returns:
    The perplexity as a float.
  """
  tokens = ['<s>', *preprocessing(sentence).split(), '</s>']
  vocab_size = len(vocab_10)
  # Accumulate log-probabilities: the plain product underflows to 0.0 for long
  # sentences, and 0.0 raised to a negative power raises ZeroDivisionError.
  log_p = math.log((vocab_10[tokens[0]] + 1) / vocab_size)
  for trigram in zip(tokens, tokens[1:], tokens[2:]):
    context_count = bigram_10k.get(trigram[:2], 0)
    log_p += math.log((tri_model.get(trigram, 0) + 1) / (context_count + vocab_size))
  # NOTE(review): the exponent uses the token count including both tags; kept
  # so that previously reported scores stay comparable.
  return math.exp(-log_p / len(tokens))

In [None]:
preplexity_trigram('it appears that the overall code stroke volume has decreased since the covid-pandemic.')

33542.41353904068

In [None]:
preplexity_trigram('half a century ago hypertension was not treatable.')

24817.3210703467

In [None]:
preplexity_trigram('sarahs tv is broadcasting an advert for private healthcare.')

71298.54649964192