## NLP Project



In [1]:
import json
import gensim

from pathlib import Path
from scipy.spatial.distance import cosine
from nltk.tokenize import sent_tokenize, word_tokenize 
from gensim.models import Word2Vec

In [2]:
def create_paper_dict(paper):
    """
    Reads in a research paper and returns a dictionary containing the paper ID, abstract, and main text.
    Input: research paper --> JSON file
    Output: {paper_id: , abstract: , body_text: } --> dictionary
    """
    paper_dict = {}
    abstract = ''
    text = ''
    
    try:  # many papers don't have abstracts
        for i in paper['abstract']:
            abstract += i['text']
    except:
        pass
    for i in paper['body_text']:
        text += i['text']
    
    paper_dict['paper_id'] = paper['paper_id']
    paper_dict['abstract'] = abstract
    paper_dict['body_text'] = text
    
    return paper_dict


# data_path = 'C://Users//Binyamin//PythonProjects//NLP//final_project//data//'
data_path = 'data'
lit = []

# Searches recursively through Repo for .json files and creates a list of dictionary from them.
pathlist = Path(data_path).glob('**/*.json')
for path in pathlist:
    path_in_str = str(path)  # because path is object not string
    with open(path_in_str) as f:
        data = json.load(f)
    paper_dict = create_paper_dict(data)
    lit.append(paper_dict)

In [3]:
len(lit)

788

#### Literature - Text Sample

In [4]:
lit[0]['body_text'][: 963]

'It is highly contagious, and severe cases can lead to acute respiratory distress or multiple organ failure [3] . On 11 March 2020, the WHO has made the assessment that COVID-19 can be characterised as a pandemic. As of , in total, 1,391,890 cases of COVID-19 have been recorded, and the death toll has reached 81,478 with a rapid increase of cases in Europe and NorthAmerica.8th April 2020The disease can be confirmed by using the reverse-transcription polymerase chain reaction (RT-PCR) test [4] . While being the gold standard for diagnosis, confirming COVID-19 patients using RT-PCR is time-consuming, and both high false-negative rates and low sensitivities may put hurdles for the presumptive patients to be identified and treated early [3] [5] [6] .As a non-invasive imaging technique, computed tomography (CT) can detect those characteristics, e.g., bilateral patchy shadows or ground glass opacity (GGO), manifested in the COVID-19 infected lung [7] [8] .'

<br>

### Collating the body text of all the papers

In [5]:
papers = []
for paper in lit:
    papers.append(paper['body_text'])

In [6]:
def get_tokens(f):
    tokenized_data = []

    for text in sent_tokenize(f):
        sentence = [] 
        for word in word_tokenize(text): 
            sentence.append(word.lower()) 
        tokenized_data.append(sentence)
    return tokenized_data

In [7]:
papers_joined = ' '.join(papers)

In [8]:
papers_tokenized = get_tokens(papers_joined)

#### Find the phrases in the Embeddings

In [9]:
def get_phrases(model):
    keys = list(model.wv.vocab.keys())
    phrases = []
    for k in keys:
        if '_' in k:
            print(k)
            phrases.append(k)
    return phrases

### Creating single word embeddings

In [10]:
# Creating the Word2Vec models
model1 = gensim.models.Word2Vec(papers_tokenized, min_count = 1, size = 100, window = 5) 

model2 = gensim.models.Word2Vec(papers_tokenized, min_count = 1, size = 100, window = 5, sg = 1)

In [11]:
# Checking the vectors
print(model1.wv['covid-19'])

[ 8.26923609e-01  2.61659360e+00 -8.27334225e-02  5.38229465e-01
 -9.94117439e-01 -3.87442493e+00 -6.96345270e-01  2.77918267e+00
 -1.40695953e+00  3.95146817e-01  2.23959851e+00 -1.71446490e+00
  1.55104208e+00  3.01887274e-01 -3.03804398e+00 -6.25375211e-01
  1.09006798e+00 -2.22922826e+00  1.49104849e-01  1.68574703e+00
 -1.19529569e+00 -3.25266451e-01 -3.45773625e+00  8.88256192e-01
 -9.33322251e-01  4.60733771e-01  1.68860114e+00  1.47994947e+00
 -2.06382537e+00 -4.34833765e+00  1.65882063e+00 -1.31434572e+00
  3.11345315e+00  1.56149471e+00 -4.20445710e-01  1.32742751e+00
  2.06985235e+00  7.85256743e-01 -1.21892646e-01 -1.18910718e+00
  2.47400117e+00 -1.39158440e+00 -4.51393515e-01  8.70708764e-01
 -2.52037072e+00  1.40527308e-01 -7.54242182e-01 -2.34408522e+00
  6.07615530e-01 -2.46063724e-01  8.19418073e-01 -2.97807097e-01
  3.71279192e+00  8.76003385e-01 -1.49308920e+00  3.20777178e-01
 -2.76745468e-01  2.13290548e+00 -1.36524343e+00 -4.67133790e-01
  1.96110234e-01  1.61062

In [12]:
model2.wv['covid-19']

array([-0.84637946,  0.28372577,  0.04496578,  0.19600813, -0.2470242 ,
       -0.40887585,  0.14495395,  0.4954242 , -0.543544  ,  0.12964353,
        0.60227126, -0.38009283,  0.5444126 ,  0.0414533 ,  0.15818694,
        0.14558883,  0.67090315, -0.6638891 ,  0.05648408,  0.706631  ,
        0.03201538,  0.22368635, -0.16347015, -0.12725721, -0.20802411,
       -0.10820621, -0.09826882,  0.07805505,  0.23418501, -0.6930578 ,
        0.51263225, -0.46163473,  0.48860902,  0.22044551,  0.20175064,
        0.6469956 ,  0.41784322,  0.6856917 ,  0.41666463,  0.11608683,
        0.26542455, -0.37355825, -0.09831123,  0.9270578 ,  0.1818363 ,
       -0.27748898, -0.19454962, -0.25502518, -0.14908303, -0.69140774,
        0.30506262,  0.04884483, -0.10744089,  0.21876861,  0.2604155 ,
        0.02467573, -0.08131506,  0.08704039,  0.04549044, -0.02001595,
        0.08753835, -0.12233245,  0.35407674,  0.27864763, -0.3432078 ,
        0.10098751,  0.47464132, -0.20520347, -0.31862095, -0.53

In [13]:
model1.similarity('covid-19', 'contagious')

  """Entry point for launching an IPython kernel.


0.24166737

In [14]:
model2.similarity('covid-19', 'contagious')

  """Entry point for launching an IPython kernel.


0.51311374

<br>

### Creating embeddings for phrases

#### Method 1

In [15]:
from gensim.models import Phrases

In [17]:
bigram_transformer = Phrases(papers_tokenized)

In [18]:
phrase_model_1 = Word2Vec(bigram_transformer[papers_tokenized], min_count=1)

In [None]:
phrase_model_2 = Word2Vec(bigram_transformer[papers_tokenized], min_count = 1, size = 100, window = 5)

In [None]:
phrase_model_3 = Word2Vec(bigram_transformer[papers_tokenized], min_count = 1, size = 100, window = 5, sg = 1)

In [None]:
m = [phrase_model_1, phrase_model_2, phrase_model_3]

for m in models:
    print(m.wv['covid-19'])
    print(m.wv['highly_contagious'])
    print(m.wv.similarity('covid-19', 'highly_contagious'))

<br>

### Method 2

In [92]:
import re
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.models.phrases import Phrases, Phraser

In [93]:
def clean(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
    return re.sub(r'\s{2,}', ' ', sentence)

def tokenize(sentence):
    return [token for token in sentence.split() if token not in STOP_WORDS]

In [94]:
def build_phrases(sentences):
    phrases = Phrases(sentences,
                      min_count=5,
                      threshold=7,
                      progress_per=1000)
    return Phraser(phrases)

In [97]:
phrases_model = build_phrases(papers)

In [None]:
def sentence_to_bi_grams(phrases_model, sentence):
    return ' '.join(phrases_model[sentence])

In [95]:
def sentences_to_bi_grams(n_grams, document):
    output = []
    for sentence in document:
        clean_text = clean(sentence)
        tokenized_text = tokenize(clean_text)
        parsed_text = sentence_to_bi_grams(n_grams, tokenized_text)
        output.append(parsed_text)
    return output

In [96]:
output = sentences_to_bi_grams(phrases_model, papers)

In [101]:
output_text = ' '.join(output)

In [116]:
output_text[: 100]

'highly contagious severe cases lead acute respiratory distress multiple organ failure 3 11 march 202'

In [119]:
tokens = get_tokens(output_text)

In [None]:
tokens[: 100]

In [121]:
model = Word2Vec(tokens, min_count=1)

In [None]:
model.wv.vocab

<br>

### Saving the model

In [145]:
# phrases_model.save('phrases_model.txt')
# phrases_model= Phraser.load('phrases_model.txt')

In [63]:
# This code creates a single text file from all the abstracts and main texts
# Takes the list of dictionaries as input
a = open(data_path + 'single_text_file.txt', 'a', encoding='UTF-8')
all_text = ''
t_time = 0
for num, i in enumerate(lit):
    start = time.perf_counter()
    all_text = all_text+i['abstract']
    all_text = all_text+i['body_text']
    end = time.perf_counter()
    t_time = t_time + (end-start)
    if num%500 == 0:
        a.write(all_text)
        all_text = ''
        print ("done ", num, " papers in ", t_time, " seconds")
        t_time = 0
        
a.close()

done  0  papers in  0.0067048000055365264  seconds
done  500  papers in  10.631782699696487  seconds
done  1000  papers in  7.8358053001138614  seconds
done  1500  papers in  7.038967199987383  seconds
done  2000  papers in  6.566383399927872  seconds
done  2500  papers in  6.56111050004256  seconds
done  3000  papers in  6.073433299810858  seconds
done  3500  papers in  6.77269339976192  seconds
done  4000  papers in  9.315488700071  seconds
done  4500  papers in  9.450686000156566  seconds
done  5000  papers in  10.072518799919635  seconds
done  5500  papers in  9.106466499943053  seconds
done  6000  papers in  8.344403199997032  seconds
done  6500  papers in  8.469056899935822  seconds
done  7000  papers in  8.367996099987067  seconds
done  7500  papers in  8.089393200047198  seconds
done  8000  papers in  8.690419099802966  seconds
done  8500  papers in  8.256924200148205  seconds
done  9000  papers in  8.280585600063205  seconds
done  9500  papers in  8.39931279992743  seconds
don

In [None]:
def diagnose(symptoms, diseases):
    """
    Takes in list of symptoms and list of diseases (maybe make global) and produces avg similarities 
    between to each disease.
    
    Param: symptoms --> list
    Param: diseases --> list
    Output: sims --> dict{similarity: disease}
    """
    sims = {}
    for i in diseases:
        cos_list = []
        for j in symptoms:
            cos_list.append(cosine(we_dict[i], we_dict[j]))
        avg_cos = sum(cos_list)/len(cos_list)
        sims[avg_cos] = i
        
    return sims
    
sims = diagnose(symptoms, diseases)
top_diagnosis = sims[min(sims.keys())]
top_5 = [sims[x] for x in sorted(sims.keys())[:5]]