In [None]:
!pip install spacy==3.0.1

In [None]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import os
import json
from spacy.tokens import Doc
import pickle 
import re
import spacy
from collections import Counter
import matplotlib.pyplot as plt
from nltk import sent_tokenize
from spacy.language import Language
import dask.dataframe as dd

In [None]:
print(spacy.__version__)

In [None]:
!pip install scispacy

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

In [None]:
# Load the parsed-paper JSON files under document_parses and keep each paper's
# paper_id, title and body_text.
path_pdf = '../input/CORD-19-research-challenge/document_parses/pdf_json'
path_pmc = '../input/CORD-19-research-challenge/document_parses/pmc_json'

# Running total of files loaded; it is shared across both directories by the
# loading code below, so limits there are cumulative.
count = 0
# Accumulates one [paper_id, title, full_text] record per loaded file.
docs = []
def get_data(docs, path_data, filename):
    """Parse one parsed-paper JSON file and append its record to ``docs``.

    Args:
        docs: List that receives the new record; it is mutated in place.
        path_data: Directory containing the JSON file.
        filename: Name of the JSON file inside ``path_data``.

    Returns:
        The same ``docs`` list, now ending with
        ``[paper_id, title, full_text]`` for this file.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If ``paper_id``, ``metadata.title``, ``body_text`` or a
            paragraph's ``text`` key is missing.
    """
    file_path = os.path.join(path_data, filename)
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(file_path, 'rb') as json_file:
        data_f_json = json.load(json_file)

    paper_id = data_f_json['paper_id']
    title = data_f_json["metadata"]["title"]

    # Paragraph texts are concatenated with no separator, exactly as before.
    # NOTE(review): this glues the last word of one paragraph to the first word
    # of the next; confirm whether a separator is wanted before sentence splitting.
    full_text = "".join(paragraph['text'] for paragraph in data_f_json["body_text"])

    docs.append([paper_id, title, full_text])
    return docs

# Read files from each directory in turn. `count` is cumulative, so the second
# limit (200) is a total across both directories rather than a per-directory quota.
# os.listdir() returns names in arbitrary order, so the files picked may vary.
for path_data, limit in ((path_pdf, 100), (path_pmc, 200)):
    for filename in tqdm(os.listdir(path_data)):
        docs = get_data(docs, path_data, filename)
        count += 1
        if count >= limit:
            break

In [None]:
# One row per loaded paper; the column order mirrors the
# [paper_id, title, full_text] records collected in `docs`.
column_names = ['paper_id', 'title', 'body_text']
my_data = pd.DataFrame(docs, columns=column_names)
my_data.head()

In [None]:
my_data.shape

In [None]:
my_data.columns

In [None]:
# Split every paper body into sentences and pool them into one flat list.
sents_text = []
for idx, body in enumerate(my_data["body_text"]):
    # `idx` is kept as the loop variable so the name stays bound at module
    # level after this cell, exactly as it was before.
    sents_text.extend(sent_tokenize(body))

In [None]:
def _is_content_token(token):
    """Return True when ``token`` should be kept by the ``normalize`` component."""
    # Tokens starting with '=', '|' or '+' are treated as formula/table residue.
    if re.match('[=|+]', token.text):
        return False
    if not token.is_ascii:
        return False
    if (token.is_bracket or token.is_punct or token.is_stop or token.is_space
            or token.like_num or token.like_url or token.like_email
            or token.is_currency):
        return False
    # 'ADP', 'PRON' and 'AUX' are coarse-grained POS labels (pos_), whereas
    # 'VBZ' is a fine-grained tag and therefore has to be compared with tag_.
    if token.pos_ in ('ADP', 'PRON', 'AUX') or token.tag_ == 'VBZ':
        return False
    return True


@Language.component('normalize')
def normalize(doc):
    """Build a new Doc that holds only the lower-cased content words of ``doc``.

    Tokens rejected by ``_is_content_token`` are dropped. A kept token that
    starts a detected abbreviation is replaced by the lower-cased tokens of
    that abbreviation's long form; every other kept token is lower-cased.

    Args:
        doc: Doc on which ``doc._.abbreviations`` is available, i.e. the
            abbreviation detector must already have processed it.

    Returns:
        A fresh ``Doc`` sharing ``doc.vocab`` whose words are the normalized
        tokens. Annotations of the input Doc are not carried over.

    Note:
        The POS/tag part of the filter only has an effect when ``doc`` has
        already been tagged by the time this component runs.
    """
    # Map the start-token index of each abbreviation occurrence to its long form.
    long_forms = {abrv.start: abrv._.long_form for abrv in doc._.abbreviations}

    norm_text = []
    for token in doc:
        if not _is_content_token(token):
            continue
        # Look up by the token's own index; the previous code read an
        # unrelated module-level `idx`, so the replacement never matched.
        long_form = long_forms.get(token.i)
        if long_form is not None:
            # The long form is a span of tokens; Doc(words=...) needs strings,
            # so add its tokens one by one in lower case.
            norm_text.extend(
                word.text.lower() for word in long_form if not word.is_space
            )
        else:
            norm_text.append(token.text.lower())
    # Reuse the vocab of the incoming doc rather than a global pipeline object.
    return Doc(vocab=doc.vocab, words=norm_text)
from scispacy.abbreviation import AbbreviationDetector
# Load the scispaCy small model with the 'parser' and 'ner' components disabled.
nlp = spacy.load("en_core_sci_sm", disable=['parser','ner'])
# Add the abbreviation pipe to the spacy pipeline.
nlp.add_pipe("abbreviation_detector", after= 'tok2vec')
# Add the sentencizer after the lemmatizer; `doc.sents` is read from the
# pipeline output in a later cell.
nlp.add_pipe('sentencizer', after="lemmatizer")
# NOTE(review): 'normalize' is inserted before the tagger, so token.pos_ is
# presumably still unset when it runs, and the components that follow receive
# the new Doc that normalize returns - confirm this ordering is intended.
nlp.add_pipe('normalize', before='tagger')
# Show the final component order.
print(nlp.pipe_names)

In [None]:
# Run every sentence through the pipeline, using 2 processes and batches of 100 texts.
pipe_docs = nlp.pipe(sents_text,n_process=2, batch_size=100)

In [None]:
# Materialize the pipeline output as a list; later cells iterate over it more than once.
pipe_docs = list(pipe_docs)

In [None]:
##Save file docs
# with open('./file_docs','wb') as f:
#     pickle.dump(list(pipe_docs), f)
#     f.close()

In [None]:
# #Load file docs
# with open('../input/file-docs/file_docs', mode='rb') as f:
#     while True:
#         try:
#             docs_f = pickle.load(f)
#         except EOFError:
#             break

In [None]:
# Gather the text of every token from every processed Doc into one flat list.
wordcount = [token.text for doc in pipe_docs for token in doc]

In [None]:
# Tally how often each normalized word occurs.
word_counter = Counter(wordcount)

# Chart the 30 most frequent words as a bar plot.
lst = word_counter.most_common(30)
df = pd.DataFrame(lst, columns=['Word', 'Count'])
df.plot(x='Word', y='Count', kind='bar')

In [None]:
# Print up to the first 1000 normalized words. Slicing (instead of indexing
# 0..999) avoids an IndexError when fewer than 1000 words were collected.
for word in wordcount[:1000]:
    print(word)

In [None]:
# Collect the text of every sentence found in the processed docs.
# (Gathering spaCy's per-sentence vectors, `sent.vector`, is left disabled here.)
doc_sents = np.array([sent.text for doc in pipe_docs for sent in doc.sents])
print(doc_sents.shape)

In [None]:
doc_sents[0]

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Load the 'bert-base-nli-mean-tokens' sentence-transformers model; later cells
# call its encode() method to turn sentences into vectors.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Embed the first 50 corpus sentences, then centre the embeddings on their mean vector.
vectors = model.encode(doc_sents[:50])
vec_mean = np.mean(vectors, axis=0)
centered = np.subtract(vectors, vec_mean)

In [None]:
vec_mean.shape

In [None]:
centered.shape

In [None]:
def cosine_similarity(sentence_vec, centered):
    """Cosine similarity between a query vector and every row of ``centered``.

    Args:
        sentence_vec: Query vector, either 1-D with shape ``(d,)`` or a column
            with shape ``(d, 1)``.
        centered: 2-D array of shape ``(n, d)``, one vector per row.

    Returns:
        Array of similarities with shape ``(n,)`` for a 1-D query and
        ``(n, 1)`` for a column query. Entries whose denominator is zero
        (an all-zero row or an all-zero query) are returned as 0 rather
        than NaN/inf.
    """
    centered = np.asarray(centered)
    sentence_vec = np.asarray(sentence_vec)

    dots = np.dot(centered, sentence_vec)
    norm_arr = np.linalg.norm(centered, axis=1) * np.linalg.norm(sentence_vec)
    if dots.ndim == 2:
        # Column query: keep the row norms as a column so the division stays
        # row-wise. For a 1-D query both operands are already shape (n,),
        # which avoids the accidental (n, n) broadcast.
        norm_arr = norm_arr[:, np.newaxis]

    # Divide by 1 where the denominator is zero, then overwrite those entries with 0.
    zero_norm = norm_arr == 0
    sims = dots / np.where(zero_norm, 1, norm_arr)
    return np.where(zero_norm, 0.0, sims)

def recommend_text(sentence: str, thres_min: float, thres_max: float, nlp) -> list:
    """Return corpus sentences whose similarity to ``sentence`` is within a band.

    The query is run through ``nlp``, its first sentence is embedded with the
    module-level ``model``, centred with ``vec_mean`` and compared against the
    rows of ``centered``. Cosine similarities are rescaled from [-1, 1] to
    [0, 1] before thresholding.

    Args:
        sentence: Free-text query.
        thres_min: Exclusive lower bound on the rescaled similarity.
        thres_max: Exclusive upper bound on the rescaled similarity.
        nlp: Pipeline used to process the query; its output must expose ``.sents``.

    Returns:
        Matching entries of ``sents_text``, most similar first. Empty when the
        processed query contains no sentence.
    """
    # encode() expects plain strings, so pass each sentence's text rather than
    # the span object itself.
    text = [sent.text for sent in nlp(sentence).sents]
    if not text:
        # The query produced no sentence after processing; nothing to embed.
        return []

    # Only the first sentence of the query is used.
    sentence_vec = model.encode(text)[0] - vec_mean
    sims = cosine_similarity(np.expand_dims(sentence_vec, axis=1), centered)
    # Rescale cosine similarity from [-1, 1] to [0, 1].
    sims = (sims.flatten() + 1) / 2

    # Visit candidates from most to least similar.
    sim_index = np.argsort(-sims, axis=0)
    # NOTE(review): indices refer to rows of `centered`; looking them up in
    # `sents_text` assumes both are aligned one-to-one - confirm.
    return [
        sents_text[idx]
        for idx in sim_index
        if thres_min < sims[idx] < thres_max
    ]

In [None]:
# Example queries to try against the recommender.
sent1 = 'range of incubation period'
sent2 = 'transmission of virus in community'
sent3 = 'seasonal outbreaks'
sent4 = 'ethnicity considered point'

# Keep only hits whose rescaled similarity lies strictly between 0.8 and 1.0.
recommend_text(sent4, thres_min=0.8, thres_max=1.0, nlp=nlp)
