In [None]:
import os
 
# Print the number of logical CPUs reported by the operating system.
print(os.cpu_count())

In [None]:
import numpy as np 
import pandas as pd 
import dask
import os
import json
from spacy.tokens import Doc
import re
import spacy
from collections import Counter
import matplotlib.pyplot as plt
from nltk import sent_tokenize
from spacy.language import Language
import dask.dataframe as dd
import dask.bag as db
from dask.distributed import Client, LocalCluster
import scispacy
from scispacy.abbreviation import AbbreviationDetector
from os import path
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from spacy.symbols import ORTH, NORM

In [None]:
# Print the installed spaCy version.
print(spacy.__version__)

In [None]:
%%time
metadata_path = '../input/CORD-19-research-challenge/metadata.csv'
# Every metadata column is read as a string so that Dask does not have to infer
# (and possibly disagree on) dtypes across partitions.
metadata_columns = [
    'cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'license',
    'authors', 'journal', 'mag_id', 'abstract', 'url', 's2_id', 'pubmed_id',
    'arxiv_id', 'who_covidence_id', 'publish_time', 'pdf_json_files',
    'pmc_json_files',
]
df = dd.read_csv(metadata_path, dtype=dict.fromkeys(metadata_columns, 'str'))
df.head()

In [None]:
# Keep only rows that have a publish date and list both a PDF parse and a PMC parse.
df = df.dropna(subset=['publish_time', 'pdf_json_files', 'pmc_json_files'])

In [None]:
%%time
import datetime
# Keep only the records published strictly after the cutoff date below.
date = pd.Timestamp('2021-11-20')
df = df.assign(publish_time=df["publish_time"].astype('datetime64[ns]'))
is_recent = df["publish_time"] > date
filter_df = df[is_recent]

In [None]:
# file_path = pd.DataFrame({'pdf_json_files': ['document_parses/pdf_json/0000028b5cc154f68b8a269f6578f21e31f62977.json'],'pmc_json_files': ['document_parses/pmc_json/PMC1054884.xml.json'] })
# filter_df2 = dd.from_pandas(file_path, npartitions =2)

In [None]:
# Stack the PDF-parse and PMC-parse path columns into a single Series of relative
# file paths. dd.concat is used instead of Series.append, which has been deprecated
# and removed in newer pandas/Dask releases.
dd_concat = dd.concat([filter_df.pdf_json_files, filter_df.pmc_json_files])

In [None]:
# Start a local Dask cluster that uses threads rather than worker processes
# (processes=False); ':0' lets the dashboard bind to any free port.
cluster = LocalCluster(dashboard_address=':0', processes=False)
client = Client(cluster)
print(cluster.scheduler)

In [None]:
%%time
def load_json(row_path, root_path='../input/CORD-19-research-challenge/'):
    """Load one parsed-paper JSON file of the dataset.

    Args:
        row_path: Path of the JSON file relative to ``root_path``.
        root_path: Directory the relative path is resolved against. Defaults to
            the dataset location used by this notebook.

    Returns:
        The decoded JSON content, or ``None`` when the file does not exist.
    """
    file_path = path.join(root_path, row_path)
    if not path.exists(file_path):
        return None
    # The context manager closes the handle; a bare open() leaked one file
    # descriptor per loaded document.
    with open(file_path) as f:
        return json.load(f)
# Lazily apply load_json to every relative path.
# NOTE(review): meta declares dtype 'str', but load_json returns the decoded JSON
# content (or None) — confirm whether 'object' would describe the result better.
json_dask = dd_concat.apply(load_json,meta=('json_files', 'str'))

In [None]:
# Discard the None entries that load_json returns for paths that do not exist.
json_dask = json_dask.dropna()

In [None]:
%%time
def get_text(data_json, tokenizer=None):
    """Split the body of one parsed paper into sentences.

    Args:
        data_json: Parsed paper whose ``"body_text"`` entry is a list of
            paragraph dicts, each holding its content under ``"text"``.
        tokenizer: Optional callable mapping a string to a list of sentences.
            Defaults to NLTK's ``sent_tokenize``.

    Returns:
        The list of sentences the tokenizer produces for the whole body; a
        missing or empty ``"body_text"`` is tokenized as an empty string.
    """
    if tokenizer is None:
        tokenizer = sent_tokenize
    paragraphs = data_json.get("body_text") or []
    # Join with a space: gluing paragraphs together directly fuses the last
    # sentence of one paragraph with the first sentence of the next
    # ("...end.Next ..."), which hides the sentence boundary from the tokenizer.
    full_text = " ".join(paragraph['text'] for paragraph in paragraphs)
    return tokenizer(full_text)
# Lazily turn every loaded document into its list of sentences.
# NOTE(review): meta declares dtype 'str', but get_text returns the tokenizer's
# list of sentences — confirm whether 'object' would describe the result better.
sentences = json_dask.apply(get_text, meta=('sentences', 'str'))

In [None]:
%%time
# Build a Dask bag from the per-document sentence lists and flatten it so that
# each element of the bag is a single sentence.
f_sentences = db.from_sequence(sentences).flatten()

In [None]:
%%time
# One-column Dask DataFrame of sentences; the default column label 0 is renamed to "sent".
raw_sentences = f_sentences.to_dataframe().rename(columns={0: "sent"})

In [None]:
@Language.component('normalize')
def normalize(doc):
    """Pipeline component that rebuilds a document from its informative tokens only.

    A token is dropped when its text starts with ``=``, ``|`` or ``+``, or when it
    is non-ASCII, a bracket, punctuation, whitespace, a stop word, number-like,
    URL-like, e-mail-like, a currency symbol, a single character long, or has one
    of the excluded ``pos_`` values. Surviving tokens are lower-cased; a token
    whose text equals a detected abbreviation is replaced by the lower-cased
    long form of that abbreviation.

    Args:
        doc: Document on which ``doc._.abbreviations`` has already been set
            (i.e. the abbreviation detector ran earlier in the pipeline).

    Returns:
        A new ``Doc`` (sharing ``doc``'s vocab) whose words are the normalized tokens.
    """
    # Key the lookup table by the abbreviation's *text*. Keying it by the Span
    # object itself meant that the ``token.text in ...`` test below could never
    # succeed, so abbreviations were never expanded.
    long_forms = {abrv.text: abrv._.long_form.text for abrv in doc._.abbreviations}
    # NOTE(review): 'VBZ' looks like a fine-grained tag name, whereas ``pos_`` holds
    # coarse-grained tags — this entry may never match; confirm the intent.
    excluded_pos = ['VBZ', 'ADP', 'PRON', 'AUX']
    norm_text = []
    for token in doc:
        keep = (
            not re.match('[=|+]', token.text)
            and token.is_ascii
            and not token.is_bracket
            and not token.is_punct
            and (token.norm_ not in stop_words)
            and not token.is_space
            and not token.like_num
            and not token.like_url
            and not token.like_email
            and not token.is_currency
            and (token.pos_ not in excluded_pos)
            and len(token.text) > 1
        )
        if not keep:
            continue
        if token.text in long_forms:
            norm_text.append(long_forms[token.text].lower())
        else:
            norm_text.append(token.norm_.lower())
    # Reuse the incoming document's vocab rather than reaching for the global ``nlp``.
    return Doc(vocab=doc.vocab, words=norm_text)
# Load the small scispaCy model with the parser and NER components disabled.
nlp = spacy.load("en_core_sci_sm", disable=['parser', 'ner'])
stop_words = nlp.Defaults.stop_words

# Extend the pipeline: abbreviation detection right after tok2vec, then the
# custom normalizer as the last component.
nlp.add_pipe("abbreviation_detector", after='tok2vec')
nlp.add_pipe('normalize')

# Tokenizer special cases: "et" is normalized to "and", "al." to "other".
special_case1 = [{'ORTH': 'et', 'NORM': 'and'}]
special_case2 = [{'ORTH': 'al.', 'NORM': 'other'}]
for surface_form, special_case in (("et", special_case1), ("al.", special_case2)):
    nlp.tokenizer.add_special_case(surface_form, special_case)
print(nlp.pipe_names)

In [None]:
%%time
# Map the spaCy pipeline over every sentence in the bag.
pipe_sent = f_sentences.map(nlp)

In [None]:
#create list word normalized
# Flat list holding the text of every token of every processed sentence.
wordcount = [token.text for processed in pipe_sent for token in processed]

In [None]:
#using Counter for count word
word_counter = Counter(wordcount)

lst = word_counter.most_common(30)
df = pd.DataFrame(lst, columns = ['Word', 'Count'])
df.plot.bar(x='Word',y='Count')

In [None]:
# Load the sentence-embedding model, then replace its tokenizer with the
# non-fast (use_fast=False) tokenizer loaded from the matching
# "sentence-transformers/<model_name>" repository.
model_name = "bert-base-nli-mean-tokens"
model = SentenceTransformer(model_name)
model.tokenizer = AutoTokenizer.from_pretrained(f"sentence-transformers/{model_name}", use_fast=False)

In [None]:
# Shut down the earlier threaded cluster first; otherwise its scheduler, workers
# and dashboard stay alive next to the new cluster for the rest of the session.
client.close()
cluster.close()

# Start a process-based local cluster (processes=True) for the embedding stage.
cluster_end = LocalCluster(dashboard_address=':0', processes=True)
client = Client(cluster_end)
print(client.scheduler)

In [None]:
%%time
def cal_vector(sents):
    """Encode ``sents`` with the global embedding model and return its vector.

    The input is wrapped in a one-element batch for ``model.encode``; the first
    (and only) row of the result is returned.
    """
    batch_embeddings = model.encode([sents])
    return batch_embeddings[0]


# Embedding of every sentence in the bag.
vectors = f_sentences.map(cal_vector)

In [None]:
def pytorch_cos_sim(vec, sents):
    """Return the cosine similarity of ``vec`` and ``sents`` as computed by
    ``util.pytorch_cos_sim``.

    Exists as a plain module-level function so it can be mapped over a Dask bag
    with ``sents`` supplied as a keyword argument.
    """
    similarity = util.pytorch_cos_sim(vec, sents)
    return similarity
def recommend_text(sentence: str, thres_min: float, thres_max: float) -> pd.Series:
    """Return the corpus sentences whose similarity to ``sentence`` lies in a band.

    The cosine similarity between ``sentence`` and every vector in the global
    ``vectors`` bag is min-max scaled to [0, 1]; sentences of ``raw_sentences``
    whose scaled similarity is strictly between ``thres_min`` and ``thres_max``
    are selected.

    Args:
        sentence: Query text to compare the corpus against.
        thres_min: Exclusive lower bound on the scaled similarity.
        thres_max: Exclusive upper bound on the scaled similarity.

    Returns:
        The computed (in-memory) ``sent`` column of the matching rows.
    """
    sentence_vec = model.encode([sentence])[0]
    # The similarity helper returns a tensor for each sentence; reduce it to a
    # plain float so the DataFrame column below is numeric rather than a column
    # of tensor objects (which breaks DataFrame construction and arithmetic).
    sims = vectors.map(pytorch_cos_sim, sents=sentence_vec).map(float)
    sims = sims.to_dataframe()
    sims = sims.rename(columns={0: "sim"})
    # Min-max scaling. If every similarity is identical the division yields NaN,
    # both comparisons below are False, and nothing is selected.
    stan_sims = (sims.sim - sims.sim.min()) / (sims.sim.max() - sims.sim.min())
    dd_sims = stan_sims.to_frame()
    in_band = (dd_sims['sim'] > thres_min) & (dd_sims['sim'] < thres_max)
    dd_sent_sims = raw_sentences[in_band]
    matches = dd_sent_sims['sent'].compute()
    return matches

In [None]:
%%time
# Example queries. Only sent2 is passed to recommend_text in this cell; sent1 and
# sent3 are defined but not used here.
sent1 = 'range of incubation period'
sent2 = 'transmission of virus in community'
sent3 = 'seasonal outbreaks'
# Select sentences whose scaled similarity to the query lies between 0.8 and 0.9.
sents = recommend_text(sent2, 0.8, 0.9)

In [None]:
# Display the recommended sentences as a plain list.
list(sents)

In [None]:
!pip install spacy==3.0.1

In [None]:
!pip install scispacy

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

In [None]:
!pip install sentence-transformers