In [None]:
import os
import re
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import string
import spacy
import math
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from tqdm import tqdm

In [None]:
# Index every JSON paper found under the dataset directory by its file name and
# load the metadata table that describes the research papers.

debug = False
stat = {}
# Maps "<file name>.json" -> full path; a later duplicate file name overwrites an earlier one.
articles = {
    file_name: os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
    if file_name.endswith(".json")
}
df = pd.read_csv('/kaggle/input/CORD-19-research-challenge/metadata.csv')

# Display the metadata frame as a first sanity check
df

In [None]:
# Only get papers related to coronavirus

VIRUS_REF = ['covid-19', 'covid', 'coronavirus', 'cov-2', 'sars-cov-2', 'sars-cov', 'hcov', '2019-ncov']

# Compiled once from VIRUS_REF. Every term is escaped so it is matched literally
# even if a future entry contains regex metacharacters.
_VIRUS_PATTERN = re.compile('|'.join(re.escape(term) for term in VIRUS_REF), flags=re.IGNORECASE)


def virus_match(text):
    """Tell whether `text` mentions any of the coronavirus terms in VIRUS_REF.

    The match is case-insensitive and stops at the first hit instead of
    collecting every occurrence in the text.

    Args:
        text: the text to inspect.

    Returns:
        True if at least one term occurs in `text`; False otherwise, including
        when `text` is empty or is not a string (e.g. None or NaN).
    """
    if not isinstance(text, str):
        return False
    return _VIRUS_PATTERN.search(text) is not None

def list_to_string(data, attribute):
    """Join the 'text' field of every entry in data[attribute] with single spaces."""
    return ' '.join(section['text'] for section in data[attribute])


In [None]:
# Parse every paper's JSON file and keep the title, abstract and body text of
# the papers that mention the coronavirus.
literature = []
skipped_papers = 0  # papers whose file is not indexed or whose JSON lacks an expected key
for sha in tqdm(df['sha'], total=df.shape[0]):
    # Rows without a sha have no full-text JSON file to read.
    if pd.isna(sha):
        continue
    try:
        # Open with an explicit encoding so parsing does not depend on the
        # platform default (JSON files are expected to be UTF-8).
        with open(articles[f'{sha}.json'], encoding='utf-8') as f:
            data = json.load(f)
        paper = {'paper_id': data['paper_id'], 'title': data['metadata']['title']}
        body_text = list_to_string(data, 'body_text')
        abstract = list_to_string(data, 'abstract')
    except KeyError:
        # Best effort: skip papers that cannot be read, but keep count of them.
        skipped_papers += 1
        continue
    # `or` short-circuits: the body text is only scanned when the abstract has no match.
    if virus_match(abstract) or virus_match(body_text):
        paper['body_text'] = body_text
        paper['abstract'] = abstract
        literature.append(paper)

if skipped_papers:
    print(f'Skipped {skipped_papers} papers with a missing file or missing JSON keys')

In [None]:
# Collect the selected papers in a DataFrame, report how many there are and preview the first rows
literature_df = pd.DataFrame(literature)
print(literature_df.shape[0])
literature_df.head()

## Group and query papers by similarity

### Comparing article similarity

In [None]:
# CountVectorizer converts a collection of documents into a matrix of word frequencies.
# The max_df parameter sets the maximum threshold: words whose frequency exceeds this value are ignored.
# stop_words removes stop words.
# ngram_range sets the range of n-gram sizes; in our case we want pairs (word1, word2).

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

# English Snowball stemmer, plus the tokenizer of a CountVectorizer built with default settings
stemmer = SnowballStemmer("english")
analyzer = CountVectorizer().build_analyzer()

def stemming(doc):
    """Lower-case `doc`, tokenize it with `analyzer` and return the stemmed tokens joined by spaces."""
    tokens = analyzer(doc.lower())
    return " ".join(stemmer.stem(token) for token in tokens)

# Add a 'stemmized' column holding the stemmed body text of every paper (%time reports how long it takes)
%time literature_df['stemmized'] = literature_df['body_text'].apply(lambda doc: stemming(doc))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from scipy.spatial import distance
from matplotlib import pyplot as plt
from wordcloud import WordCloud
import ipywidgets as widgets

from IPython.display import Image
from IPython.display import display, HTML

In [None]:
# Learn the vocabulary and build the document-term count matrix from the papers' body text.
# NOTE(review): this fits on `body_text`, not on the `stemmized` column computed earlier — confirm that is intended.
cv = CountVectorizer(max_df=0.95, min_df=0.01, stop_words='english')
%time word_count = cv.fit_transform(literature_df.body_text)
# Fit the TF-IDF weighting on the counts; the transformed matrix returned by this call is not stored.
tfidf_tr = TfidfTransformer(smooth_idf=True, use_idf=True)
%time tfidf_tr.fit_transform(word_count)

In [None]:

def get_word_vector(document):
    """Return the TF-IDF vector of a single document.

    The text is counted with the fitted `cv` vectorizer and then weighted with
    the fitted `tfidf_tr` transformer; the document is passed as a one-element
    list, so the result has one row.
    """
    counts = cv.transform([document])
    return tfidf_tr.transform(counts)

# Store each paper's TF-IDF vector in a 'word_vector' column (get_word_vector is called once per paper)
%time literature_df['word_vector'] = literature_df.body_text.apply(get_word_vector)

In [None]:

def show_word_cloud(word_vector):
    """Draw a word cloud of the terms in `word_vector` on the current matplotlib axes.

    The term weights come from get_words_with_value; at most 20 words are
    drawn, all horizontally, and the axes decorations are hidden.
    """
    frequencies = dict(get_words_with_value(word_vector))
    word_cloud = WordCloud(
        width=500,
        height=500,
        max_words=20,
        background_color='white',
        colormap='tab10',
        prefer_horizontal=1.0,
    )
    word_cloud.generate_from_frequencies(frequencies)
    axes = plt.gca()
    axes.imshow(word_cloud)
    axes.axis('off')

# Vocabulary terms, indexed by their column position in the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

def get_words_with_value(word_vector):
    """Return the (term, weight) pairs stored in `word_vector`, highest weight first.

    Args:
        word_vector: sparse vector exposing `.indices` (column positions of
            the stored entries) and `.data` (their values).

    Returns:
        A list of (term, weight) tuples sorted by decreasing weight.
    """
    stored_entries = zip(word_vector.indices, word_vector.data)
    return sorted(
        ((feature_names[ind], val) for ind, val in stored_entries),
        key=lambda pair: pair[1],
        reverse=True,
    )

In [None]:

def calculate_distance_between_words_vectors(query_indices, search_vec, document_vector):
    """Return the Euclidean distance between a query and a document over the query's terms.

    Args:
        query_indices: column indices of the terms to compare.
        search_vec: the query's weights for those columns (1-D, or 2-D with a
            single row).
        document_vector: sparse matrix for the document; its first row is
            sliced at `query_indices` and densified with `.toarray()`.

    Returns:
        The Euclidean distance as a float.
    """
    # Flatten both operands: the SciPy distance functions are defined for 1-D
    # vectors, and recent SciPy releases raise ValueError on (1, n) arrays.
    document_vec = document_vector[0, query_indices].toarray().ravel()
    query_vec = np.asarray(search_vec).ravel()
    return distance.euclidean(query_vec, document_vec)

def calculate_related_documents(query, max_documents_comparison):
    """Return the papers whose TF-IDF vectors are closest to the query.

    Only the terms that occur in the query are compared: each paper's vector
    is restricted to those columns before the Euclidean distance is measured.

    Args:
        query: free-text question.
        max_documents_comparison: maximum number of papers to return.

    Returns:
        A DataFrame with the "paper_id" and "word_vector" columns of the
        closest papers, nearest first, with missing values replaced by "".
    """
    query_vector = get_word_vector(query)
    query_indices = query_vector.indices
    query_vector_array = query_vector[0, query_indices].toarray()

    # Only the 'word_vector' column is needed, so apply over that Series
    # instead of materialising every full row with DataFrame.apply(axis=1).
    distances = literature_df['word_vector'].apply(
        lambda doc_vector: calculate_distance_between_words_vectors(
            query_indices, query_vector_array, doc_vector
        )
    )

    # sort_values() keeps the index *labels*, so select with .loc; .iloc would
    # read them as positions and pick the wrong rows on a non-default index.
    closest_labels = distances.sort_values().head(max_documents_comparison).index
    result_columns = ["paper_id", "word_vector"]
    return literature_df.loc[closest_labels, result_columns].fillna("")

In [None]:

def display_word_frecuencies_distances(df_result):
    """Show the result table, then one word cloud per paper in a grid three columns wide.

    Each subplot is titled with the first and last five characters of the
    paper id.
    """
    shown_columns = ["paper_id", "word_vector"]
    display(df_result[shown_columns].reset_index(drop=True))

    n_papers = len(df_result)
    grid_rows = math.ceil(n_papers / 3)
    plt.rcParams["figure.figsize"] = (20, 15)
    for position in range(1, n_papers + 1):
        paper = df_result.iloc[position - 1]
        plt.subplot(grid_rows, 3, position)
        show_word_cloud(paper.word_vector)
        short_id = f'{paper.paper_id[:5]}...{paper.paper_id[-5:]}'
        plt.title(f'Paper {short_id}', fontsize=10)
    plt.show()

def compare_distances_and_show(questionary):
    """For every question, show it as a heading followed by its six closest papers.

    Args:
        questionary: iterable of free-text questions.
    """
    from html import escape  # local import keeps this cell self-contained

    for question in questionary:
        # The heading needs a closing </h3>, otherwise the output that follows
        # is rendered inside the heading. The question is escaped so that any
        # '<', '>' or '&' it contains is shown literally.
        display(HTML(f"<h3>{escape(question, quote=False)}</h3>"))
        topic_result = calculate_related_documents(question, 6)
        display_word_frecuencies_distances(topic_result)

In [None]:
# Task 1 questions; each one is used as a query to retrieve and display its six closest papers.
task1 = ["Range of incubation periods for the disease in humans (and how this varies across age and health status) and how long individuals are contagious, even after recovery?", 
               "Prevalence of asymptomatic shedding and transmission (e.g., particularly children)", 
               "Physical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic phobic surfaces, environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding).", 
               "Persistence of virus on surfaces of different materials (e,g., copper, stainless steel, plastic)."]

compare_distances_and_show(task1)

In [None]:
# Task 2 questions; each one is used as a query to retrieve and display its six closest papers.
task2 = ["Real-time tracking of whole genomes and a mechanism for coordinating the rapid dissemination of that information to inform the development of diagnostics and therapeutics and to track variations of the virus over time.",
"Access to geographic and temporal diverse sample sets to understand geographic distribution and genomic differences, and determine whether there is more than one strain in circulation. Multi-lateral agreements such as the Nagoya Protocol could be leveraged.",
    "livestock could be infected (e.g., field surveillance, genetic sequencing, receptor binding) and serve as a reservoir after the epidemic appears to be over.",
" whether farmers are infected, and whether farmers could have played a role in the origin." ]

compare_distances_and_show(task2)

In [None]:
from sklearn.cluster import MiniBatchKMeans
# Mini-batch k-means with 10 clusters and a fixed random_state, fitted below on TF-IDF features of the body text.
kmeans = MiniBatchKMeans(n_clusters=10,
                          random_state=0,
                          batch_size=100,
                          max_iter=50)

# NOTE(review): presumably a float min_df is a document-proportion threshold, so 0.4 is far
# stricter than the 0.01 used for `cv` above — confirm this value is intended.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.4)
%time transform = tfidf_vectorizer.fit_transform(literature_df['body_text'])
%time kmeans.fit(transform)

In [None]:

# Print the highest-weighted terms of every cluster centroid.
N_TOP_TERMS = 30  # number of terms listed per cluster

print("Centroid of clusters: \n")
# argsort() is ascending, so reverse each row to rank the terms from the
# highest to the lowest centroid weight.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out()
else:
    terms = tfidf_vectorizer.get_feature_names()
# Iterate over the clusters actually fitted rather than a hard-coded count.
for i in range(order_centroids.shape[0]):
    top_terms = [terms[ind] for ind in order_centroids[i, :N_TOP_TERMS]]
    print("Cluster {}: {} \n".format(i, ' '.join(top_terms)))

In [None]:
# Task 2 questions, repeated here with the same text as the `task2` list defined in an earlier cell,
# so that this cell can be run on its own.
task2 = ["Real-time tracking of whole genomes and a mechanism for coordinating the rapid dissemination of that information to inform the development of diagnostics and therapeutics and to track variations of the virus over time.",
"Access to geographic and temporal diverse sample sets to understand geographic distribution and genomic differences, and determine whether there is more than one strain in circulation. Multi-lateral agreements such as the Nagoya Protocol could be leveraged.",
    "livestock could be infected (e.g., field surveillance, genetic sequencing, receptor binding) and serve as a reservoir after the epidemic appears to be over.",
" whether farmers are infected, and whether farmers could have played a role in the origin." ]

# Assign every task-2 question to a k-means cluster, then print each question with its prediction
predictions = []
for task in task2:
    task_features = tfidf_vectorizer.transform([task])
    predictions.append(kmeans.predict(task_features))

for question, prediction in zip(task2, predictions):
    print(f'Query: {question}')
    print(f'Prediction: {prediction}')

