In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

import pickle
from gensim import corpora, models
import re
from gensim.parsing.preprocessing import STOPWORDS, strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short, stem_text
from nltk.corpus import stopwords
import pickle
import en_core_web_sm
import csv
import json

from sklearn.preprocessing import Binarizer
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy.spatial.distance import cosine, cdist

## Initializing the functions

**To get similar docs to the target:**

In [None]:
# Making a function for getting the top similar to add to the target table

def get_similar_docs(target_df, meta_df, d2v_model, d2v_target):
    """
    Look up, for every target article, its nearest neighbours in the metadata corpus.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Target table with the columns 'title', 'abstract' and 'pdf_json_files'.
        Rows are read by position, so the dataframe may carry any index.
    meta_df : pandas.DataFrame
        Metadata table with the same three columns. Each tag returned by the
        model is used as a position (``meta_df.index[tag]``) and as a label
        (``meta_df.title[tag]``).
        NOTE(review): this presumably requires meta_df to have a default
        0..n-1 index that matches the tags the model was trained with -- confirm.
    d2v_model : gensim Doc2Vec model
        Model whose document vectors are queried with ``most_similar``.
    d2v_target : sequence
        One inferred vector per row of ``target_df``, in the same order.

    Returns
    -------
    (new_docs_target_df, not_target) : tuple of pandas.DataFrame
        Both have the columns ('original_index', 'original_db',
        'similarity_percentage', 'title', 'abstract', 'pdf_json_files').
        * new_docs_target_df: target articles plus their similar docs, with
          duplicated titles removed (first occurrence kept).
        * not_target: the rows of new_docs_target_df whose 'original_db' is
          not 'target', i.e. the newly found docs.

    For 'target' rows 'original_index' is the row position inside target_df;
    for all other rows it is ``meta_df.index[tag]``.

    For each target article the most similar doc is always recorded. Of the
    second and third most similar docs, at most one is recorded: the second if
    its title is not in the target table, otherwise the third if its title is
    not in the target table.

    The sizes of the intermediate and final tables are printed.
    """
    # gensim 4 renamed `docvecs` to `dv`; use the new attribute when the model has it.
    doc_vectors = d2v_model.dv if hasattr(d2v_model, 'dv') else d2v_model.docvecs

    # Built once: the original rebuilt list(target_df.title) for every membership test.
    target_titles = set(target_df.title)
    fallbacks = ((1, 'second most similar'), (2, 'third most similar'))

    def meta_row(hit, label):
        # One output row for a (tag, similarity) pair returned by most_similar.
        tag, score = hit
        return [meta_df.index[tag], label, score,
                meta_df.title[tag], meta_df.abstract[tag], meta_df.pdf_json_files[tag]]

    similar_to_target = []
    for i in range(len(target_df)):
        sim_test = doc_vectors.most_similar(positive=[d2v_target[i]], topn=3)

        # .iloc keeps the row aligned with d2v_target[i] whatever the index labels are.
        similar_to_target.append([i, 'target', 1,
                                  target_df.title.iloc[i],
                                  target_df.abstract.iloc[i],
                                  target_df.pdf_json_files.iloc[i]])

        if sim_test:
            similar_to_target.append(meta_row(sim_test[0], 'most similar'))

        # Keep only the first of the 2nd/3rd neighbours that is not already a target article.
        for rank, label in fallbacks:
            if rank < len(sim_test) and meta_df.title[sim_test[rank][0]] not in target_titles:
                similar_to_target.append(meta_row(sim_test[rank], label))
                break

    # Raw result of the similarity search: target rows followed by their neighbours.
    df_colum = ['original_index', 'original_db', 'similarity_percentage', 'title', 'abstract', 'pdf_json_files']
    similar_to_target_df = pd.DataFrame(similar_to_target, columns=df_colum)

    # Remove duplicated titles, keeping the first occurrence.
    new_docs_target_df = (similar_to_target_df
                          .drop_duplicates(subset='title', keep='first')
                          .reset_index(drop=True))

    # Keep only the rows that did not come from the target table.
    is_new = new_docs_target_df['original_db'] != 'target'
    not_target = new_docs_target_df[is_new].reset_index(drop=True)

    print('From the orginal similarity test, we get a total of ' + str(len(similar_to_target_df)) +' articles, counting the target ones and their most similars from metadata.')
    print('After filtering the duplicates from that dataframe, we get a total of ' + str(len(new_docs_target_df)) +' articles.')
    print('Finally, after filtering the target ones, we end with a total of ' + str(len(not_target)) +' new possible articles for the target table.')

    return new_docs_target_df, not_target

**To get relevant documents checking json files:**

In [None]:
def get_relevant_docs(dataframe, target, base_path=None):
    """
    Select the articles whose json body text mentions a target subject.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'original_index', 'original_db' and
        'pdf_json_files' (json file name relative to ``base_path``).
        Rows are read by position.
    target : str
        Substring searched for in the cleaned and stemmed paragraphs, so it
        should itself be a stem (e.g. the output of ``stem_text``).
    base_path : str, optional
        Prefix prepended to every json file name. Defaults to the
        notebook-level variable ``path``.

    Returns
    -------
    list of lists, one per relevant article:
        [0] = original index,
        [1] = original body text (all paragraphs concatenated),
        [2] = value of 'original_db' (target or not)

    Articles whose json file is missing, unreadable, or lacks the expected
    'body_text' structure are skipped. The number of relevant docs is printed.
    """
    if base_path is None:
        # Fall back to the dataset root defined at notebook level.
        base_path = path

    # Applied in this order to every lower-cased paragraph before stemming.
    cleaning_steps = (strip_numeric, strip_punctuation, strip_multiple_whitespaces,
                      remove_stopwords, strip_short)

    related_docs = []
    for i in range(len(dataframe)):
        ori_ind = dataframe.original_index.iloc[i]
        ori_tab = dataframe.original_db.iloc[i]
        json_file = dataframe.pdf_json_files.iloc[i]
        try:
            # A missing file name (NaN) makes the concatenation raise TypeError.
            with open(base_path + json_file, 'r', encoding='utf-8') as myfile:
                body = json.load(myfile)['body_text']
            # One entry per paragraph, so the search runs paragraph by paragraph.
            just_text = [part['text'] for part in body]
            clean_body = [text.lower() for text in just_text]
            for step in cleaning_steps:
                clean_body = [step(text) for text in clean_body]
            stem_body = [stem_text(text) for text in clean_body]
        except (TypeError, OSError, ValueError, KeyError, AttributeError):
            # No usable json for this article: skip it, but let unrelated
            # errors (e.g. NameError, KeyboardInterrupt) surface.
            continue

        if any(target in paragraph for paragraph in stem_body):
            related_docs.append([ori_ind, ''.join(just_text), ori_tab])

    print('You have ' + str(len(related_docs)) + ' relevant docs')
    return related_docs

**To build a final dataframe of the new docs for the target table:**

In [None]:
def build_relevant_docs_df(meta_df, relevant_docs_list):
    """
    Join the relevant documents with their metadata columns.

    meta_df is the metadata dataframe. relevant_docs_list holds one list per
    relevant doc with:
        [0] = original index (used as a label into the meta_df columns),
        [1] = original body text,
        [2] = if target or not.

    Returns a dataframe with one row per relevant doc and the columns
    'original_index', the metadata columns listed below, 'body_text' and
    'target_or_not'.

    NOTE(review): get_similar_docs stores the target-table row position as
    'original_index' for 'target' rows, so the metadata looked up here for
    those rows may belong to a different article -- confirm against caller.
    """
    meta_columns = ['publish_time', 'title', 'abstract', 'cord_uid', 'doi',
                    'journal', 'url', 'pdf_json_files', 'date']

    rows = []
    for doc in relevant_docs_list:
        index_rev = doc[0]
        # Same label-based lookup for every metadata column, in column order.
        row = [index_rev]
        row.extend(getattr(meta_df, column)[index_rev] for column in meta_columns)
        row.append(doc[1])
        row.append(doc[2])
        rows.append(row)

    df_colum = ['original_index'] + meta_columns + ['body_text', 'target_or_not']
    return pd.DataFrame(rows, index=range(len(rows)), columns=df_colum)

### Loading the metadata table

In [None]:
# Root folder of the CORD-19 dataset; the json-reading helper functions also use `path`.
path = '../input/CORD-19-research-challenge/'
meta_df = pd.read_csv(f'{path}metadata.csv')

In [None]:
# Size and column names of the full metadata table.
print(f'There are {len(meta_df)} articles in total')
print(f"Cols names: {meta_df.columns}")

## Loading filtered metadata file
It has the stemmed abstracts

In [None]:
# Load the filtered metadata (stemmed abstracts) into a dataframe.
# NOTE: unpickling can run arbitrary code -- only load files from a trusted source.
path2 = '../input/stemmed-meta-and-risk-targets-doc2vec-model/'
# The context manager closes the file handle that the bare open() call left open.
with open(path2 + 'meta_stemmed', 'rb') as meta_file:
    meta = pickle.load(meta_file)
meta.head()

In [None]:
# Size and column names of the filtered metadata table.
print(f'There are {len(meta)} articles in total')
print(f"Cols names: {meta.columns}")

### Making the corpus for the filtered metadata

In [None]:
# Split every abstract of the filtered metadata on whitespace into a list of tokens.
corpus_meta = [abstract.split() for abstract in meta.abstract.tolist()]
print(corpus_meta[0])

## Loading the doc2vec model created with the filtered metadata
This model is implemented with epochs=200

In [None]:
# NOTE: unpickling can run arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle that the bare open() call left open.
with open(path2 + 'd2v_model_saved.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Processing the diabetes target

## Loading the stemmed dataframe of the target risk

In [None]:
# NOTE: unpickling can run arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle that the bare open() call left open.
with open(path2 + 'diabetes_stemmed', 'rb') as diabetes_file:
    diabetes_df = pickle.load(diabetes_file)
diabetes_df.head()

In [None]:
# Number of articles in the diabetes target table.
len(diabetes_df)

### Applying the model to the target dataset

Making the corpus for the diabetes target:

In [None]:
# Split every target abstract on whitespace into a list of tokens.
corpus_diabetes = [abstract.split() for abstract in diabetes_df.abstract.tolist()]
print(corpus_diabetes[0])

In [None]:
# Infer one doc2vec vector per diabetes abstract with the loaded model.
# The model's random state is reseeded before every call (presumably so that
# repeated runs give the same vectors).
d2v_diabetes = []
for abstract_tokens in corpus_diabetes:
    model.random.seed(0)
    inferred = model.infer_vector(abstract_tokens, epochs=200)
    d2v_diabetes.append(inferred)

d2v_diabetes[0]

## Getting similar docs

Running the function and obtaining **new_docs_target_df, not_target**

In [None]:
# Returns the de-duplicated table (target articles + similar docs) and the
# subset of it that did not come from the target table.
diabetes_new_df, dia_not_target = get_similar_docs(diabetes_df, meta, model, d2v_diabetes)

### change to a dataframe that includes original target articles.

In [None]:
# Preview the first two rows of the combined table.
diabetes_new_df.head(2)

In [None]:
# Non-null count per column of the new dataframe; the 'pdf_json_files' entry
# is the number of articles that do have a json file.
diabetes_new_df.notna().sum()

In [None]:
n_target = len(diabetes_df)
n_total = len(diabetes_new_df)
# Count the articles that have a json file instead of subtracting a
# hard-coded number of missing files (previously the literal 72).
n_with_json = diabetes_new_df.pdf_json_files.notna().sum()

print(n_target)
print(n_total)
print(n_total - n_target)
print('we only run the topic test with this many articles because of no json file:')
print(n_with_json)

In [None]:
# Non-null count per column of the original target table (not the new
# dataframe); shows how many target articles have a json file.
diabetes_df.notna().sum()

**For the diabetes topic:**

After the similarity test:
* We got 64 new possible target articles, one for each target article.
* Of the total 128 articles, only 56 have a json file available.  

* From the original target table, only 31 articles have a json file available, 
* So we can apply the relevance function to only 25 new articles.

**After running the relevance function:**
* There are 27 original target articles and 9 new articles identified as relevant



## Getting the relevant documents based on their body text

In [None]:
# Show the stemmed form of the keyword; body texts are stemmed before the search.
stem_text('diabetes')

In [None]:
# Relevant docs among the target articles plus their similar docs.
dia_relevant_docs = get_relevant_docs(diabetes_new_df, 'diabet')

In [None]:
# Relevant docs among the newly found (non-target) articles only.
dia_new_relevant = get_relevant_docs(dia_not_target, 'diabet')

In [None]:
### Modifying function so that it works with target table dataframe
def get_relevant_docs_mod(dataframe, target, base_path=None):
    """
    Select the articles of a target table whose json body text mentions a subject.

    Variant of get_relevant_docs that does not read the 'original_index' and
    'original_db' columns, so it can be used on the target table directly.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the column 'pdf_json_files' (json file name relative to
        ``base_path``). Rows are read by position.
    target : str
        Substring searched for in the cleaned and stemmed paragraphs, so it
        should itself be a stem (e.g. the output of ``stem_text``).
    base_path : str, optional
        Prefix prepended to every json file name. Defaults to the
        notebook-level variable ``path``.

    Returns
    -------
    list of lists, one per relevant article:
        [0] = indices of the paragraphs that contain the target,
        [1] = cleaned (not stemmed) paragraphs of the body text

    Articles whose json file is missing, unreadable, or lacks the expected
    'body_text' structure are skipped. The number of relevant docs is printed.
    """
    if base_path is None:
        # Fall back to the dataset root defined at notebook level.
        base_path = path

    # Applied in this order to every lower-cased paragraph before stemming.
    cleaning_steps = (strip_numeric, strip_punctuation, strip_multiple_whitespaces,
                      remove_stopwords, strip_short)

    related_docs = []
    for json_file in dataframe.pdf_json_files:
        try:
            # A missing file name (NaN) makes the concatenation raise TypeError.
            with open(base_path + json_file, 'r', encoding='utf-8') as myfile:
                body = json.load(myfile)['body_text']
            # One entry per paragraph, so the search runs paragraph by paragraph.
            clean_body = [part['text'].lower() for part in body]
            for step in cleaning_steps:
                clean_body = [step(text) for text in clean_body]
            stem_body = [stem_text(text) for text in clean_body]
        except (TypeError, OSError, ValueError, KeyError, AttributeError):
            # No usable json for this article: skip it, but let unrelated
            # errors (e.g. NameError, KeyboardInterrupt) surface.
            continue

        # Positions of the paragraphs in which the target appears.
        relevant_parts = [t for t, paragraph in enumerate(stem_body) if target in paragraph]
        if relevant_parts:
            related_docs.append([relevant_parts, clean_body])

    print('You have ' + str(len(related_docs)) + ' relevant docs')
    return related_docs

In [None]:
# Relevant docs within the original diabetes target table.
dia_relevant_target = get_relevant_docs_mod(diabetes_df, 'diabet')


In [None]:
# Compare the number of relevant new docs with the relevant docs of the combined table.
print(f'New articles added: {len(dia_new_relevant)}')
print(f'Total relevant target articles: {len(dia_relevant_docs)}')

## Creating a table of the relevant docs

In [None]:
# Rows of the similarity dataframe whose original_index equals that of the
# first relevant doc; raises IndexError if no relevant doc was found.
diabetes_new_df[diabetes_new_df.original_index == dia_relevant_docs[0][0]]
# The 'similarity_percentage' column shows the similarity of this article.

In [None]:
# Attach the metadata columns of the filtered table to every relevant doc.
dia_relevant_df = build_relevant_docs_df(meta, dia_relevant_docs)
dia_relevant_df.head()

In [None]:
# Number of relevant docs per origin label ('target', 'most similar', ...).
dia_relevant_df.target_or_not.value_counts()

In [None]:
#pickle.dump(dia_relevant_df, open("diabetes_new_target.pkl", "wb"))

# Processing the hypertension target

In [None]:
# NOTE: unpickling can run arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle that the bare open() call left open.
with open(path2 + 'hypertension_stemmed', 'rb') as hyper_file:
    hyper_df = pickle.load(hyper_file)
hyper_df.head()

In [None]:
# Number of articles in the hypertension target table.
len(hyper_df)

In [None]:
# Split every hypertension abstract on whitespace; the token lists are the
# input of the doc2vec inference below.
corpus_hyper = [abstract.split() for abstract in hyper_df.abstract.tolist()]
print(corpus_hyper[0])

In [None]:
# Infer one doc2vec vector per hypertension abstract with the loaded model.
# The model's random state is reseeded before every call (presumably so that
# repeated runs give the same vectors).
d2v_hyper = []
for abstract_tokens in corpus_hyper:
    model.random.seed(0)
    inferred = model.infer_vector(abstract_tokens, epochs=200)
    d2v_hyper.append(inferred)

d2v_hyper[0]

In [None]:
# Returns the de-duplicated table (target articles + similar docs) and the
# subset of it that did not come from the target table.
hyper_new_docs, hyper_not_target = get_similar_docs(hyper_df, meta, model, d2v_hyper)

In [None]:
# Preview the combined table of target articles and similar docs.
hyper_new_docs.head()

In [None]:
# Show the stemmed form of the keyword; body texts are stemmed before the search.
stem_text('hypertension')

In [None]:
# Relevant docs among the target articles plus their similar docs.
hyper_relevant_docs = get_relevant_docs(hyper_new_docs, 'hypertens')

In [None]:
# Relevant docs among the newly found (non-target) articles only.
hyper_new_relevant = get_relevant_docs(hyper_not_target, 'hypertens')

In [None]:
# Relevant docs within the original hypertension target table.
hyper_relevant_target = get_relevant_docs_mod(hyper_df, 'hypertens')

In [None]:
# Rows of the similarity dataframe whose original_index equals that of the
# first relevant doc; raises IndexError if no relevant doc was found.
hyper_new_docs[hyper_new_docs.original_index == hyper_relevant_docs[0][0]]
# The 'similarity_percentage' column shows the similarity of this article.

In [None]:
# Attach the metadata columns of the filtered table to every relevant doc.
hyper_relevant_df = build_relevant_docs_df(meta, hyper_relevant_docs)
hyper_relevant_df.head()

In [None]:
#pickle.dump(hyper_relevant_df, open("hypertension_new_target.pkl", "wb"))

# Processing the smoking target

In [None]:
# NOTE: unpickling can run arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle that the bare open() call left open.
with open(path2 + 'smoking_stemmed', 'rb') as smoke_file:
    smoke_df = pickle.load(smoke_file)
smoke_df.head()

In [None]:
# Number of articles in the smoking target table.
len(smoke_df)

In [None]:
# Split every smoking abstract on whitespace; the token lists are the
# input of the doc2vec inference below.
corpus_smoke = [abstract.split() for abstract in smoke_df.abstract.tolist()]
print(corpus_smoke[0])

In [None]:
# Infer one doc2vec vector per smoking abstract with the loaded model.
# The model's random state is reseeded before every call (presumably so that
# repeated runs give the same vectors).
d2v_smoke = []
for abstract_tokens in corpus_smoke:
    model.random.seed(0)
    inferred = model.infer_vector(abstract_tokens, epochs=200)
    d2v_smoke.append(inferred)

d2v_smoke[0]

In [None]:
# Returns the de-duplicated table (target articles + similar docs) and the
# subset of it that did not come from the target table.
smoke_new_docs, smoke_not_target = get_similar_docs(smoke_df, meta, model, d2v_smoke)

In [None]:
# Preview the combined table of target articles and similar docs.
smoke_new_docs.head()

In [None]:
# Non-null count per column; the 'pdf_json_files' entry is the number of
# articles that have a json file.
smoke_new_docs.notna().sum()

In [None]:
# Show the stemmed form of the keyword; body texts are stemmed before the search.
stem_text('smoke')

In [None]:
# Relevant docs among the target articles plus their similar docs.
smoke_relevant_docs = get_relevant_docs(smoke_new_docs, 'smoke')

In [None]:
# Relevant docs among the newly found (non-target) articles only.
smoke_new_relevant = get_relevant_docs(smoke_not_target, 'smoke')

In [None]:
# Relevant docs within the original smoking target table.
smok_target_relevant = get_relevant_docs_mod(smoke_df, 'smoke')

In [None]:
# Attach the metadata columns of the filtered table to every relevant doc.
smoke_relevant_df = build_relevant_docs_df(meta, smoke_relevant_docs)
smoke_relevant_df.head()

In [None]:
# Number of relevant docs per origin label ('target', 'most similar', ...).
smoke_relevant_df.target_or_not.value_counts()

In [None]:
#pickle.dump(smoke_relevant_df, open("smoking_new_target.pkl", "wb"))