### Steps:

- represent the documents and the search query as TF-IDF vectors

- compute the cosine similarity between the search query and each document

- prepare the document-term matrix (indexing) for fast access

- get the most similar documents 

In [1]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
## Load the data
# Read the results CSV into a DataFrame; the path is relative to the
# current working directory.
dataframe = pd.read_csv('./flaskapp/models/output/results_dataframe.csv')

In [3]:


# Extract the 'processed_text' column as a plain Python list of values;
# this list is the corpus used when calculating the TF-IDF weights.
text_data = dataframe['processed_text'].to_numpy().tolist()

In [6]:
# Preview the first five entries of the corpus list.
text_data[:5]

['pandemic-09-h1n1 virus cause pandemic start 2 quarter 2009 world prepare face pandemic anticipate decade country include india detail pandemic preparedness plan good ahead actual occurrence infection rapidly spread country 2 3 month national tactic slow importation international air traveler slow spread city major town 75 infect person urban dweller suggest effort concentrate urban community general illness pandemic influenza similar endemic/seasonal influenza insufficient epidemiological clinical datum hope unprecedented experience manage pandemic encourage government india plan confront endemic/seasonal influenza systematically pandemic reach peak september/october decline',
 'aim study evaluation clinical radiologic outcome parallel graft treatment patient fail prior endovascular aneurysm repair type ia endoleak methods prospectively collect clinical radiologic datum consecutive patient prior endovascular aneurysm repair evidence type ia endoleak analyze patient treat january 2009

In [9]:

# Calculate TF-IDF matrix
def tf_idf(search_keys, data):
    """Fit a TF-IDF model on ``data`` and encode the query with it.

    Args:
        search_keys: The search query, passed to the vectorizer as a
            single document.
        data: The documents the vectorizer is fitted on.

    Returns:
        A ``(query_weights, document_weights)`` tuple: the transformed
        query and the weight matrix obtained from fitting on ``data``.
    """
    vectorizer = TfidfVectorizer()
    # Fit on the corpus first so the query is encoded with the vocabulary
    # learned from the documents.
    document_weights = vectorizer.fit_transform(data)
    query_weights = vectorizer.transform([search_keys])
    return query_weights, document_weights

# Calculate the cosine similarity between search query and TF-IDF vectors
def cos_similarity(search_query_weights, tfidf_weights_matrix):
    """Score the documents against the search query.

    Args:
        search_query_weights: TF-IDF weights of the search query.
        tfidf_weights_matrix: TF-IDF weights of the documents.

    Returns:
        The first row of the pairwise cosine-similarity matrix between
        the query weights and the document weights.
    """
    # Only the first row is kept, i.e. the scores for the first query row.
    return cosine_similarity(search_query_weights, tfidf_weights_matrix)[0]

# Calculate number of relevant vectors
def calculate_num_vectors(cosine_similarity):
    """Count the entries of ``cosine_similarity`` that are not equal to zero.

    Args:
        cosine_similarity: Iterable of similarity scores.

    Returns:
        The number of scores that differ from ``0.0``.
    """
    return sum(1 for score in cosine_similarity if score != 0.0)

# Calculate the most relevant vectors
def most_similar(similarity_list, N):
    """Return the indices of the ``N`` highest scores, best first.

    The input is never modified. Ties are broken in favour of the lower
    index, which matches repeated ``np.argmax`` selection.

    Args:
        similarity_list: Sequence or array of similarity scores.
        N: Maximum number of indices to return. Values larger than the
            number of scores are clamped; values ``<= 0`` yield ``[]``.

    Returns:
        A list of integer positions into ``similarity_list`` ordered by
        descending score, containing no duplicates.
    """
    scores = np.asarray(similarity_list, dtype=float).ravel()
    top_n = min(max(int(N), 0), scores.size)
    if top_n == 0:
        return []

    # Negating yields a new array (the caller's data stays untouched) and
    # turns the ascending stable sort into a descending ranking in which
    # equal scores keep their original index order.
    ranked_indices = np.argsort(-scores, kind="stable")
    return ranked_indices[:top_n].tolist()

# Create weights at specific index for quick retrieval
def create_matrix_dict(cosine_similarity):
    """Map each position in ``cosine_similarity`` to the value stored there.

    Args:
        cosine_similarity: Iterable of similarity scores.

    Returns:
        A dict whose keys are the 0-based positions and whose values are
        the corresponding entries, in iteration order.
    """
    return dict(enumerate(cosine_similarity))

# -----------
# Return the rows of `dataframe` that are relevant to the search term
def return_relevant_recipes(search_term):
    """Return the rows of ``dataframe`` relevant to ``search_term``, best first.

    The query and the module-level ``text_data`` corpus are TF-IDF encoded,
    each document is scored against the query by cosine similarity, and
    the rows whose score is non-zero are returned in the order produced by
    ``most_similar``.

    Args:
        search_term: The search query string.

    Returns:
        A DataFrame with the columns of the module-level ``dataframe``,
        keeping its index labels and column dtypes. When no document has
        a non-zero score, an empty DataFrame with those columns is
        returned.
    """
    # Convert the query and the documents to vector representations.
    search, matrix = tf_idf(search_term, text_data)

    # Cosine similarity of the query against each document.
    cosine_sim_list = cos_similarity(search, matrix)

    # Only documents with a non-zero score count as relevant.
    num_relevant_vectors = calculate_num_vectors(cosine_sim_list)

    # Positions of the relevant documents.
    list_of_most_similar = most_similar(cosine_sim_list, num_relevant_vectors)

    ### Specify the required columns here
    columns = dataframe.columns

    # One positional selection replaces per-row concatenation; it also
    # handles the no-match case, where an empty list selects zero rows.
    # Assumes row i of `dataframe` corresponds to text_data[i].
    return dataframe.iloc[list(list_of_most_similar)][columns]

In [14]:
# Example query: retrieve the rows ranked as relevant to the term 'covid'.
return_relevant_recipes('covid')

Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,...,pmc_json_files,url,s2_id,language,processed_text,words_count,y,keywords,tsne_embedding_coord1,tsne_embedding_coord2
54,101466,dblg9wfk,,WHO,Target Virus or Target Ourselves for COVID-19 ...,,,,unk,"The COVID-19 pandemic, after it was reported i...",...,,,2.15747e+08,en,covid-19 pandemic report december 2019 highly ...,48,18,"['covid-', 'disease', 'virus', 'cell', 'study'...",-4.83305,3.06372
704,148646,69nfi379,6b45aa983702a9425fd6604470b931906f12b19c,Medline; PMC,COVID-19: POSTMORTEM DIAGNOSTIC AND BIOSAFETY ...,10.1097/paf.0000000000000567,PMC7202125,32379077,cc-by-nc-nd,As a result of the 2019 novel human coronaviru...,...,document_parses/pmc_json/PMC7202125.xml.json,https://doi.org/10.1097/paf.0000000000000567; ...,2.18532e+08,en,result 2019 novel human coronavirus covid-19 g...,105,18,"['covid-', 'disease', 'virus', 'cell', 'study'...",-5.479,0.0222779
555,109522,9k0741ct,,WHO,Clinical and imaging features of COVID-19,,,,unk,"Since December 2019, multiple cases of 2019 co...",...,,,2.16554e+08,en,december 2019 multiple case 2019 coronavirus d...,92,18,"['covid-', 'disease', 'virus', 'cell', 'study'...",-6.28544,0.531187
85,156229,zzkqb0u2,056f35bf25005ec3974c0633a63492b675a17849,Medline; PMC,Ideas for how informaticians can get involved ...,10.1186/s13040-020-00213-y,PMC7216865,32419848,cc-by,The coronavirus disease 2019 (COVID-19) pandem...,...,document_parses/pmc_json/PMC7216865.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/32419848/;...,2.18605e+08,en,coronavirus disease 2019 covid-19 pandemic sig...,61,18,"['covid-', 'disease', 'virus', 'cell', 'study'...",11.9508,-2.46405
403,69749,6n29eryp,56344abdbb723e92ee5d28d38f6bd87d37f35f0e,MedRxiv,"Psychiatric symptoms, risk, and protective fac...",10.1101/2020.07.03.20144931,,,medrxiv,This study investigated psychiatric symptoms (...,...,,http://medrxiv.org/cgi/content/short/2020.07.0...,2.20335e+08,en,study investigate psychiatric symptom depressi...,166,18,"['covid-', 'disease', 'virus', 'cell', 'study'...",-13.1674,3.70276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758,165987,443gyw3e,878c67df99d516cc7670e4de57d54dd393880b6b,Elsevier; Medline; PMC,Telemedicine for ENT: effect on quality of car...,10.1016/j.anorl.2020.06.014,PMC7306717,32624390,els-covid,Abstract Aims To assess the benefit of telemed...,...,,https://api.elsevier.com/content/article/pii/S...,2.19958e+08,en,aims assess benefit telemedicine consultation ...,137,87,,-23.9344,-15.7141
571,105282,5d0gev9e,,WHO,An international multicentre study of protocol...,,,,unk,BACKGROUND: The outbreak of Covid-19 has vastl...,...,,,2.18855e+08,en,background outbreak covid-19 vastly increase o...,170,18,"['covid-', 'disease', 'virus', 'cell', 'study'...",-27.7667,2.31984
45,191775,dbxufi3m,38197b689df3566474ed08974a9a6cb220686ab1,Medline; PMC,Vertebrates on the brink as indicators of biol...,10.1073/pnas.1922686117,PMC7306750,32482862,cc-by,The ongoing sixth mass species extinction is t...,...,document_parses/pmc_json/PMC7306750.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/32482862/;...,2.1917e+08,en,ongoing 6 mass species extinction result destr...,140,18,"['covid-', 'disease', 'virus', 'cell', 'study'...",22.1392,18.6933
889,188003,me6qusdz,eeb37b751a171b7d78b53b4f5b32d0a10c877335,Elsevier; Medline; PMC,Pediatrician attitudes toward and experiences ...,10.1016/j.acap.2020.05.004,PMC7207114,32437881,els-covid,Abstract Background: The American Academy of P...,...,,https://www.sciencedirect.com/science/article/...,2.18552e+08,en,background american academy pediatrics 2015 po...,156,18,"['covid-', 'disease', 'virus', 'cell', 'study'...",-23.3443,9.8803
