In [1]:
import os
import csv
import json
from IPython import embed
import scipy
import pickle as pkl
import numpy as np
import gensim
from progiter import ProgIter
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# COVID symptom queries. Each entry is tokenised into a list of words so that
# multi-word symptoms can be passed to gensim as a list of `positive` terms.
phrases = [
    symptom.split(' ')
    for symptom in (
        'fever',
        'dry cough',
        'cough',
        'sore throat',
        'malaise',
        'headache',
        'muscle pain',
        'nasal congestion',
        'shortness of breath',
        'dyspnea',
        'respiratory distress',
        'tachypnea',
        'hypoxia',
        'bilateral opacities',
        'lung infiltrates',
        'acute respiratory distress syndrome',
        'ARDS',
        'pneumonia',
    )
]

### Sources for the embeddings

The embeddings are publicly available but must be downloaded separately: they are too large to upload to the drive.

#### PubMed+PMC model: 
http://evexdb.org/pmresources/vec-space-models/PubMed-and-PMC-w2v.bin

#### Wikipedia+PubMed+PMC model: 
http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin

#### BioASQ model: 
http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts

#### BioWordVec models: 
https://figshare.com/articles/Improving_Biomedical_Word_Embeddings_with_Subword_Information_and_MeSH_Ontology/6882647

#### Clinical Embeddings
https://upenn.box.com/s/s52hsf65c51e3ro0ssx79e6l25qykt0m

#### GloVe embeddings: 
http://nlp.stanford.edu/data/glove.840B.300d.zip


In [3]:
'''
The BioASQ embeddings are provided as two separate files for the words and vectors. Here, we
consolidate them in one file in the word2vec text format that gensim can load:
a "<number of words> <vector size>" header line followed by one "<word> <v1> <v2> ..." line per word.
'''

bioasq_types_path = 'embeddings/word2vecTools/types.txt'
bioasq_vectors_path = 'embeddings/word2vecTools/vectors.txt'
# Written under the file name that the model-loading cell reads
# ('embeddings/word2vecTools/w2v.txt'), so a fresh Restart & Run All picks up this output.
bioasq_w2v_path = 'embeddings/word2vecTools/w2v.txt'

# The header needs the vocabulary size up front, so count the words in a first pass.
with open(bioasq_types_path, encoding='utf-8') as f:
    num_lines = sum(1 for _ in f)

# utf-8 is set explicitly because gensim decodes word2vec files as utf-8 by default,
# whereas a bare open() would use the platform's default encoding.
with open(bioasq_types_path, encoding='utf-8') as f, \
        open(bioasq_vectors_path, encoding='utf-8') as f1, \
        open(bioasq_w2v_path, 'w', encoding='utf-8') as fw:
    vector_size = None
    for line_no, line in enumerate(f, start=1):
        word = line.strip()
        # Parsing to float validates the values; an exhausted vectors file yields [].
        vectors = [float(x) for x in f1.readline().split()]
        if vector_size is None:
            # Take the dimensionality from the data instead of hard-coding it.
            vector_size = len(vectors)
            fw.write('{} {}'.format(num_lines, vector_size))
        if not vectors or len(vectors) != vector_size:
            # Fail loudly rather than write a file gensim would mis-parse later.
            raise ValueError(
                'line {} of {}: expected {} values for word {!r}, found {}'.format(
                    line_no, bioasq_vectors_path, vector_size, word, len(vectors)))
        fw.write('\n')
        fw.write(' '.join([word] + [str(x) for x in vectors]))

    if vector_size is None:
        raise ValueError('{} is empty'.format(bioasq_types_path))
    if f1.readline().strip():
        raise ValueError('{} has more lines than {}'.format(
            bioasq_vectors_path, bioasq_types_path))

In [None]:
'''
Load every embedding model into memory. Loading all of them takes a while (~45 min - 1 hour).
Adjust the paths below to wherever the embeddings were downloaded.
'''

# word2vec binary files
pbm_pmc_model = KeyedVectors.load_word2vec_format(
    'embeddings/w2v/PubMed-and-PMC-w2v.bin', binary=True)
wiki_pbm_pmc_model = KeyedVectors.load_word2vec_format(
    'embeddings/w2v/wikipedia-pubmed-and-PMC-w2v.bin', binary=True)

# BioASQ vectors, read as a word2vec *text* file
bioasq_model = KeyedVectors.load_word2vec_format(
    'embeddings/word2vecTools/w2v.txt', binary=False)

biow2v_model = KeyedVectors.load_word2vec_format(
    'embeddings/biow2v/BioWordVec_PubMed_MIMICIII_d200.vec.bin', binary=True)

# Saved with gensim's native format, so this is a full Word2Vec model, not KeyedVectors
w2v300_model = Word2Vec.load(
    'embeddings/clinical_embeddings/W2V_300/w2v_OA_CR_300d.bin')

extrinsic_model = KeyedVectors.load_word2vec_format(
    'embeddings/biow2v/bio_embedding_extrinsic', binary=True)
intrinsic_model = KeyedVectors.load_word2vec_format(
    'embeddings/biow2v/bio_embedding_intrinsic', binary=True)

# NOTE(review): the GloVe file as distributed has no "<count> <dim>" header line, which
# load_word2vec_format expects - presumably the file was converted beforehand; confirm.
standard_glove = KeyedVectors.load_word2vec_format(
    'embeddings/glove/glove.840B.300d.txt', binary=False)

In [None]:
# Models queried by the similarity cells, in the order their results are stored.
# w2v300_model comes from Word2Vec.load and is therefore a full Word2Vec model; its
# most_similar / n_similarity shortcuts are deprecated in gensim 3.x and removed in
# gensim 4, so it is queried through its KeyedVectors (`.wv`) like the other entries.
# NOTE(review): biow2v_model is loaded but not part of this list - confirm that is intended.
models = [
    pbm_pmc_model,
    wiki_pbm_pmc_model,
    bioasq_model,
    w2v300_model.wv,
    extrinsic_model,
    intrinsic_model,
    standard_glove,
]

In [None]:
'''
This piece of code retrieves the top 20 most similar words/phrases from the vocabulary of each model
for the COVID symptoms.

Result: all_returned_phrases maps each symptom (its words joined by a space) to a list with one
entry per model, in the order of `models`; each entry is that model's most_similar() output.
'''

all_returned_phrases = dict()
for phrase in phrases:
    lowered_phrase = [x.lower() for x in phrase]
    returned_phrases = list()
    for model in models:
        try:
            results = model.most_similar(positive=phrase, topn=20)
        except KeyError:
            # gensim raises KeyError when a query word is not in the vocabulary.
            # Retry lowercased; only KeyError is caught, so interrupting the kernel
            # or a genuine bug is no longer swallowed by the fallback.
            # If the lowercased form is missing too, the KeyError propagates.
            results = model.most_similar(positive=lowered_phrase, topn=20)
        returned_phrases.append(results)
    all_returned_phrases[' '.join(phrase)] = returned_phrases

In [None]:
'''
This piece of code first retrieves the top 20 most similar words/phrases from the vocabulary of each model
for the COVID symptoms.
Then, it computes the cosine similarity between the input and each word/phrase returned across all models.

Result: all_similarities maps each symptom (its words joined by a space) to a list with one Counter
per model, in the order of `models`; each Counter maps a candidate word to its similarity with the
symptom in that model, or to -1 when the model cannot score the pair.
'''

all_similarities = dict()
for phrase in ProgIter(phrases):
    phrase_key = ' '.join(phrase)
    lowered_phrase = [x.lower() for x in phrase]
    all_similarities[phrase_key] = list()

    # Pass 1: pool the candidates returned by every model, remembering for each
    # model which spelling of the query it accepted.
    all_results = Counter()
    queries = list()
    for model in models:
        try:
            results = model.most_similar(positive=phrase, topn=20)
            query = phrase
        except KeyError:
            # Query word missing from this vocabulary as written; retry lowercased.
            # If that is missing too, the KeyError propagates.
            results = model.most_similar(positive=lowered_phrase, topn=20)
            query = lowered_phrase
        queries.append(query)
        for x in results:
            all_results[x[0]] += 1

    # Pass 2: score every pooled candidate in every model, using the spelling that
    # model accepted in pass 1. Scoring with the original casing would give -1 for
    # every candidate in a model that only knows the lowercased form.
    for model, query in zip(models, queries):
        curr_results = Counter()
        for result in all_results:
            try:
                curr_results[result] = model.n_similarity(query, [result])
            except KeyError:
                # Candidate came from another model and is not in this vocabulary.
                curr_results[result] = -1
        all_similarities[phrase_key].append(curr_results)