In [28]:
from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords

import re
from string import punctuation
import json
import nltk
import numpy as np
from umap import UMAP

In [5]:
model = KeyedVectors.load_word2vec_format(
    '/Users/prush/PycharmProjects/thesis/biomedvis/app/static/mlmodels/BioWordVec_PubMed_MIMICIII_d200.txt',
    binary=True)

In [10]:
def preprocess_sentence_returns_list(text):
    stop_words = set(stopwords.words('english'))
    # stop_words.extend(['from', 'subject', 're', 'edu', 'use','figure', 'fig'])
    # stemmer = SnowballStemmer("english")
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    cleanr = re.compile('\[(.*?)\]')
    text = re.sub(cleanr, '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub("[0-9]{2}", '', text)
    text = text.replace('/', ' ')
    text = text.replace('\'', ' \' ')
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
    text = re.sub(pat, '', text)
    text = text.lower()

    # Tokenise the text - try with bert tokeniser later
    words = word_tokenize(text)
    # words = text.split()
    # text = ' '.join([stemmer.stem(word) for word in words])
    # words = [stemmer.stem(word) for word in words]
    text = ' '.join([lem.lemmatize(word) for word in words])
    # text = ' '.join(words)
    text = ' '.join([w for w in text.split() if len(w) > 1])
    text = text.replace('/`/', '')
    text = text.replace('/"/', '')
    text = text.replace("/'/", "")

    tokens = [token for token in word_tokenize(text) if token not in punctuation and token not in stop_words]
    return tokens

In [37]:
def get_embedding(text):

    emb = [np.array(model[t]).tolist() for t in text if t in model.key_to_index]
    word_emb = np.mean(emb, axis=0)
    return word_emb

In [59]:
def load_data():
    with open('../articles_data/all_articles_with_thumbnail_metadata.json') as f:
        papers = json.load(f)

    # To encode the papers, we must combine the title and the abstracts to a single string
    article_texts = [preprocess_sentence_returns_list(paper['article_title'] + ' ' + paper['abstract']) for paper in
                     papers]
    article_titles = [paper['article_title'] for paper in papers]
    emb = [list(get_embedding(article)) for article in article_texts]
    return emb, article_titles


In [62]:
corpus_embeddings, article_titles = load_data()
umap_embeddings = UMAP(n_neighbors=5, n_components=2, metric='cosine', random_state=42)
low_dim_embeddings = umap_embeddings.fit_transform(corpus_embeddings)
low_dim_embeddings = low_dim_embeddings.tolist()
print(low_dim_embeddings)
result_high = dict(zip(article_titles, corpus_embeddings))
with open('../embeddings/high_dim/biowordvec_high_dim.json', 'x') as f:
        json.dump(result_high, f)



[[8.195077896118164, 3.98250675201416], [3.8977482318878174, 4.275808334350586], [3.6691184043884277, 6.704298496246338], [4.513609409332275, 7.0889892578125], [5.639538288116455, 2.183544635772705], [5.700282096862793, 5.997064590454102], [6.414506912231445, 4.312767028808594], [0.6587456464767456, 3.981008529663086], [8.556120872497559, 4.587789058685303], [3.913581371307373, 2.071776866912842], [5.4987640380859375, 3.4789390563964844], [3.1466689109802246, 2.3796122074127197], [8.384326934814453, 4.09692907333374], [4.045699119567871, 3.13802433013916], [3.0664050579071045, 6.4905853271484375], [3.296949625015259, 5.726902008056641], [2.951202869415283, 4.952203750610352], [3.797516107559204, 2.802835702896118], [5.635402202606201, 6.291025638580322], [7.450741767883301, 2.7613208293914795], [3.178227424621582, 3.578030824661255], [5.877899646759033, 5.164550304412842], [2.395298480987549, 2.9500789642333984], [3.2115116119384766, 5.044594764709473], [5.740634441375732, 5.4747781753

In [63]:
result_low = dict(zip(article_titles, low_dim_embeddings))
with open('../embeddings/low_dim/biowordvec_low_dim.json', 'x') as f:
    json.dump(result_low, f)