In [6]:
import json
import re
from string import punctuation

import nltk
import numpy as np
from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from umap import UMAP

In [7]:
model = KeyedVectors.load_word2vec_format(
    '/Users/prush/PycharmProjects/thesis/biomedvis/app/static/mlmodels/crawl-300d-2M.vec',
    binary=False)

In [12]:
def preprocess_sentence_returns_list(text):
    stop_words = set(stopwords.words('english'))
    # stop_words.extend(['from', 'subject', 're', 'edu', 'use','figure', 'fig'])
    # stemmer = SnowballStemmer("english")
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    cleanr = re.compile('\[(.*?)\]')
    text = re.sub(cleanr, '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub("[0-9]{2}", '', text)
    text = text.replace('/', ' ')
    text = text.replace('\'', ' \' ')
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
    text = re.sub(pat, '', text)
    text = text.lower()

    # Tokenise the text - try with bert tokeniser later
    words = word_tokenize(text)
    # words = text.split()
    # text = ' '.join([stemmer.stem(word) for word in words])
    # words = [stemmer.stem(word) for word in words]
    text = ' '.join([lem.lemmatize(word) for word in words])
    # text = ' '.join(words)
    text = ' '.join([w for w in text.split() if len(w) > 1])
    text = text.replace('/`/', '')
    text = text.replace('/"/', '')
    text = text.replace("/'/", "")

    tokens = [token for token in word_tokenize(text) if token not in punctuation and token not in stop_words]
    return tokens

In [13]:
def get_embedding(text):
    emb = [np.array(model[t]).tolist() for t in text if t in model.key_to_index]
    word_emb = np.mean(emb, axis=0)
    return word_emb

In [14]:
def load_data():
    with open('../articles_data/all_articles_with_thumbnail_metadata.json') as f:
        papers = json.load(f)

    # To encode the papers, we must combine the title and the abstracts to a single string
    article_texts = [preprocess_sentence_returns_list(paper['article_title'] + ' ' + paper['abstract']) for paper in
                     papers]
    article_titles = [paper['article_title'] for paper in papers]
    emb = [list(get_embedding(article)) for article in article_texts]
    return emb, article_titles

In [15]:
corpus_embeddings, article_titles = load_data()
umap_embeddings = UMAP(n_neighbors=5, n_components=2, metric='cosine', random_state=42)
low_dim_embeddings = umap_embeddings.fit_transform(corpus_embeddings)
low_dim_embeddings = low_dim_embeddings.tolist()
print(low_dim_embeddings)
result_high = dict(zip(article_titles, corpus_embeddings))
with open('../embeddings/high_dim/generic_fasttext_high_dim.json', 'x') as f:
    json.dump(result_high, f)

OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


[[5.566387176513672, 6.037216663360596], [10.400812149047852, 5.693390369415283], [12.402023315429688, 4.308645725250244], [11.111598014831543, 4.491008281707764], [5.75358247756958, 3.497481346130371], [8.402742385864258, 6.307307243347168], [6.334397315979004, 6.739317893981934], [10.809039115905762, 2.9369852542877197], [5.906482696533203, 5.125293731689453], [6.756939888000488, 2.434736967086792], [12.347725868225098, 4.519011974334717], [7.819489479064941, 3.223146438598633], [5.625209808349609, 5.755670070648193], [6.9903764724731445, 4.004396438598633], [11.79672622680664, 4.524794101715088], [11.821490287780762, 4.773310661315918], [11.770257949829102, 4.981322765350342], [7.247778415679932, 3.0856595039367676], [9.016688346862793, 5.494049072265625], [3.4761247634887695, 5.047966003417969], [9.965956687927246, 5.416987895965576], [7.943117141723633, 5.658888339996338], [9.828211784362793, 5.017914772033691], [11.56825065612793, 5.460666656494141], [8.061732292175293, 5.7448463

In [16]:
result_low = dict(zip(article_titles, low_dim_embeddings))
with open('../embeddings/low_dim/generic_fasttext_low_dim.json', 'x') as f:
    json.dump(result_low, f)

In [None]:
import json

f = open('../embeddings/low_dimension/biowordvec_low_dim.json')
data = json.load(f)
embeddings = list(data.values())
titles = list(data.keys())


In [None]:
import plotly.express as px

fig = px.scatter(embeddings, x=0, y=1, opacity=1, hover_name=titles)

In [None]:
fig.show()

In [6]:
import json

f = open('../embeddings/low_dimension/biowordvec_low_dim.json')
data = json.load(f)
embeddings = list(data.values())
titles = list(data.keys())


In [7]:
import plotly.express as px

fig = px.scatter(embeddings, x=0, y=1, opacity=1, hover_name=titles)

In [8]:
fig.show()