This notebook compares three different ways of obtaining document embeddings:  

- OpenAI (using text-embedding-3-large)
- fasttext (using a 300-dimension embedding learned from the full corpus of papers)
- sentence_transformers (using all-mpnet-base-v2)

We select a random subset of 250 papers that are short enough to fit in the OpenAI context window (7500 tokens).

We compare the results using representational similarity analysis, in which we first compute the correlation matrix across papers over each embedding, and the compute the correlation between the upper triangle elements in the correlation matrix.  This tells us how well the similarities between papers are matched across embeddings.

TL/DR: the fasttext embedding was very close to the OpenAI embedding (r = 0.964).  Given that it's free we will use that for subsequent analyses.

The fasttext embedding is saved at `<datadir>/fulltext.bin` for reuse.

In [26]:
from pathlib import Path
from ontology_learner.lit_mining.litmining_utils import get_fulltext_from_pmcid_json
import re
import string
from nltk.corpus import stopwords
import pickle
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI
import os
import random
import numpy as np
import fasttext
import json


In [2]:
datadir = Path('/Users/poldrack/Dropbox/data/ontology-learner/data/json')
datafiles = list(datadir.glob('*.json'))
print(f'found {len(datafiles)} data files')


found 132257 data files


we need to generate a single 

In [3]:

fulltext = {}

def process_fulltext(fulltext):
    # remove all newlines
    fulltext = fulltext.replace('\n', ' ')
    # remove all punctuation
    fulltext = fulltext.translate(str.maketrans('', '', string.punctuation))
    # remove all numbers
    fulltext = re.sub(r'\d+', '', fulltext)
    # remove stopwords
    fulltext = ' '.join([word for word in fulltext.split() if word not in stopwords.words('english')])
    return fulltext

outfile = Path('/Users/poldrack/Dropbox/data/ontology-learner/data/fulltext.pkl')
if not outfile.exists():
    for datafile in tqdm(datafiles):
        pmcid = datafile.stem
        fulltext[pmcid] = get_fulltext_from_pmcid_json(pmcid, datadir)
        # processed_text = process_fulltext(fulltext[pmcid])
        
    with open(outfile, 'wb') as f:
        pickle.dump(fulltext, f)
else:
    with open(outfile, 'rb') as f:
        fulltext = pickle.load(f)

# %%


In [4]:
print(f'found {len(fulltext)} fulltext')

found 132257 fulltext


Use openai embedding model to get embeddings for each paper.

In [5]:
tokenlength_file = Path('/Users/poldrack/Dropbox/data/ontology-learner/data/tokenlengths.pkl')
if not tokenlength_file.exists():
    import tiktoken
    enc = tiktoken.get_encoding("o200k_base")

    tokenlengths = {}
    for pmcid in tqdm(fulltext):
        foo = enc.encode(fulltext[pmcid])
        tokenlengths[pmcid] = len(foo)

    with open(tokenlength_file, 'wb') as f:
        pickle.dump(tokenlengths, f)
else:
    with open(tokenlength_file, 'rb') as f:
        tokenlengths = pickle.load(f)
# %%

In [6]:
import numpy as np
# this was .395 with previous version
np.sum(np.array(list(tokenlengths.values())) < 7500)



52309

## LLM embeddings

As a first pass, we will get a few hundred document embeddings from openai, and compare them to the embeddings obtained using fasttext (learning directly on the full corpus)

In [28]:
api_key = os.getenv("OPENAI")
client = OpenAI(api_key=api_key)

openai_embeddings_file = Path('/Users/poldrack/Dropbox/data/ontology-learner/data/openai_embeddings.npy')
sample_docs_file = Path('/Users/poldrack/Dropbox/data/ontology-learner/data/sample_docs.json')

if not openai_embeddings_file.exists():
    max_tokens = 7000
    n_samples = 250
    # get documents that are less than max_tokens

    docs = {pmcid: fulltext[pmcid] for pmcid in fulltext if tokenlengths[pmcid] < max_tokens}
    # get a random sample of 500 documents, indexex by pmcid

    sample_pmcids = random.sample(list(docs.keys()), 250)
    sample_docs = {pmcid: docs[pmcid] for pmcid in sample_pmcids}

    # submit each document to openai for embedding

    embeddings = {}
    for pmcid, doc in tqdm(sample_docs.items()):
        response = client.embeddings.create(
            input=doc,
            model="text-embedding-3-large"
        )
        embeddings[pmcid] = response.data[0].embedding

    # convert to numpy array, preserving pmcid order
    pmcids = list(embeddings.keys())
    embeddings_array = np.array([embeddings[pmcid] for pmcid in pmcids])

    np.save(openai_embeddings_file, embeddings_array)
    with open(sample_docs_file, 'w') as f:
        json.dump(sample_docs, f)
else:
    print(f'loading openai embeddings from {openai_embeddings_file}')
    embeddings_array = np.load(openai_embeddings_file)
    with open(sample_docs_file, 'r') as f:
        sample_docs = json.load(f)

# %%


loading openai embeddings from /Users/poldrack/Dropbox/data/ontology-learner/data/openai_embeddings.npy


In [29]:
# compute embeddings on full sample using fasttext unsupervised

# first create text file with one document per line for the entire fulltext corpus
fulltext_file = Path('/Users/poldrack/Dropbox/data/ontology-learner/data/fulltext.txt') 
if not fulltext_file.exists():
    with open(fulltext_file, 'w') as f:
        for doc in fulltext.values():
            f.write(f'{doc}\n')

model_file = fulltext_file.with_suffix('.bin')
if not model_file.exists():
    # Skipgram model :
    model = fasttext.train_unsupervised(fulltext_file.as_posix(), dim=300)
    model.save_model(model_file.as_posix())
else:
    model = fasttext.load_model(model_file.as_posix())



In [31]:
embeddings_fasttext = {}
for pmcid, doc in tqdm(sample_docs.items()):
    response = model.get_sentence_vector(doc.replace('\n', ' '))
    embeddings_fasttext[pmcid] = response

# convert to numpy array, preserving pmcid order
pmcids = list(embeddings_fasttext.keys())
embeddings_fasttext_array = np.array([embeddings_fasttext[pmcid] for pmcid in pmcids])

np.save('/Users/poldrack/Dropbox/data/ontology-learner/data/fasttext_embeddings.npy', embeddings_fasttext_array)

100%|██████████| 250/250 [00:01<00:00, 147.30it/s]


Also try using sentence_transformers

In [22]:
from sentence_transformers import SentenceTransformer

# Initialize the sentence transformer model
model = SentenceTransformer('all-mpnet-base-v2')  # Using a good general-purpose model
embeddings_st = {}
for pmcid, doc in tqdm(sample_docs.items()):
    # Get embeddings directly from the model
    embedding = model.encode(doc)
    embeddings_st[pmcid] = embedding

# convert to numpy array, preserving pmcid order
pmcids = list(embeddings_st.keys())
embeddings_array_st = np.array([embeddings_st[pmcid] for pmcid in pmcids])


100%|██████████| 250/250 [00:06<00:00, 40.36it/s]


get representaitonal similarty for each

In [35]:
cc_openai = np.corrcoef(embeddings_array)
cc_fasttext = np.corrcoef(embeddings_fasttext_array)
cc_st = np.corrcoef(embeddings_array_st)

# %%
assert cc_openai.shape == (250, 250)
assert cc_fasttext.shape == (250, 250)
assert cc_st.shape == (250, 250)
# %%


In [37]:
# get upper triangle of each matrix
cc_openai_upper = np.triu(cc_openai)
cc_fasttext_upper = np.triu(cc_fasttext)
cc_st_upper = np.triu(cc_st)
# %%

rsa_openai_fasttext = np.corrcoef(cc_openai_upper.flatten(), cc_fasttext_upper.flatten())
rsa_openai_st = np.corrcoef(cc_openai_upper.flatten(), cc_st_upper.flatten())
rsa_fasttext_st = np.corrcoef(cc_fasttext_upper.flatten(), cc_st_upper.flatten())

print(f'RSA between openai and fasttext: {rsa_openai_fasttext[0, 1]:.3f}')
print(f'RSA between openai and st: {rsa_openai_st[0, 1]:.3f}')
print(f'RSA between fasttext and st: {rsa_fasttext_st[0, 1]:.3f}')
# %%


RSA between openai and fasttext: 0.964
RSA between openai and st: 0.825
RSA between fasttext and st: 0.750


Compute fasttext embeddings for all of the papers.

In [38]:
embeddings_fasttext_fulltext = {}
for pmcid, doc in tqdm(fulltext.items()):
    response = model.get_sentence_vector(doc.replace('\n', ' '))
    embeddings_fasttext_fulltext[pmcid] = response

# convert to numpy array, preserving pmcid order
pmcids = list(embeddings_fasttext_fulltext.keys())
embeddings_fasttext_fulltext_array = np.array([embeddings_fasttext_fulltext[pmcid] for pmcid in pmcids])

np.save('/Users/poldrack/Dropbox/data/ontology-learner/data/fasttext_embeddings_fulltext.npy', embeddings_fasttext_fulltext_array)

100%|██████████| 132257/132257 [24:54<00:00, 88.47it/s] 


In [41]:
with open('/Users/poldrack/Dropbox/data/ontology-learner/data/pmcids_fulltext.pkl', 'wb') as f:
    pickle.dump(pmcids, f)
# %%
