For each task identified in learn_task_ontology.ipynb, run a PubMed search and retrieve up to 50 abstracts. Then fit a fasttext embedding model to the data using supervised learning.

In [2]:
# autoreload
%load_ext autoreload
%autoreload 2

from pubmedutils.pubmed import (
    get_pubmed_data, 
    parse_pubmed_record,
    parse_pubmed_pubs
)
import json
from pathlib import Path
from dotenv import load_dotenv
import os
import time
import tqdm
import fasttext

# Pull environment variables (including DATADIR) from a local .env file, if any.
load_dotenv()

# DATADIR is the root directory under which all inputs/outputs of this notebook live.
# Fail with an actionable message instead of the opaque TypeError that
# Path(None) would raise when the variable is missing.
_datadir_env = os.getenv('DATADIR')
if _datadir_env is None:
    raise RuntimeError(
        "DATADIR environment variable is not set; "
        "define it in the environment or in a .env file"
    )

datadir = Path(_datadir_env)
print(datadir)



/Users/poldrack/Dropbox/data/ontology-learner/data


Load task entries

In [3]:
# Read the task entries from their JSON file under the data directory.
task_entries_path = datadir / 'gpt4/task_entries.json'
with task_entries_path.open('r') as entries_fh:
    task_entries = json.loads(entries_fh.read())



In [4]:

# On-disk cache of the PubMed search results, keyed by task key.
results_file = datadir / 'gpt4/task_results.json'

# Error messages for failed queries, keyed by task key (stays empty when the cache is used).
errors = {}

if results_file.exists():
    with open(results_file, 'r') as f:
        task_results = json.load(f)
    print(f"Loaded {len(task_results)} task results from {results_file}")
else:
    print(f"No task results found at {results_file}, retrieving new data")
    task_results = {}

    for k, v in tqdm.tqdm(task_entries.items()):
        if k in task_results:
            continue
        try:
            # Query PubMed by task name, requesting at most 50 records.
            d = get_pubmed_data(query=v['name'], email='poldrack@stanford.edu', retmax=50)
            task_results[k] = parse_pubmed_pubs(d)
        except Exception as e:
            # Best effort: record the failure and keep an empty entry so the
            # remaining tasks are still processed.
            errors[k] = str(e)
            task_results[k] = []
        # sleep for 100 ms between requests
        # NOTE(review): NCBI limits clients without an API key to 3 requests/s;
        # confirm get_pubmed_data supplies a key or lengthen this pause.
        time.sleep(0.1)

    if errors:
        print(f"{len(errors)} of {len(task_results)} queries failed; see `errors` for details")

    # Persist the results so the cache check above succeeds on the next run.
    # Serialize before touching the file so a serialization failure cannot
    # leave a truncated cache behind.
    # Assumes parse_pubmed_pubs returns JSON-serializable data -- TODO confirm.
    serialized_results = json.dumps(task_results)
    results_file.write_text(serialized_results)



Loaded 8432 task results from /Users/poldrack/Dropbox/data/ontology-learner/data/gpt4/task_results.json


In [5]:
# Keys of the tasks whose stored result collection is empty.
empty_results = [task_key for task_key, pubs in task_results.items() if len(pubs) == 0]
len(empty_results)

657

### Text embedding model fitting

We want to fit an embedding using fasttext based on the retrieved abstracts.  

First let's try using supervised learning, where the labels are the task keys.  

The first thing we need to do is to save the text out to a file with the labels.

In [6]:
train_test_dict = {}

# Per task, the first n_train_per_task publications go to the training file
# and any remaining ones go to the test file.
n_train_per_task = 40

train_path = datadir / 'gpt4/task_abstracts_train.txt'
test_path = datadir / 'gpt4/task_abstracts_test.txt'

# Context managers guarantee both files are closed even if a record is malformed.
# fastText expects UTF-8 input, so set the encoding explicitly.
with open(train_path, 'w', encoding='utf-8') as trainfile, \
        open(test_path, 'w', encoding='utf-8') as testfile:
    for k, v in task_results.items():
        # Tasks with no retrieved publications contribute no examples.
        if len(v) == 0:
            continue
        label = '__label__' + k
        for ctr, pub in enumerate(v.values()):
            # fastText reads one example per line: collapse embedded newlines and
            # other whitespace runs so a title/abstract cannot spill onto an
            # unlabeled extra line.
            text = ' '.join(f"{pub['title']} {pub['Abstract']}".split())
            outfile = trainfile if ctr < n_train_per_task else testfile
            outfile.write(f"{label} {text}\n")


In [7]:

# Embedding dimensionality and training mode; both are baked into the model filename.
ndims = 100
# NOTE(review): the notebook text describes supervised training, but this
# setting selects the unsupervised branch below -- confirm which is intended.
modeltype = 'unsupervised'
infile = (datadir / 'gpt4/task_abstracts_train.txt').as_posix()
modelfile = datadir / f'gpt4/task_abstracts_model_{ndims}dims_{modeltype}.bin'

if modelfile.exists():
    # A model for this configuration was already saved: reuse it.
    print(f"Loading model from {modelfile}")
    model = fasttext.load_model(modelfile.as_posix())
else:
    if modeltype != 'unsupervised':
        # Any other value trains a supervised model, which is then quantized
        # (quantize is only called on this branch).
        model = fasttext.train_supervised(input=infile, dim=ndims)
        model.quantize(input=infile, retrain=True)
    else:
        model = fasttext.train_unsupervised(input=infile, dim=ndims)

    # Save the freshly trained model so later runs take the load branch.
    model.save_model(modelfile.as_posix())




Read 72M words
Number of words:  296606
Number of labels: 7775
Progress: 100.0% words/sec/thread:   97836 lr:  0.000000 avg.loss:  0.851920 ETA:   0h 0m 0s


ValueError: For now we only support quantization of supervised models

### Get task embeddings