# GPT-4 Experiments 

In [1]:
from pprint import pprint
from typing import Dict, List
import numpy as np
from tqdm import tqdm
from dataclasses import dataclass
from collections import Counter
from dataclasses import asdict
import json

from nxontology_ml.data import read_training_data, get_efo_otar_slim
from nxontology_ml.gpt_tagger import TaskConfig, GptTagger
from nxontology_ml.utils import ROOT_DIR
from experimentation.model_utils import mean_absolute_error, one_h_enc

In [2]:
##
# Common code across experiments

# Number of EFO nodes sampled for every experiment in this notebook
N_SAMPLES = 500

# GPT "precision" answer -> class index:
#   high   -> 0 ("01-disease-subtype")
#   medium -> 1 ("02-disease-root")
#   low    -> 2 ("03-disease-area")
CLASS_MAP: Dict[str, int] = dict(high=0, medium=1, low=2)

# dists[true_label][gpt_precision]: how far a single GPT answer is from the
# true class (0 = exact match, 2 = opposite end of the scale)
dists: Dict[str, Dict[str, int]] = {
    "01-disease-subtype": dict(high=0, medium=1, low=2),
    "02-disease-root": dict(high=1, medium=0, low=1),
    "03-disease-area": dict(high=2, medium=1, low=0),
}


@dataclass
class NodeDist:
    efo_id: str
    true_label: str
    precisions: List[str]
    efo_label: str
    efo_definition: str | None
    dist: int

    def __lt__(self, other):
        return self.dist < other.dist


# Sample N_SAMPLES disease nodes: X holds the node identifiers handed to
# `nxo.node_info`, y holds their true labels
X, y = read_training_data(take=N_SAMPLES, filter_out_non_disease=True)
nxo = get_efo_otar_slim()
nodes = list(map(nxo.node_info, X))

## 1. Experiment: `n=3` completions

In [3]:
# Number of completions requested per node
CHOICES = 3

# Labelling task: v1 "precision" prompt, GPT-4
config = TaskConfig(
    name="precision",
    prompt_path=ROOT_DIR / "prompts/precision_v1.txt",
    openai_model_name="gpt-4",
    node_attributes=["efo_id", "efo_label", "efo_definition"],
    model_n=CHOICES,
    prompt_token_ratio=0.5,
    allowed_labels=frozenset({"low", "medium", "high"}),
)

# Collect every labelled node produced by the tagger, showing a progress bar
tagger = GptTagger.from_config(config)
labeled_nodes = list(
    tqdm(
        tagger.fetch_labels(nodes),
        total=len(X),
        desc="Fetching node tags using GPT-4",
        ncols=100,
    )
)

# Tagger counters
print("\nTagger metrics:")
pprint(tagger.get_metrics())

Fetching node tags using GPT-4: 100%|███████████████████████████| 500/500 [00:00<00:00, 4355.06it/s]


Tagger metrics:
Counter({'Cache/get': 500, 'Cache/hits': 500})





### 1.1 MAE scores

In [4]:
##
# Map labeled nodes into probabilities

# One vector per node: each of the CHOICES completions adds 1/CHOICES of
# probability mass to the class it voted for.
y_probas: List[np.ndarray] = []
for node, ln in zip(nodes, labeled_nodes):
    # Labels must come back in the same order as the nodes that were sent
    assert node.identifier == ln.node_efo_id, f"{node.identifier} != {ln.node_efo_id}"
    w = np.zeros(len(CLASS_MAP), dtype=np.float32)
    for label in ln.labels:
        try:
            w[CLASS_MAP[label.lower()]] += 1.0 / CHOICES
        except KeyError:
            # Show the offending label, then re-raise the original error untouched
            print(f"Wrong key: {label} (for node: {node.identifier=}; {ln.labels=})")
            raise
    y_probas.append(w)

# Biased MAE
y_true = np.array([one_h_enc[true_label] for true_label in y])
print(
    f"(Label counts as probas) BiasedMAE: {mean_absolute_error(y_true, np.array(y_probas)):.3f}"
)

(Label counts as probas) BiasedMAE: 0.267


In [5]:
##
# Find most common label for each node

# One-hot vector per node, set on the class that received the most votes.
# NOTE: this overwrites `y_probas`; the classification report further down
# reads this majority-vote version.
y_probas: List[np.ndarray] = []
for node, ln in zip(nodes, labeled_nodes):
    # Labels must come back in the same order as the nodes that were sent
    assert node.identifier == ln.node_efo_id, f"{node.identifier} != {ln.node_efo_id}"
    w = np.zeros(len(CLASS_MAP), dtype=np.float32)
    # Lower-case *before* counting so that e.g. "Low" and "low" are one vote bucket
    votes = Counter(label.lower() for label in ln.labels)
    majority_label = votes.most_common(1)[0][0]
    w[CLASS_MAP[majority_label]] = 1.0
    y_probas.append(w)

# Biased MAE
y_true = np.array([one_h_enc[true_label] for true_label in y])
print(
    f"(Most common label) BiasedMAE: {mean_absolute_error(y_true, np.array(y_probas)):.3f}"
)

(Most common label) BiasedMAE: 0.261


### 1.2 Labels analysis

In [6]:
##
# Metrics on tags

# Tally every label stored in the tagger cache.
# NOTE(review): this reads private attributes of the tagger and assumes that
# iterating `_storage` yields (key, json-encoded label list) pairs - confirm.
labels_cnt = Counter()
for _key, val in tagger._cache._storage:
    labels = json.loads(val)
    # Every cached entry should hold one label per requested completion
    assert len(labels) == CHOICES
    labels_cnt.update(labels)

# Divide by the number of labels actually counted, so the proportions always
# sum to 1 even if the cache does not hold exactly N_SAMPLES entries
n_gpt_labels = sum(labels_cnt.values())
print(
    f"GPT-4 labels proportion:\n\t{', '.join(f'{k}={v/n_gpt_labels:.2f}' for k, v in labels_cnt.most_common())}"
)
true_cnt = Counter(y)
n_true_labels = sum(true_cnt.values())
print(
    f"Samples (true) labels proportion:\n\t{', '.join(f'{k}={v/n_true_labels:.2f}' for k, v in true_cnt.most_common())}"
)

GPT-4 labels proportion:
	high=0.47, medium=0.36, low=0.17
Samples (true) labels proportion:
	01-disease-subtype=0.51, 02-disease-root=0.38, 03-disease-area=0.11


In [7]:
##
# Classification metrics

from sklearn.metrics import classification_report

# Both arguments are per-node class vectors; `y_probas` is whatever the most
# recently executed cell stored under that name
print(
    classification_report(
        y_true=y_true,
        y_pred=np.array(y_probas),
    )
)

              precision    recall  f1-score   support

           0       0.65      0.61      0.63       256
           1       0.44      0.41      0.42       191
           2       0.44      0.68      0.53        53

   micro avg       0.54      0.54      0.54       500
   macro avg       0.51      0.57      0.53       500
weighted avg       0.55      0.54      0.54       500
 samples avg       0.54      0.54      0.54       500



### 1.3 Misclassified samples

In [8]:
# For every node, sum the distance of each GPT label from the true label
nodes_dist = [
    NodeDist(
        efo_id=node.identifier,
        true_label=true_y,
        precisions=ln.labels,
        efo_label=node.data["efo_label"],
        efo_definition=node.data["efo_definition"],
        dist=sum(dists[true_y][pred.lower()] for pred in ln.labels),
    )
    for node, ln, true_y in zip(nodes, labeled_nodes, y)
]

# Histogram: distance -> number of nodes, smallest distance first
print("Distance distribution:")
pprint({d: cnt for d, cnt in sorted(Counter(nd.dist for nd in nodes_dist).items())})

Distance distribution:
{0: 175, 1: 90, 2: 91, 3: 104, 4: 24, 5: 9, 6: 7}


In [9]:
N = 5
print(f"Top {N} nodes ordered by decresing distance from true label:")
# Sort worst-first (NodeDist orders by `dist`), keep N, then convert to dicts
pprint(list(map(asdict, sorted(nodes_dist, reverse=True)[:N])))
print("For a list of 25 nodes, see gist below")

Top 5 nodes ordered by decresing distance from true label:
[{'dist': 6,
  'efo_definition': 'An inflammation of both larynx and trachea.',
  'efo_id': 'MONDO:0000263',
  'efo_label': 'laryngotracheitis',
  'precisions': ['low', 'low', 'low'],
  'true_label': '01-disease-subtype'},
 {'dist': 6,
  'efo_definition': 'A viral infectious disease that results_in infection in '
                    'sheep and rarely humans, has_material_basis_in Louping '
                    'ill virus, which is transmitted_by sheep tick, Ixodes '
                    'ricinus. The infection has_symptom lethargy, has_symptom '
                    'muscle pains, has_symptom fever, and has_symptom focal '
                    'neurological signs.',
  'efo_id': 'EFO:0007348',
  'efo_label': 'louping ill',
  'precisions': ['low', 'low', 'low'],
  'true_label': '01-disease-subtype'},
 {'dist': 6,
  'efo_definition': 'An abnormally high level of uric acid.',
  'efo_id': 'EFO:0009104',
  'efo_label': 'hyperuricemia',
 

Notes:
* **⇒** The list of "Top 25 nodes tagged by GPT-4 ordered by decreasing distance from true label" can be found in [**this gist**](https://gist.github.com/yonromai/f598c1ab39f4c7d42553212231a515f8)
* Most of the nodes with a high distance from their true label are of class `01-disease-subtype`
* Lots of misclassified nodes are missing `efo_definition`

## 2. Experiment: `n=4` completions

In [10]:
# Number of completions requested per node in this experiment
CHOICES_4 = 4

# Same v1 "precision" task as before, only the number of completions changes
config = TaskConfig(
    name="precision",
    prompt_path=ROOT_DIR / "prompts/precision_v1.txt",
    openai_model_name="gpt-4",
    node_attributes=["efo_id", "efo_label", "efo_definition"],
    model_n=CHOICES_4,
    prompt_token_ratio=0.5,
    allowed_labels=frozenset({"low", "medium", "high"}),
)

# Collect every labelled node produced by the tagger, showing a progress bar
tagger = GptTagger.from_config(config)
labeled_nodes = list(
    tqdm(
        tagger.fetch_labels(nodes),
        total=len(X),
        desc="Fetching node tags using GPT-4",
        ncols=100,
    )
)

# Tagger counters
print("\nTagger metrics:")
pprint(tagger.get_metrics())

Fetching node tags using GPT-4: 100%|███████████████████████████| 500/500 [00:00<00:00, 4591.45it/s]


Tagger metrics:
Counter({'Cache/get': 500, 'Cache/hits': 500})





Notes:
* Sanity check: this request cost ~$2.5 (pre-caching)
* TODO: Persist the tagger metrics on disk across invocations

### 2.1 Request / Response samples
For debugging purposes, I peeked at the payloads to and from the GPT API; here are a couple of samples:
* Sample 1:
    * Request:
        * [Json payload](https://gist.github.com/yonromai/b3f9475425a84f7d500457c4f49ff474)
        * [Prompt](https://gist.github.com/yonromai/65202b1fb90b3a18f4fd2fb981692121)  (parsed from json request)
    * Response:
        * [Json payload](https://gist.github.com/yonromai/6bec0c6744e840b57b7bda214e88c6e1)
        * Completions: [#0](https://gist.github.com/yonromai/055b1d1b18848d4ecfbbe27136253390), [#1](https://gist.github.com/yonromai/9c473e4113c017158905f25fbc474a36), [#2](https://gist.github.com/yonromai/1019910cb1d938c17848035b87633e2f) & [#3](https://gist.github.com/yonromai/f4606678efbb504c1636cd5f69eb3ab6) (parsed from json response)
* Sample 2:
    * Request:
        * [Json payload](https://gist.github.com/yonromai/e4a5b09c65918fc9ad44011c65eb567a)
        * [Prompt](https://gist.github.com/yonromai/6e997d203a8e213df011d48c54349907)  (parsed from json request)
    * Response:
        * [Json payload](https://gist.github.com/yonromai/306022a79b8c791bd02bbcab9068d6dd)
        * Completions: [#0](https://gist.github.com/yonromai/1249b944273b4aec351e3e7c828c9441), [#1](https://gist.github.com/yonromai/decf5e3f1d0d6d94e1f76016f05f7d4f), [#2](https://gist.github.com/yonromai/56027326e01856fdfa6117ab82dfd9c3) & [#3](https://gist.github.com/yonromai/fce39cdb3b0f5b4cb4636b554ae7baab) (parsed from json response)

### 2.2 MAE scores

In [11]:
##
# Map labeled nodes into probabilities

# One vector per node: each of the CHOICES_4 completions adds 1/CHOICES_4 of
# probability mass to the class it voted for.
y_probas: List[np.ndarray] = []
for node, ln in zip(nodes, labeled_nodes):
    # Labels must come back in the same order as the nodes that were sent
    assert node.identifier == ln.node_efo_id, f"{node.identifier} != {ln.node_efo_id}"
    w = np.zeros(len(CLASS_MAP), dtype=np.float32)
    for label in ln.labels:
        try:
            w[CLASS_MAP[label.lower()]] += 1.0 / CHOICES_4
        except KeyError:
            # Show the offending label, then re-raise the original error untouched
            print(f"Wrong key: {label} (for node: {node.identifier=}; {ln.labels=})")
            raise
    y_probas.append(w)

# Biased MAE
y_true = np.array([one_h_enc[true_label] for true_label in y])
print(
    f"(Label counts as probas) BiasedMAE: {mean_absolute_error(y_true, np.array(y_probas)):.3f}"
)

(Label counts as probas) BiasedMAE: 0.261


### 2.3 Misclassified samples

In [12]:
# Distance of each node's GPT labels from its true label (summed over completions)
nodes_dist = [
    NodeDist(
        efo_id=node.identifier,
        true_label=true_y,
        precisions=ln.labels,
        efo_label=node.data["efo_label"],
        efo_definition=node.data["efo_definition"],
        dist=sum(dists[true_y][pred.lower()] for pred in ln.labels),
    )
    for node, ln, true_y in zip(nodes, labeled_nodes, y)
]

# Histogram: distance -> number of nodes, smallest distance first
print("Distance distribution:")
pprint({d: cnt for d, cnt in sorted(Counter(nd.dist for nd in nodes_dist).items())})

Distance distribution:
{0: 150, 1: 93, 2: 67, 3: 60, 4: 97, 5: 13, 6: 12, 7: 4, 8: 4}


In [13]:
N = 5
print(f"Top {N} nodes ordered by decresing distance from true label:")
# Sort worst-first (NodeDist orders by `dist`), keep N, then convert to dicts
pprint([asdict(nd) for nd in sorted(nodes_dist, reverse=True)[:N]])

Top 5 nodes ordered by decresing distance from true label:
[{'dist': 8,
  'efo_definition': 'Spasm of the large- or medium-sized coronary arteries.',
  'efo_id': 'EFO:0004225',
  'efo_label': 'Coronary Vasospasm',
  'precisions': ['low', 'low', 'low', 'low'],
  'true_label': '01-disease-subtype'},
 {'dist': 8,
  'efo_definition': 'An abnormally high level of uric acid.',
  'efo_id': 'EFO:0009104',
  'efo_label': 'hyperuricemia',
  'precisions': ['low', 'low', 'low', 'low'],
  'true_label': '01-disease-subtype'},
 {'dist': 8,
  'efo_definition': 'A dysentery that involves protozoan infection.',
  'efo_id': 'MONDO:0001955',
  'efo_label': 'protozoal dysentery',
  'precisions': ['low', 'low', 'low', 'low'],
  'true_label': '01-disease-subtype'},
 {'dist': 8,
  'efo_definition': 'A epilepsy syndrome that occurs during childhood.',
  'efo_id': 'MONDO:0020072',
  'efo_label': 'childhood-onset epilepsy syndrome',
  'precisions': ['low', 'low', 'low', 'low'],
  'true_label': '01-disease-subtyp

## 3. Experiment: `n=2` completions

In [14]:
# Number of completions requested per node in this experiment
CHOICES_2 = 2

# Same v1 "precision" task as before, only the number of completions changes
config = TaskConfig(
    name="precision",
    prompt_path=ROOT_DIR / "prompts/precision_v1.txt",
    openai_model_name="gpt-4",
    node_attributes=["efo_id", "efo_label", "efo_definition"],
    model_n=CHOICES_2,
    prompt_token_ratio=0.5,
    allowed_labels=frozenset({"low", "medium", "high"}),
)

# Collect every labelled node produced by the tagger, showing a progress bar
tagger = GptTagger.from_config(config)
labeled_nodes = list(
    tqdm(
        tagger.fetch_labels(nodes),
        total=len(X),
        desc="Fetching node tags using GPT-4",
        ncols=100,
    )
)

# Tagger counters
print("\nTagger metrics:")
pprint(tagger.get_metrics())

Fetching node tags using GPT-4: 100%|███████████████████████████| 500/500 [00:00<00:00, 3173.37it/s]


Tagger metrics:
Counter({'Cache/get': 500, 'Cache/hits': 500})





### 3.1 MAE scores

In [15]:
##
# Map labeled nodes into probabilities

# One vector per node: each of the CHOICES_2 completions adds 1/CHOICES_2 of
# probability mass to the class it voted for.
y_probas: List[np.ndarray] = []
for node, ln in zip(nodes, labeled_nodes):
    # Labels must come back in the same order as the nodes that were sent
    assert node.identifier == ln.node_efo_id, f"{node.identifier} != {ln.node_efo_id}"
    w = np.zeros(len(CLASS_MAP), dtype=np.float32)
    for label in ln.labels:
        try:
            w[CLASS_MAP[label.lower()]] += 1.0 / CHOICES_2
        except KeyError:
            # Show the offending label, then re-raise the original error untouched
            print(f"Wrong key: {label} (for node: {node.identifier=}; {ln.labels=})")
            raise
    y_probas.append(w)

# Biased MAE
y_true = np.array([one_h_enc[true_label] for true_label in y])
print(
    f"(Label counts as probas) BiasedMAE: {mean_absolute_error(y_true, np.array(y_probas)):.3f}"
)

(Label counts as probas) BiasedMAE: 0.270


## 4. Experiment: "Rav CoT prompt"

In [16]:
# Chain-of-thought variant of the precision task.
cot_config = TaskConfig(
    name="cot_precision",
    prompt_path=ROOT_DIR / "prompts/rav_cot_precision_v1.txt",
    openai_model_name="gpt-4",
    node_attributes=["efo_id", "efo_label", "efo_definition"],
    # Use CHOICES_2 instead of a literal 2: the MAE cell of this experiment
    # divides the vote counts by CHOICES_2, so both must stay in sync.
    model_n=CHOICES_2,
    prompt_token_ratio=0.5,
    # NOTE(review): presumably separates the model's reasoning from its final
    # label - confirm against GptTagger
    end_of_cot_marker="<END_OF_COT>",
    allowed_labels=frozenset({"low", "medium", "high"}),
)

# Collect every labelled node produced by the tagger, showing a progress bar
tagger = GptTagger.from_config(cot_config)
labeled_nodes = list(
    tqdm(
        tagger.fetch_labels(nodes),
        total=len(X),
        desc="Fetching node tags using GPT-4",
        ncols=100,
    )
)

# Inspect metrics
print("\nTagger metrics:")
pprint(tagger.get_metrics())

Fetching node tags using GPT-4: 100%|███████████████████████████| 500/500 [00:00<00:00, 4623.18it/s]


Tagger metrics:
Counter({'Cache/get': 500, 'Cache/hits': 500})





Example of [prompt](https://gist.github.com/yonromai/c637c1dabea0f66bae849acbb3a77053) and response choices ([#1](https://gist.github.com/yonromai/597dfce428c8fdd98350cc19abdbd79f), [#2](https://gist.github.com/yonromai/7ac91e6543df6d69ab9de264e481b75c))

### 4.1 MAE scores

In [17]:
##
# Map labeled nodes into probabilities

# One vector per node: each of the CHOICES_2 completions adds 1/CHOICES_2 of
# probability mass to the class it voted for.
y_probas: List[np.ndarray] = []
for node, ln in zip(nodes, labeled_nodes):
    # Labels must come back in the same order as the nodes that were sent
    assert node.identifier == ln.node_efo_id, f"{node.identifier} != {ln.node_efo_id}"
    w = np.zeros(len(CLASS_MAP), dtype=np.float32)
    for label in ln.labels:
        try:
            w[CLASS_MAP[label.lower()]] += 1.0 / CHOICES_2
        except KeyError:
            # Show the offending label, then re-raise the original error untouched
            print(f"Wrong key: {label} (for node: {node.identifier=}; {ln.labels=})")
            raise
    y_probas.append(w)

# Biased MAE
y_true = np.array([one_h_enc[true_label] for true_label in y])
print(
    f"(Label counts as probas) BiasedMAE: {mean_absolute_error(y_true, np.array(y_probas)):.3f}"
)

(Label counts as probas) BiasedMAE: 0.275


## 5. Experiment: "Precision prompt v2" (with extra few-shot examples)

In [18]:
# Number of completions requested per node (back to 3 for this experiment)
CHOICES = 3

# Labelling task: v2 "precision" prompt, GPT-4
config = TaskConfig(
    name="precision",
    prompt_version="v2",
    prompt_path=ROOT_DIR / "prompts/precision_v2.txt",
    openai_model_name="gpt-4",
    node_attributes=["efo_id", "efo_label", "efo_definition"],
    model_n=CHOICES,
    prompt_token_ratio=0.5,
    allowed_labels=frozenset({"low", "medium", "high"}),
)

# Collect every labelled node produced by the tagger, showing a progress bar
tagger = GptTagger.from_config(config)
labeled_nodes = list(
    tqdm(
        tagger.fetch_labels(nodes),
        total=len(X),
        desc="Fetching node tags using GPT-4",
        ncols=100,
    )
)

# Tagger counters
print("\nTagger metrics:")
pprint(tagger.get_metrics())

Fetching node tags using GPT-4: 100%|███████████████████████████| 500/500 [00:00<00:00, 4695.81it/s]


Tagger metrics:
Counter({'Cache/get': 500, 'Cache/hits': 500})





### 5.1 Request / Response samples
For debugging purposes, I peeked at the payloads to and from the GPT API; here are a couple of samples:
* Sample 1:
    * Request:
        * [Prompt](https://gist.github.com/yonromai/8799f47c3911e8f7ea51aefdfc5cf27d)  (parsed from json request)
    * Response:
        * Completions: [#0](https://gist.github.com/yonromai/b53c0c5c5c64ee0376d3564a1e1cadda), [#1](https://gist.github.com/yonromai/c923eb74b8a4a70ff66103b90d06f15c) & [#2](https://gist.github.com/yonromai/a007cdaa0fd6971f596567a0569224ee) (parsed from json response)
* Sample 2:
    * Request:
        * [Prompt](https://gist.github.com/yonromai/daa0bd3c8e9f81250a68c3f6b614598d)  (parsed from json request)
    * Response:
        * Completions: [#0](https://gist.github.com/yonromai/68e74c7a8032d9f59e427f6196652d91), [#1](https://gist.github.com/yonromai/f2b852e20f42bfea2da13b3e755267fe) & [#2](https://gist.github.com/yonromai/66ff9de3141cb4dcd35cd11cd06c936e) (parsed from json response)

### 5.2 MAE scores

In [19]:
##
# Map labeled nodes into probabilities

# One vector per node: each of the CHOICES completions adds 1/CHOICES of
# probability mass to the class it voted for.
y_probas: List[np.ndarray] = []
for node, ln in zip(nodes, labeled_nodes):
    # Labels must come back in the same order as the nodes that were sent
    assert node.identifier == ln.node_efo_id, f"{node.identifier} != {ln.node_efo_id}"
    w = np.zeros(len(CLASS_MAP), dtype=np.float32)
    for label in ln.labels:
        try:
            w[CLASS_MAP[label.lower()]] += 1.0 / CHOICES
        except KeyError:
            # Show the offending label, then re-raise the original error untouched
            print(f"Wrong key: {label} (for node: {node.identifier=}; {ln.labels=})")
            raise
    y_probas.append(w)

# Biased MAE
y_true = np.array([one_h_enc[true_label] for true_label in y])
print(
    f"(Label counts as probas) BiasedMAE: {mean_absolute_error(y_true, np.array(y_probas)):.3f}"
)

(Label counts as probas) BiasedMAE: 0.268


### 5.3 Misclassified samples

In [20]:
# Distance of each node's GPT labels from its true label (summed over completions)
nodes_dist = [
    NodeDist(
        efo_id=node.identifier,
        true_label=true_y,
        precisions=ln.labels,
        efo_label=node.data["efo_label"],
        efo_definition=node.data["efo_definition"],
        dist=sum(dists[true_y][pred.lower()] for pred in ln.labels),
    )
    for node, ln, true_y in zip(nodes, labeled_nodes, y)
]

# Histogram: distance -> number of nodes, smallest distance first
print("Distance distribution:")
pprint({d: cnt for d, cnt in sorted(Counter(nd.dist for nd in nodes_dist).items())})

Distance distribution:
{0: 174, 1: 90, 2: 93, 3: 113, 4: 16, 5: 6, 6: 8}


In [21]:
N = 10
print(f"Top {N} nodes ordered by decresing distance from true label:")
# Sort worst-first (NodeDist orders by `dist`), keep N, then convert to dicts
pprint([asdict(nd) for nd in sorted(nodes_dist, reverse=True)[:N]])

Top 10 nodes ordered by decresing distance from true label:
[{'dist': 6,
  'efo_definition': 'Long-standing obesity without metbolic abnormalities or '
                    'obesity-related comorbidities such as type 2 diabetes or '
                    'heart disease',
  'efo_id': 'EFO:0009382',
  'efo_label': 'metabolically healthy obesity',
  'precisions': ['low', 'low', 'low'],
  'true_label': '01-disease-subtype'},
 {'dist': 6,
  'efo_definition': 'An overwhelming, irrational, and persistent fear of being '
                    'diagnosed with cancer.',
  'efo_id': 'EFO:1001879',
  'efo_label': 'cancerophobia',
  'precisions': ['low', 'low', 'low'],
  'true_label': '01-disease-subtype'},
 {'dist': 6,
  'efo_definition': 'A viral infectious disease that results_in infection in '
                    'sheep and rarely humans, has_material_basis_in Louping '
                    'ill virus, which is transmitted_by sheep tick, Ixodes '
                    'ricinus. The infection has_sympto