In [1]:
import pyobo
import pandas as pd
from IPython.display import Markdown
import networkx as nx
from tqdm.auto import tqdm
from collections import defaultdict
import bioontologies

In [2]:
obograph = bioontologies.get_obograph_by_prefix("vo").guess("vo").standardize()

standardizing nodes:   0%|          | 0.00/8.40k [00:00<?, ?it/s]

standardizing edges:   0%|          | 0.00/20.0k [00:00<?, ?it/s]

In [3]:
names = obograph.get_curie_to_name()

In [4]:
graph = nx.MultiDiGraph()
for edge in tqdm(obograph.edges, unit_scale=True):
    if edge.subject and edge.predicate and edge.object:
        graph.add_edge(edge.subject.curie, edge.object.curie, key=edge.predicate.curie)
for node in graph:
    if node in names:
        graph.nodes[node]['name'] = names[node]

  0%|          | 0.00/20.0k [00:00<?, ?it/s]

In [5]:
list(graph.nodes())[:5]

['bfo:0000001', 'owl:Thing', 'bfo:0000002', 'bfo:0000003', 'bfo:0000004']

In [6]:
# subset unannotated to only viral vaccines
viral_subset = nx.ancestors(graph, "vo:0000609") # all viral vaccines
covid_subset = nx.ancestors(graph, "vo:0004908")
ebola_subset = nx.ancestors(graph, "vo:0000726")
burk_subset = nx.ancestors(graph, "vo:0012175")
phase_1_subset = covid_subset | ebola_subset | burk_subset

In [7]:
VACCINE_CURIE = "vo:0000001"
COVID_VACCINE_CURIE = "vo:0004908"
EXAMPLE_CURIE = "vo:0005336"

In [27]:
manually_annotated_df = pd.read_csv("unannotated_phase_1.tsv", sep='\t')

idx = manually_annotated_df["platform_curie"].notna()
manually_annotated_vaccine_curies = dict(
    manually_annotated_df[idx][["vaccine_curie", "platform_curie"]].values
)
manually_annotated_df

Unnamed: 0,platform_curie,platform_name,vaccine_curie,vaccine_name,vaccine_link
0,,,vo:0000779,NP-VRP,https://bioregistry.io/vo:0000779
1,,,vo:0000780,GP-VRP,https://bioregistry.io/vo:0000780
2,,,vo:0000781,VRP expressing VP24,https://bioregistry.io/vo:0000781
3,,,vo:0000782,VRP expressing VP30,https://bioregistry.io/vo:0000782
4,,,vo:0000783,VRP expressing VP35,https://bioregistry.io/vo:0000783
...,...,...,...,...,...
80,,,vo:0011387,NP-VRP,https://bioregistry.io/vo:0011387
81,,,vo:0011436,B. pseudomallei Subunit LolC Protein Vaccine,https://bioregistry.io/vo:0011436
82,,,vo:0011438,B. pseudomallei Subunit PotF Protein Vaccine,https://bioregistry.io/vo:0011438
83,vo:0000032,DNA Vaccine,vo:0011465,Ebola virus DNA vaccine encoding ZEBOV GP and ...,https://bioregistry.io/vo:0011465


In [24]:
needs_annotation_targeted_df = pd.read_csv(
    "/Users/cthoyt/dev/kestrel/src/kestrel/curation/vaccines/phase1_vaccine_ner_freq.csv",
    names=["vaccine_curie", "vaccine_name", "count", "platform_curie"],
    header=None,
)
needs_annotation_targeted_df = needs_annotation_targeted_df[needs_annotation_targeted_df["platform_curie"].isna()]
needs_annotation_targeted_df

Unnamed: 0,vaccine_curie,vaccine_name,count,platform_curie
0,vo:0004908,COVID-19 Vaccines,1070,
1,mesh:D000090982,Abdavomeran,488,
2,mesh:C000711953,mRNA Vaccine,365,
3,vo:0000726,Ebola virus vaccine,312,
6,mesh:C000722934,mRNA COVID-19 Vaccine,139,
...,...,...,...,...
136,vo:0001178,live attenuated influenza vaccine,1,
137,vo:0003134,YF-17D vaccine,1,
138,vo:0000768,Salmonella vaccine,2,
139,ncit:C28310,Dendritic Cell Vaccine,1,


## Count Vaccines in VO

The Vaccine Ontology (VO) has a base term for vaccines (`vo:0000001`). Terms appearing under it in the parent/child hierarchy are either vaccine platforms, more detailed classifications, or actual vaccines themselves. It doesn't appear there is a formal pattern for annotating metaclasses/classes, so we assume that leaf nodes (i.e., those with no children) are the most likely to be vaccines (with the possibility of false positives).

In [8]:
all_vaccines = nx.ancestors(graph, VACCINE_CURIE)
leaf_vaccines = {
    curie
    for curie in all_vaccines
    if not list(graph.predecessors(curie))
}

print(
    f"There are {len(all_vaccines):,} vaccines in VO v{obograph.version}. {len(leaf_vaccines):,} of these "
    "are leaves, meaning they have no children. These are likely to be actual vaccines and not classifications"
)

There are 5,092 vaccines in VO v1.1.245. 3,719 of these are leaves, meaning they have no children. These are likely to be actual vaccines and not classifications


## Retrieve Vaccine Platforms from VO

The Vaccine Ontology (VO) has a base term for vaccines (`vo:0000001`). It doesn't use a multihierarchy, so there's a combination of different classification types that appear as children under this term. In the cell below, I manually curated out terms related to categorization by target (e.g., like viral vaccine (`vo:0000609`)) and other assorted categorizations. 

I filtered out putative vaccine platforms to which there were one or no vaccines annotated. This included:
- `vo:0000370` live virulent vaccine
- `vo:0000598` whole organism vaccine
- `vo:0000641` combination vaccine

In [9]:
skip = {
    "vo:0003444", # vaccine foreign to USA
    "vo:0000363", # licensed vaccine
    "vo:0000755", # edible vaccine
    "vo:0001218", # veterinary vaccine
    "vo:0000400", # prime-boost vaccine
    "vo:0000455", # prophylactic vaccine (empty)
    "vo:0000550", # therapeutic vaccine (empty)
    "vo:0000754", # marker vaccine (empty)
    "vo:0001395", # cocktail vaccine
    "vo:0000556", # multivalent vaccine
    
}
wrong_hierarchy = {
    "vo:0000002", # vaccination
    "vo:0000494", # vaccine immunization
    "vo:0000288", # vaccine-induced host response
    "vo:0000596", # vaccine clinical trial
    "vo:0000590", # vaccine preparation
}
vaccine_by_target = {
    "vo:0000165", # Bacterial
    "vo:0000177", # cancer
    "vo:0000267", # fungal
    "vo:0000609", # viral
    "vo:0000561", # toxoid
    "vo:0000372", # profertility
    "vo:0000417", # protozoan
    "vo:0000759", # contraceptive
    "vo:0002813", # non-infectious disease
}
rows = [
    (
        d, 
        pyobo.get_name_by_curie(d),
        n_descendants
    )
    for d in graph.predecessors("vo:0000001")
    if (
        d.startswith("vo:") 
        and d not in skip 
        and d not in vaccine_by_target 
        and d not in wrong_hierarchy
        and (n_descendants := len(list(nx.ancestors(graph, d)))) > 1
    )
]

platforms_df = pd.DataFrame(rows, columns=["curie", "label", "number_descendants"]).sort_values("curie")
print(platforms_df.to_markdown(index=False))

| curie      | label                       |   number_descendants |
|:-----------|:----------------------------|---------------------:|
| vo:0000032 | DNA vaccine                 |                   18 |
| vo:0000186 | RNA vaccine                 |                   55 |
| vo:0000195 | conjugate vaccine           |                   40 |
| vo:0000315 | inactivated vaccine         |                  984 |
| vo:0000367 | live attenuated vaccine     |                 1009 |
| vo:0000484 | recombinant vector vaccine  |                   45 |
| vo:0000534 | subunit vaccine             |                  595 |
| vo:0004914 | virus-like particle vaccine |                    9 |
| vo:0005439 | passive vaccine             |                   16 |


In [10]:
# get the list of all descendants from these platforms that are themselves leaves.

def get_classifications(parents):
    rv = defaultdict(set)
    for parent in parents:
        parent_name = names[parent]
        for descendant in nx.ancestors(graph, parent):
            if descendant not in leaf_vaccines:
                continue
            rv[descendant].add(parent)
    return dict(rv)


vaccine_to_platform = get_classifications(platforms_df.curie)

# TODO come back to this
vaccine_to_target = get_classifications(vaccine_by_target)

In [11]:
precurations = {
    "vo:0000785": ("vo:0000032", "DNA Vaccine"),
}

def _guess_type(curie):
    name = names[curie]
    if "dna vaccine" in name.lower():
        return "vo:0000032", "DNA Vaccine"
    return None, None

# get unannotated vaccines
unannotated = [
    (*_guess_type(curie), curie, names[curie], f"https://bioregistry.io/{curie}")
    for curie in leaf_vaccines
    if curie not in vaccine_to_platform and not names[curie].endswith("unspecified formulation")
]
print(
    f"{len(unannotated):,}/{len(leaf_vaccines):,} ({len(unannotated)/len(leaf_vaccines):.1%}) "
    "leaf vaccines are not annotated to a platform"
)

unannotated_df = pd.DataFrame(
    unannotated, 
    columns=["platform_curie", "platform_name", "vaccine_curie", "vaccine_name", "vaccine_link"]
).sort_values('vaccine_curie')
# TODO, later when we are done with everything else
# unannotated_df.to_csv("unannotated.tsv", sep='\t', index=False)

unannotated_viral_df = unannotated_df[unannotated_df["vaccine_curie"].isin(phase_1_subset)]
unannotated_viral_df.to_csv("unannotated_phase_1.tsv", sep='\t', index=False)
unannotated_viral_df

2,363/3,719 (63.5%) leaf vaccines are not annotated to a platform


Unnamed: 0,platform_curie,platform_name,vaccine_curie,vaccine_name,vaccine_link
1493,,,vo:0000779,NP-VRP,https://bioregistry.io/vo:0000779
1126,,,vo:0000780,GP-VRP,https://bioregistry.io/vo:0000780
75,,,vo:0000781,VRP expressing VP24,https://bioregistry.io/vo:0000781
92,,,vo:0000782,VRP expressing VP30,https://bioregistry.io/vo:0000782
461,,,vo:0000783,VRP expressing VP35,https://bioregistry.io/vo:0000783
...,...,...,...,...,...
1117,,,vo:0011387,NP-VRP,https://bioregistry.io/vo:0011387
659,,,vo:0011436,B. pseudomallei Subunit LolC Protein Vaccine,https://bioregistry.io/vo:0011436
1573,,,vo:0011438,B. pseudomallei Subunit PotF Protein Vaccine,https://bioregistry.io/vo:0011438
682,vo:0000032,DNA Vaccine,vo:0011465,Ebola virus DNA vaccine encoding ZEBOV GP and ...,https://bioregistry.io/vo:0011465


In [12]:
single_platform_count = sum(len(platforms) == 1 for v, platforms in vaccine_to_platform.items() if v in phase_1_subset)
print(
    f"{single_platform_count:,}/{len(leaf_vaccines):,} ({single_platform_count/len(leaf_vaccines):.1%}) " 
    "vaccines are annotated to a single platform"
)

142/3,719 (3.8%) vaccines are annotated to a single platform


In [13]:
single_platform_df = pd.DataFrame(
    [
        (
            vaccine_curie, 
            names[vaccine_curie], 
            platform_curie := list(platform_curie_set)[0], 
            names[platform_curie],
        )
        for vaccine_curie, platform_curie_set in vaccine_to_platform.items()
    ],
    columns=["vaccine_curie", "vaccine_name", "platform_curie", "platform_name"]
)
single_platform_df.to_csv('existing_annotations.tsv', sep='\t', index=False)

It turns out that some vaccines are annotated to multiple platforms

In [14]:
rows = [
    (v, names[v], ", ".join(sorted(names[p] for p in platforms)))
    for v, platforms in vaccine_to_platform.items()
    if len(platforms) > 1
]
multiplatform_vaccines_df = pd.DataFrame(rows, columns=["curie", "name", "platforms"])

print(f"{len(multiplatform_vaccines_df)}/{len(leaf_vaccines):,} ({len(multiplatform_vaccines_df)/len(leaf_vaccines):.1%}) "
      "vaccines are annotated with multiple platforms. Here's a sample:")

Markdown(multiplatform_vaccines_df.sample(10).to_markdown())

189/3,719 (5.1%) vaccines are annotated with multiple platforms. Here's a sample:


|     | curie      | name                                                                                                                                                                                                                | platforms                                                     |
|----:|:-----------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------|
|  34 | vo:0003585 | influenza A virus A/California/7/2009 (H1N1) antigen 0.12 MG/ML / influenza A virus A/Hong Kong/4801/2014 (H3N2) antigen 0.12 MG/ML / influenza B virus B/Brisbane/60/2008 antigen 0.12 MG/ML [Fluzone 2016-2017]   | inactivated vaccine, live attenuated vaccine, subunit vaccine |
|  99 | vo:0001864 | Feline Rhinotracheitis-Calici-Panleukopenia-Chlamydia Psittaci Modified Live Virus, Killed Chlamydia Vaccine (USDA: 16E6.20)                                                                                        | inactivated vaccine, live attenuated vaccine                  |
|  43 | vo:0003234 | 0.5 ML Hepatitis B Surface Antigen Vaccine 0.04 MG/ML Injection                                                                                                                                                     | inactivated vaccine, subunit vaccine                          |
| 115 | vo:0003809 | FluMist Quadrivalent, 2015-2016 Formula Nasal Spray                                                                                                                                                                 | inactivated vaccine, live attenuated vaccine, subunit vaccine |
|  89 | vo:0003648 | Fluzone Quadrivalent 2017-2018 vaccine 0.1 ML Prefilled Syringe                                                                                                                                                     | inactivated vaccine, live attenuated vaccine, subunit vaccine |
|  59 | vo:0001978 | Bovine Rhinotracheitis-Virus Diarrhea-Parainfluenza 3-Respiratory Syncytial Virus Modified Live & Killed Virus Vaccine-Leptospira Canicola-Grippotyphosa-Hardjo-Icterohaemorrhagiae-Pomona Bacterin (USDA: 4469.20) | inactivated vaccine, live attenuated vaccine                  |
|  48 | vo:0003654 | Fluzone Quadrivalent 2018-2019 vaccine Injectable Suspension                                                                                                                                                        | inactivated vaccine, live attenuated vaccine, subunit vaccine |
| 127 | vo:0001977 | Bovine Rhinotracheitis-Virus Diarrhea-Parainfluenza 3-Respiratory Syncytial Virus Modified Live & Killed Virus Vaccine-Haemophilus somnus Bacterin (USDA: 44F9.20)                                                  | inactivated vaccine, live attenuated vaccine                  |
|  37 | vo:0003973 | Pentacel vaccine 0.5 ML Injection                                                                                                                                                                                   | inactivated vaccine, subunit vaccine                          |
| 154 | vo:0002230 | Canine Distemper-Adenovirus Type 2-Coronavirus-Parainfluenza-Parvovirus Modified Live & Killed Virus Vaccine-Leptospira Canicola-Icterohaemorrhagiae Bacterial Extract (USDA: 46J9.21)                              | inactivated vaccine, live attenuated vaccine                  |