# Graph generation comparison
In this notebook, we'll compare the outputs of a few different approaches to  graph building for characteristics like degree skew and entity grounding.

In [1]:
import jsonlines
import pandas as pd
from ontogpt.io.csv_wrapper import parse_yaml_predictions
from os import listdir
from os.path import splitext
import numpy as np
import networkx as nx
from random import sample
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


## Read in data

In [2]:
schema_path = '../knowledge_graph/schema/desiccation.yaml'

In [3]:
onto_full_ents, onto_full_rels = parse_yaml_predictions('../data/ontogpt_output/test_1000_full/output.txt', schema_path)

100%|██████████| 926/926 [00:00<00:00, 110004.40it/s]


In [4]:
onto_slim_ents, onto_slim_rels = parse_yaml_predictions('../data/ontogpt_output/test_1000_slim/output.txt', schema_path)

100%|██████████| 929/929 [00:00<00:00, 112557.29it/s]


In [5]:
uids_to_keep = [splitext(f)[0] for f in listdir('/mnt/scratch/lotrecks/drought_and_des_1000_subset_15Apr2024/')]

In [6]:
with jsonlines.open('../data/dygiepp/model_predictions/09Apr24_dygiepp_test_formatted_data_pickle_predictions.jsonl') as reader:
    dygiepp = [obj for obj in reader if obj['doc_key'] in uids_to_keep]

## Process data
Convert DyGIE++ to csv, then all three to networkx objects.

In [7]:
dygiepp_ents = {'id': [], 'category': [], 'name': [], 'provided_by': []}
dygiepp_rels = {'id': [], 'category': [], 'provided_by': [], 'predicate': [], 'subject': [], 'object': []}
for doc in dygiepp:
    all_text = [tok for sent in doc['sentences'] for tok in sent]
    for sent in doc['predicted_ner']:
        for ent in sent:
            dygiepp_ents['id'].append(np.nan)
            dygiepp_ents['category'].append(ent[2])
            dygiepp_ents['name'].append(' '.join(all_text[ent[0]: ent[1]+1]).lower())
            dygiepp_ents['provided_by'].append(doc['doc_key'])
    for sent in doc['predicted_relations']:
        for rel in sent:
            dygiepp_rels['id'].append(np.nan)
            dygiepp_rels['category'].append(rel[4])
            dygiepp_rels['provided_by'].append(doc['doc_key'])
            dygiepp_rels['predicate'].append(rel[4])
            dygiepp_rels['subject'].append(' '.join(all_text[rel[0]: rel[1]+1]).lower())
            dygiepp_rels['object'].append(' '.join(all_text[rel[2]: rel[3]+1]).lower())
dygiepp_ent_df = pd.DataFrame(dygiepp_ents)
dygiepp_rel_df = pd.DataFrame(dygiepp_rels)

In [8]:
def kgx_to_networkx(ent_df, rel_df, source='onto'):
    """
    Convert two KGX-formatted df's into a networkx graph.
    """
    if source == 'onto':
        nodes = [(row.id, {'ent_type': row.category, 'name': row['name']}) for i, row in ent_df.iterrows()]
    else:
        nodes = [(row.name, {'ent_type': row.category}) for i, row in ent_df.iterrows()]
    edges = [(row.subject, row.object, {'rel_type': row.predicate}) for i, row in rel_df.iterrows()]
    graph = nx.DiGraph()
    _ = graph.add_nodes_from(nodes)
    _ = graph.add_edges_from(edges)
    return graph

In [9]:
dygiepp_graph = kgx_to_networkx(dygiepp_ent_df, dygiepp_rel_df, source='dygiepp')

In [10]:
onto_slim_graph = kgx_to_networkx(onto_slim_ents, onto_slim_rels)

In [11]:
onto_full_graph = kgx_to_networkx(onto_full_ents, onto_full_rels)

In [12]:
nets = {
    'dygiepp': dygiepp_graph,
    'onto_slim': onto_slim_graph,
    'onto_full': onto_full_graph
}

## Compare

### Basic network statistics

In [19]:
for net_name, net in nets.items():
    print(f'In network {net_name}, there are {len(net.nodes)} nodes and {len(net.edges)} edges, with a ratio of {len(net.edges)/len(net.nodes)}.')

In network dygiepp, there are 18360 nodes and 1448 edges, with a ratio of 0.07886710239651416.
In network onto_slim, there are 2940 nodes and 770 edges, with a ratio of 0.2619047619047619.
In network onto_full, there are 2893 nodes and 692 edges, with a ratio of 0.23919806429312132.


The number of edges is bad in all methods, but OntoGPT extracts orders of magnitude more entities.

### Check relation quality
A previous glance at the DyGIE++ relations showed that the majority of them were trivial -- we'll take a look at a few sets of triples from each graph to get an idea of whether or not they're meaningful.

In [14]:
for net_name, net in nets.items():
    trips = [(e1, attrs['rel_type'], e2) for e1, e2, attrs in net.edges(data=True)]
    if net_name in ['onto_slim', 'onto_full']:
        id2name = nx.get_node_attributes(net, 'name')
        updated_trips = []
        for e1, rt, e2 in trips:
            try:
                updated_trip = (id2name[e1], rt, id2name[e2])
                updated_trips.append(updated_trip)
            except KeyError:
                continue
        trips = updated_trips
    to_print = sample(trips, 10)
    print(f'Ten random triples from {net_name}:')
    print('----------------------------------------------------')
    for trip in to_print:
        print(trip)
    print('\n\n\n')

Ten random triples from dygiepp:
----------------------------------------------------
('lq1230', 'activates', 'iaa')
('salt tolerance', 'is-in', 'arabidopsis and grape callus')
('gibberellins', 'activates', 'ramie growth')
('gmnced3s', 'is-in', 'soybean')
('vvmapk9', 'interacts', 'reactive oxygen species')
('dhn1a_s', 'inhibits', 'b. cinerea growth')
('aba', 'activates', "tapp2abb '' -gamma")
('dhn1a_s', 'inhibits', 'lactate dehydrogenase activity')
('dof family factors', 'is-in', 'chinese cabbage')
('abscisic-acid', 'is-in', 'mosses')




Ten random triples from onto_slim:
----------------------------------------------------
('NtNAK', 'ProteinOrganismRelationship', 'tobacco')
('NF-YC', 'ProteinOrganismRelationship', 'Arabidopsis thaliana')
('IVR1.2-3B', 'GeneOrganismRelationship', 'wheat (Triticum aestivum L.)')
('AVP1', 'GeneProteinInteraction', 'H+-PPase')
('YSK2-type dehydrin', 'ProteinOrganismRelationship', 'grape (Vitis vinifera)')
('glutathione peroxidase', 'GeneProteinInteracti

These actually don't look bad at all! Certainly there are not enough of them, and there are a fair share of relations involving NaN or nonsense characters on the part of OntoGPT, but much less bad than I expected overall.

### Check grounding percentages and entity recovery for the slim and full OntoGPT versions
I'm concerned that the slim version, while faster, results in substantially more entities not getting a grounding, so we want to check that here.

In [15]:
slim_groundings = onto_slim_ents[['name', 'id', 'category']].copy()
slim_groundings['name'] = slim_groundings['name'].str.lower()
slim_groundings = slim_groundings.drop_duplicates().reset_index(drop=True)
full_groundings = onto_full_ents[['name', 'id', 'category']].copy()
full_groundings['name'] = full_groundings['name'].str.lower()
full_groundings = full_groundings.drop_duplicates().reset_index(drop=True)

First, let's remove `AUTO` groundings if the same entity also has a true database grounding.

In [66]:
full_groundings

Unnamed: 0,name,id,category
0,abscisic acid,CHEBI:22152,Molecule
1,norway maple,NCBITaxon:4025,Organism
2,sycamore,AUTO:sycamore,Organism
3,-,AUTO:-,Gene
4,acacia holosericea,NCBITaxon:1120455,Organism
...,...,...,...
2887,gene,AUTO:gene,Gene
2888,plants,AUTO:plants,Gene
2889,brassica,NCBITaxon:3705,Organism
2890,plant/algal macrofossils,AUTO:plant/algal%20macrofossils,Molecule


In [103]:
def remove_auto_dups(grounding_df):
    """
    Remove rows with AUTO annotations if there is another row of the same entity with a better grounding.
    """
    counted = grounding_df.groupby(by='name').count()
    duped = counted[counted['id'] >= 2].index.tolist()
    print(f'\nThere are {len(duped)} entities with multiple groundings.')
    to_keep = []
    for dup in duped:
        grounds = grounding_df[grounding_df['name'] == dup].id.values.tolist()
        prefixes = [g.split(':')[0] for g in grounds]
        if 'AUTO' in prefixes:
            # If there's only AUTO, just keep one, the difference is capitalization so it doesn't matter which
            if len(set(prefixes)) == 1:
                ground_to_keep = grounds[0]
            # If there are non-AUTO groundings, check if there's more than one, if so, report and pick randomly for now
            if len(set(prefixes)) > 1:
                potential_grounds = [g for g in grounds if g.split(':')[0] != 'AUTO']
                if len(set(potential_grounds)) == 1:
                    ground_to_keep = potential_grounds[0]
                else:
                    print(f'There is more than one true grounding for entity {dup}:')
                    print(potential_grounds)
                    ground_to_keep = potential_grounds[0]
                    print(f'Choosing first option for now: {ground_to_keep}')
        # Get the category corresponding to the correct grounding
        try:
            cat = grounding_df.loc[(grounding_df['name'] == dup) & (grounding_df['id'] == ground_to_keep), 'category'].values[0]
        except IndexError:
            cat = 'UNKNOWN'
        # Make dict for new row
        to_keep.append({'name': dup, 'id': ground_to_keep, 'category': cat})
    
    # Drop rows with dups, then add back the groundings we keep
    grounding_df = grounding_df[~grounding_df['name'].isin(duped)]
    grounding_df = pd.concat([grounding_df, pd.DataFrame(to_keep)])
    
    return grounding_df

In [104]:
full_grounds = remove_auto_dups(full_groundings)
slim_grounds = remove_auto_dups(slim_groundings)


There are 182 entities with multiple groundings.
There is more than one true grounding for entity na:
['PR:A3DRP3', 'CHEBI:26708']
Choosing first option for now: PR:A3DRP3

There are 150 entities with multiple groundings.
There is more than one true grounding for entity acc:
['CHEBI:18053', 'PR:000022034']
Choosing first option for now: CHEBI:18053


In [105]:
for ty, gr in {'slim': slim_grounds, 'full': full_grounds}.items():
    print(f'\n\nFor {ty} OntoGPT:')
    print(f'There are {len(gr)} unique entities.')
    labs = gr.id.values.tolist()
    counted_prefixes = Counter([i.split(':')[0] for i in labs])
    print('Number of entities grounded to each ontology:')
    for ont, num in counted_prefixes.items():
        print('  ', ont, ':', num)



For slim OntoGPT:
There are 2781 unique entities.
Number of entities grounded to each ontology:
   CHEBI : 348
   NCBITaxon : 286
   AUTO : 2031
   GO : 24
   PR : 92


For full OntoGPT:
There are 2700 unique entities.
Number of entities grounded to each ontology:
   CHEBI : 325
   NCBITaxon : 503
   AUTO : 1762
   GO : 28
   PR : 82


It's looking like changing over to the slim NCBI Taxonomy resulted in a loss of almost half of the species groundings. Let's take a more detailed look. First, let's see what the overlap is between the two sets of NCBI annotations:

In [106]:
slim_ncbi = slim_grounds[slim_grounds['id'].str.contains('NCBITaxon')]
full_ncbi = full_grounds[full_grounds['id'].str.contains('NCBITaxon')]

In [107]:
in_common = set(slim_ncbi['name']).intersection(set(full_ncbi['name']))
print(f'There are {len(in_common)} entities in common between the NCBI-grounded entities of the slim and full OntoGPT runs.')

There are 214 entities in common between the NCBI-grounded entities of the slim and full OntoGPT runs.


This means there are 72 entities that got a grounding with the slim ontology that didn't get one with the full ontology. However, we don't know that the remaining 217 entities were even identified. So let's see if they're in the `AUTO` category:

In [108]:
full_ncbi_names = full_ncbi['name'].values.tolist()
identified_in_slim = slim_grounds[slim_grounds['name'].isin(full_ncbi_names)].copy()
identified_in_slim['prefix'] = identified_in_slim['id'].str.split(':').str[0]

In [109]:
print(f'{len(identified_in_slim)} of the NCBITaxon-gronded names from the full ontology extraction exist in the slim extraction.')
counted_cats = identified_in_slim.groupby(by='prefix').count()
print(f'Of these, {counted_cats.loc["AUTO", "id"]} are AUTO-grounded.')

392 of the NCBITaxon-gronded names from the full ontology extraction exist in the slim extraction.
Of these, 176 are AUTO-grounded.


This indicates that almost 100 entities weren't even identified; let's see what they were:

In [110]:
missing_in_slim = full_ncbi[~full_ncbi['name'].isin(identified_in_slim.name)]

In [114]:
missing_in_slim.name.values.tolist()[:20]

['macrobiotus hufelandi',
 'green gram',
 'black gram',
 'sesamum',
 'anna (lowland)',
 'cassava',
 'heterorhabditis georgiana',
 'acheta domesticus',
 'agrotis ipsilon',
 'diaprepes abbreviatus',
 'musca domestica',
 'plodia interpunctella',
 'solenopsis invicta',
 'tenebrio molitor',
 'heterorhabditis floridensis',
 'heterorhabditis indica',
 'heterorhabditis mexicana',
 'steinernema carpocapsae',
 'steinernema feltiae',
 'steinernema rarum']