# Graph generation comparison
In this notebook, we'll compare the outputs of a few different approaches to  graph building for characteristics like degree skew and entity grounding.

In [37]:
import jsonlines
import pandas as pd
from ontogpt.io.csv_wrapper import parse_yaml_predictions
from os import listdir
from os.path import splitext
import numpy as np
import networkx as nx
from random import sample
from collections import Counter

## Read in data

In [5]:
schema_path = '../knowledge_graph/schema/desiccation.yaml'

In [6]:
onto_full_ents, onto_full_rels = parse_yaml_predictions('../data/ontogpt_output/test_1000_full/output.txt', schema_path)

100%|██████████| 926/926 [00:00<00:00, 117346.23it/s]


In [7]:
onto_slim_ents, onto_slim_rels = parse_yaml_predictions('../data/ontogpt_output/test_1000_slim/output.txt', schema_path)

100%|██████████| 929/929 [00:00<00:00, 105004.54it/s]


In [8]:
uids_to_keep = [splitext(f)[0] for f in listdir('/mnt/scratch/lotrecks/drought_and_des_1000_subset_15Apr2024/')]

In [9]:
with jsonlines.open('../data/dygiepp/model_predictions/09Apr24_dygiepp_test_formatted_data_pickle_predictions.jsonl') as reader:
    dygiepp = [obj for obj in reader if obj['doc_key'] in uids_to_keep]

## Process data
Convert DyGIE++ to csv, then all three to networkx objects.

In [10]:
dygiepp_ents = {'id': [], 'category': [], 'name': [], 'provided_by': []}
dygiepp_rels = {'id': [], 'category': [], 'provided_by': [], 'predicate': [], 'subject': [], 'object': []}
for doc in dygiepp:
    all_text = [tok for sent in doc['sentences'] for tok in sent]
    for sent in doc['predicted_ner']:
        for ent in sent:
            dygiepp_ents['id'].append(np.nan)
            dygiepp_ents['category'].append(ent[2])
            dygiepp_ents['name'].append(' '.join(all_text[ent[0]: ent[1]+1]).lower())
            dygiepp_ents['provided_by'].append(doc['doc_key'])
    for sent in doc['predicted_relations']:
        for rel in sent:
            dygiepp_rels['id'].append(np.nan)
            dygiepp_rels['category'].append(rel[4])
            dygiepp_rels['provided_by'].append(doc['doc_key'])
            dygiepp_rels['predicate'].append(rel[4])
            dygiepp_rels['subject'].append(' '.join(all_text[rel[0]: rel[1]+1]).lower())
            dygiepp_rels['object'].append(' '.join(all_text[rel[2]: rel[3]+1]).lower())
dygiepp_ent_df = pd.DataFrame(dygiepp_ents)
dygiepp_rel_df = pd.DataFrame(dygiepp_rels)

In [11]:
def kgx_to_networkx(ent_df, rel_df, source='onto'):
    """
    Convert two KGX-formatted df's into a networkx graph.
    """
    if source == 'onto':
        nodes = [(row.id, {'ent_type': row.category, 'name': row['name']}) for i, row in ent_df.iterrows()]
    else:
        nodes = [(row.name, {'ent_type': row.category}) for i, row in ent_df.iterrows()]
    edges = [(row.subject, row.object, {'rel_type': row.predicate}) for i, row in rel_df.iterrows()]
    graph = nx.DiGraph()
    _ = graph.add_nodes_from(nodes)
    _ = graph.add_edges_from(edges)
    return graph

In [12]:
dygiepp_graph = kgx_to_networkx(dygiepp_ent_df, dygiepp_rel_df, source='dygiepp')

In [13]:
onto_slim_graph = kgx_to_networkx(onto_slim_ents, onto_slim_rels)

In [14]:
onto_full_graph = kgx_to_networkx(onto_full_ents, onto_full_rels)

In [15]:
nets = {
    'dygiepp': dygiepp_graph,
    'onto_slim': onto_slim_graph,
    'onto_full': onto_full_graph
}

## Compare

### Basic network statistics

In [16]:
for net_name, net in nets.items():
    print(f'In network {net_name}, there are {len(net.nodes)} nodes and {len(net.edges)} edges.')

In network dygiepp, there are 18360 nodes and 1448 edges.
In network onto_slim, there are 2940 nodes and 770 edges.
In network onto_full, there are 2893 nodes and 692 edges.


The number of edges is bad in all methods, but OntoGPT extracts orders of magnitude more entities.

### Check relation quality
A previous glance at the DyGIE++ relations showed that the majority of them were trivial -- we'll take a look at a few sets of triples from each graph to get an idea of whether or not they're meaningful.

In [17]:
for net_name, net in nets.items():
    trips = [(e1, attrs['rel_type'], e2) for e1, e2, attrs in net.edges(data=True)]
    if net_name in ['onto_slim', 'onto_full']:
        id2name = nx.get_node_attributes(net, 'name')
        updated_trips = []
        for e1, rt, e2 in trips:
            try:
                updated_trip = (id2name[e1], rt, id2name[e2])
                updated_trips.append(updated_trip)
            except KeyError:
                continue
        trips = updated_trips
    to_print = sample(trips, 10)
    print(f'Ten random triples from {net_name}:')
    print('----------------------------------------------------')
    for trip in to_print:
        print(trip)
    print('\n\n\n')

Ten random triples from dygiepp:
----------------------------------------------------
('pri', 'activates', 'c')
('abscisic acid and jasmonic acid signaling pathways', 'is-in', 'da92 - 2f6')
('nsc', 'is-in', 'oak and beech saplings')
('lea proteins', 'is-in', 'p. edulis')
('snrk2 gene family', 'is-in', 'zea mays')
('phenylethanoid glycosides', 'is-in', 'scrophularia striate')
('dfi', 'inhibits', 'sorghum')
('water', 'inhibits', 'f-ty')
('aba', 'interacts', 'vacuolar na+ sequestration')
('beta- hydroxyethylhydrazine', 'inhibits', 'polyamine')




Ten random triples from onto_slim:
----------------------------------------------------
('TGFβ', 'GeneGeneInteraction', 'SMAD3')
('NFYA5', 'GeneProteinInteraction', 'NF-YC')
('not provided', 'GeneOrganismRelationship', 'cv. Hira')
('sORFs', 'GeneMoleculeInteraction', 'NA')
('SAMDC', 'GeneProteinInteraction', 'H+-ATPase')
('BDN1', 'GeneOrganismRelationship', 'Boea crassifolia Hemsl')
('PCK', 'GeneProteinInteraction', 'CA')
('CKS2', 'GeneProteinIn

These actually don't look bad at all! Certainly there are not enough of them, and there are a fair share of relations involving NaN or nonsense characters on the part of OntoGPT, but much less bad than I expected overall.

### Check grounding percentages and entity recovery for the slim and full OntoGPT versions
I'm concerned that the slim version, while faster, results in substantially more entities not getting a grounding, so we want to check that here.

In [43]:
slim_groundings = onto_slim_ents[['name', 'id', 'category']].copy()
slim_groundings['name'] = slim_groundings['name'].str.lower()
slim_groundings = slim_groundings.drop_duplicates().reset_index(drop=True)
full_groundings = onto_full_ents[['name', 'id', 'category']].copy()
full_groundings['name'] = full_groundings['name'].str.lower()
full_groundings = full_groundings.drop_duplicates().reset_index(drop=True)

In [45]:
for ty, gr in {'slim': slim_groundings, 'full': full_groundings}.items():
    print(f'\n\nFor {ty} OntoGPT:')
    print(f'There are {len(gr)} unique entities.')
    labs = gr.id.values.tolist()
    counted_prefixes = Counter([i.split(':')[0] for i in labs])
    print('Number of entities grounded to each ontology:')
    for ont, num in counted_prefixes.items():
        print('  ', ont, ':', num)



For slim OntoGPT:
There are 2939 unique entities.
Number of entities grounded to each ontology:
   CHEBI : 348
   NCBITaxon : 285
   AUTO : 2181
   GO : 29
   PR : 96


For full OntoGPT:
There are 2892 unique entities.
Number of entities grounded to each ontology:
   CHEBI : 328
   NCBITaxon : 500
   AUTO : 1944
   GO : 34
   PR : 86


It's looking like changing over to the slim NCBI Taxonomy resulted in a loss of almost half of the species groundings. Let's take a more detailed look:

In [46]:
merged_groundings = slim_groundings.merge(full_groundings, how='outer', on='name', suffixes=('_slim', '_full'))

In [53]:
organisms = merged_groundings[(merged_groundings['category_slim'] == 'Organism') | (merged_groundings['category_full'] == 'Organism')]
overlapping_organisms = organisms.dropna()
mismatched_organisms = overlapping_organisms[overlapping_organisms['category_slim'] != overlapping_organisms['category_full']]
mismatched_organisms

Unnamed: 0,name,id_slim,category_slim,id_full,category_full
16,arabidopsis thaliana,NCBITaxon:3702,Organism,AUTO:Arabidopsis%20thaliana,Gene
17,arabidopsis thaliana,AUTO:Arabidopsis%20thaliana,Gene,NCBITaxon:3702,Organism
46,cotton,NCBITaxon:3635,Organism,AUTO:cotton,Molecule
47,cotton,AUTO:cotton,UNKNOWN,NCBITaxon:3635,Organism
62,rice,NCBITaxon:4530,Organism,AUTO:Rice,UNKNOWN
...,...,...,...,...,...
3014,xylella fastidiosa,AUTO:Xylella%20fastidiosa,Gene,NCBITaxon:2371,Organism
3015,xylella fastidiosa,NCBITaxon:2371,Organism,AUTO:Xylella%20fastidiosa,Gene
3063,hippophae rhamnoides,AUTO:Hippophae%20rhamnoides,UNKNOWN,NCBITaxon:193516,Organism
3082,biochar,AUTO:biochar,Organism,AUTO:biochar,Molecule
