# OntoGPT graph generation
We ran OntoGPT on the desiccation subset of our dataset; here, we'll post-process the output and format it as a graph.

In [9]:
import pandas as pd
from collections import Counter
import networkx as nx

## Read in converted OntoGPT output

### Entities

In [3]:
# Load the converted OntoGPT entity table and preview its first rows.
ent_df = pd.read_csv(
    '../../data/kg/ontogpt_slim_ent_df_20May2024.csv'
)
ent_df.head(n=5)

Unnamed: 0,id,category,name,provided_by
0,AUTO:N/A,Protein,,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6
1,AUTO:lipid%20peroxidation,Molecule,lipid peroxidation,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6
2,CHEBI:16856,Molecule,glutathione,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6
3,CHEBI:29191,Molecule,hydroxyl radical,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6
4,AUTO:Trichilia%20dregeana,Organism,Trichilia dregeana,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6


Problematic entities by ID that we can observe just by scrolling through the list: `AUTO:N/A`, `AUTO:Not%20provided`, `AUTO:Not%20mentioned`, `AUTO:None`, `AUTO:Not%20found`, `AUTO:Unknown`... etc. Quantify prevalence and remove:

In [103]:
# Placeholder entity IDs to be quantified and then removed. Entries are
# compared as exact strings, so each case/punctuation variant is listed
# separately; the order here is the order in which counts are reported.
problem_ents = [
    'AUTO:N/A',
    'AUTO:Not%20provided',
    'AUTO:not%20provided',
    'AUTO:Not%20mentioned',
    'AUTO:None',
    'AUTO:Not%20found',
    'AUTO:Unknown',
    'AUTO:Not%20mentioned%20in%20the%20text.',
    'AUTO:unknown',
    'AUTO:none',
    'AUTO:Not%20applicable',
    'AUTO:not%20available',
    'AUTO:Not%20mentioned.',
    'AUTO:No%20information%20provided.',
    'AUTO:Not%20provided%20in%20the%20text.',
    'AUTO:Not%20specified',
    'AUTO:not%20specified',
    'AUTO:unspecified',
    'AUTO:n/a',
    'AUTO:not%20applicable',
    'AUTO:%5Bnot%20mentioned%5D',
]

In [104]:
# Tally how many rows carry each entity ID, then report the tally for every
# placeholder ID next to the total number of entity rows. Counter returns 0
# for IDs that never occur, so absent placeholders print as 0.
entity_counts = Counter(ent_df['id'].tolist())
print('Prevelance of problematic entities:')
for ent in problem_ents:
    print(f'{ent} -- {entity_counts[ent]}')
print(f'Of a total of {len(ent_df)} entities.')

Prevelance of problematic entities:
AUTO:N/A -- 5237
AUTO:Not%20provided -- 5235
AUTO:not%20provided -- 5170
AUTO:Not%20mentioned -- 5227
AUTO:None -- 5226
AUTO:Not%20found -- 5078
AUTO:Unknown -- 5209
AUTO:Not%20mentioned%20in%20the%20text. -- 4849
AUTO:unknown -- 5177
AUTO:none -- 5176
AUTO:Not%20applicable -- 4678
AUTO:not%20available -- 4655
AUTO:Not%20mentioned. -- 3663
AUTO:No%20information%20provided. -- 2641
AUTO:Not%20provided%20in%20the%20text. -- 2593
AUTO:Not%20specified -- 5016
AUTO:not%20specified -- 5092
AUTO:unspecified -- 5092
AUTO:n/a -- 4259
AUTO:not%20applicable -- 4908
AUTO:%5Bnot%20mentioned%5D -- 3663
Of a total of 36380420 entities.


Not horrible in absolute numbers, but worse than we might like. Also, all of these hallucinated entities have at least 2,000 instances in the dataset, even something as seemingly strange as `AUTO:%5Bnot%20mentioned%5D`. Let's clean them:

In [105]:
# Keep only the rows whose ID is not one of the listed placeholder entities.
is_problem_ent = ent_df['id'].isin(problem_ents)
ent_df_cleaned = ent_df.loc[~is_problem_ent]
ent_df_cleaned.head()

Unnamed: 0,id,category,name,provided_by
1,AUTO:lipid%20peroxidation,Molecule,lipid peroxidation,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6
2,CHEBI:16856,Molecule,glutathione,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6
3,CHEBI:29191,Molecule,hydroxyl radical,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6
4,AUTO:Trichilia%20dregeana,Organism,Trichilia dregeana,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6
6,AUTO:lipid%20peroxidation,Molecule,lipid peroxidation,623dce16-704e-41a7-b549-22982835bff9


In [106]:
# Row-level loss from cleaning: absolute count and share of all entity rows.
n_ents_dropped = len(ent_df) - len(ent_df_cleaned)
pct_ents_dropped = 100 * n_ents_dropped / len(ent_df)
print(f'{n_ents_dropped} total entities were dropped on cleaning ({pct_ents_dropped:.2f}%)')

97844 total entities were dropped on cleaning (0.27%)


In [107]:
# Compare the distinct entity IDs present before and after cleaning.
unique_ents = ent_df.id.unique()
unique_after_cleaning = ent_df_cleaned.id.unique()
# Only placeholder IDs that actually occur in the data can have been dropped.
# Intersect with the observed IDs (de-duplicated, keeping list order) rather
# than assuming every entry of problem_ents is present exactly once; otherwise
# the sanity check below fails spuriously when a listed ID is absent or repeated.
observed_ents = set(unique_ents)
unique_dropped = [ent_id for ent_id in dict.fromkeys(problem_ents) if ent_id in observed_ents]
# Sanity check: the drop in distinct IDs must equal the placeholders removed.
assert len(unique_dropped) == len(unique_ents) - len(unique_after_cleaning)
print(f'This is {len(unique_ents) - len(unique_after_cleaning)} of {len(unique_ents)} unqiue entities lost ({100*(len(unique_ents) - len(unique_after_cleaning))/len(unique_ents):.2f}%)')

This is 21 of 12508 unqiue entities lost (0.17%)


Check how many are truly grounded:

In [113]:
# Share of cleaned entity rows counted as truly grounded, i.e. whose ID does
# not contain the 'AUTO:' marker.
clean_ent_counts = Counter(ent_df_cleaned['id'].tolist())
true_grounded = sum(
    count for ent_id, count in clean_ent_counts.items() if 'AUTO:' not in ent_id
)
total_clean_mentions = sum(clean_ent_counts.values())
print(f'{100*true_grounded/total_clean_mentions:.2f}% of entities have a true grounding.')

23.30% of entities have a true grounding.


### Relations

In [114]:
# Load the converted OntoGPT relation table and preview its first rows.
rel_df = pd.read_csv(
    '../../data/kg/ontogpt_slim_rel_df_20May2024.csv'
)
rel_df.head(n=5)

Unnamed: 0,id,category,provided_by,predicate,subject,object
0,f40835d2-e716-4073-88c5-27eac763fe43,GeneGeneInteraction,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6,GeneGeneInteraction,AUTO:N/A,AUTO:N/A
1,c20331ec-e5aa-4af8-9483-618678e9d6ca,GeneOrganismRelationship,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6,GeneOrganismRelationship,AUTO:N/A,AUTO:N/A
2,d4dd7d59-1118-4577-a870-9a85d7f89f17,ProteinProteinInteraction,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6,ProteinProteinInteraction,AUTO:N/A,AUTO:N/A
3,09f0659d-9ea2-42d6-a331-d076507d012c,ProteinOrganismRelationship,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6,ProteinOrganismRelationship,AUTO:N/A,AUTO:N/A
4,af2466f5-44d4-4224-bf3e-befef9553aa3,GeneMoleculeInteraction,1811d3fb-5fb9-4cbe-aa8b-84b4d4716af6,GeneMoleculeInteraction,AUTO:N/A,AUTO:N/A


We need to get rid of any relations that rely on the problem entities listed above, and quantify the prevalence of both relations with two problem entities and relations with one problem entity.

In [120]:
# First pass: a relation is kept unless BOTH its subject and its object are
# placeholder entities (equivalently, kept if at least one endpoint is fine).
subject_ok = ~rel_df['subject'].isin(problem_ents)
object_ok = ~rel_df['object'].isin(problem_ents)
rel_df_cleaned_just_both = rel_df[subject_ok | object_ok]
n_both_dropped = len(rel_df) - len(rel_df_cleaned_just_both)
print(f'{n_both_dropped} relations were dropped for having both entities be problem entities ({100*n_both_dropped/len(rel_df):.2f}% of initial relations)')

3568 relations were dropped for having both entities be problem entities (44.68% of initial relations)


In [122]:
# Second pass: of the relations that survived the first pass, keep only those
# where neither endpoint is a placeholder entity. The percentage is reported
# against the original relation count, not the first-pass count.
subject_clean = ~rel_df_cleaned_just_both['subject'].isin(problem_ents)
object_clean = ~rel_df_cleaned_just_both['object'].isin(problem_ents)
rel_df_cleaned = rel_df_cleaned_just_both[subject_clean & object_clean]
n_one_dropped = len(rel_df_cleaned_just_both) - len(rel_df_cleaned)
print(f'{n_one_dropped} relations were dropped for having one entity be problem entities ({100*n_one_dropped/len(rel_df):.2f}% of initial relations)')

292 relations were dropped for having one entity be problem entities (3.66% of initial relations)


In [123]:
# Preview the first rows of the relation table left after both cleaning passes.
rel_df_cleaned.head()

Unnamed: 0,id,category,provided_by,predicate,subject,object
7,f295e8fe-b138-42e3-91e3-40d943d2a050,GeneOrganismRelationship,5affdb3c-e22a-4c71-8755-d0bc2efdefcd,GeneOrganismRelationship,AUTO:Alphitobius%20diaperinus,AUTO:Alphitobius%20diaperinus
8,48f8a4d8-1151-4597-917c-9697c0387e4a,ProteinOrganismRelationship,5affdb3c-e22a-4c71-8755-d0bc2efdefcd,ProteinOrganismRelationship,AUTO:Alphitobius%20diaperinus,AUTO:Alphitobius%20diaperinus
16,4e28a40b-68d8-4f57-b1de-5d7b61c6b497,GeneOrganismRelationship,6635dc3a-a6ba-4fd3-9ba5-16d95f8108d7,GeneOrganismRelationship,AUTO:Delia%20antiqua,AUTO:Delia%20antiqua
23,ed9c4fc6-0721-429b-a130-bea4988c41b6,GeneMoleculeInteraction,a1fa005c-4606-4a90-aca3-65f02a03de6e,GeneMoleculeInteraction,AUTO:CsGolS1,CHEBI:17505
25,d4bf6963-8019-4ec4-b452-5e10e692b758,GeneMoleculeInteraction,effe7e16-b4ae-40b1-931c-d7abca0007bd,GeneMoleculeInteraction,AUTO:PAO,CHEBI:32875


Almost 50% of relations were dropped because they were hallucinated around non-existent entities -- not a great look.

## Format into a networkx graph

In [None]:
# Build one (node_id, attribute_dict) tuple per distinct cleaned entity ID.
# A single groupby pass replaces re-filtering the whole entity frame once per
# ID. sort=False keeps IDs in order of first appearance, and groupby's default
# handling of missing keys skips rows whose ID is NaN (which would otherwise
# produce an empty slice and an IndexError below).
nodes = []
for ent_id, all_instances in ent_df_cleaned.groupby('id', sort=False):
    # unique() preserves order of appearance, so when an entity carries several
    # categories the one picked is the first seen: still arbitrary, but
    # reproducible across runs, unlike indexing into a set of strings.
    types = all_instances['category'].unique().tolist()
    if len(types) > 1:
        print(f'Entity {ent_id} has multiple types, choosing one arbitrarily.')
    ent_type = types[0]
    num_mentions = len(all_instances)
    # Drop missing document IDs so the join only ever sees strings.
    docs_of_origin = ', '.join(all_instances['provided_by'].dropna().unique().tolist())
    # First name recorded for this ID; may be NaN if the name is missing.
    semantic_name = all_instances['name'].unique().tolist()[0]
    nodes.append((ent_id, {
        'ent_type': ent_type,
        'num_mentions': num_mentions,
        'docs_of_origin': docs_of_origin,
        'semantic_name': semantic_name
    }))