In [1]:
#Set working directory
import os

# Defaults to the original project location. Set the METAGEN_WORKDIR
# environment variable to run the notebook from a different checkout
# without editing this cell.
os.chdir(os.environ.get('METAGEN_WORKDIR', '/scratch1/sjw6257/MetaGen-Microbiome'))

# Import required packages
import pandas as pd
import yaml
import sys 
import csv
import json
import requests

In [2]:
import itertools
from itertools import product
from tqdm.notebook import tqdm
from collections import defaultdict

In [4]:
## Node Normalization API endpoint and the headers sent with each request
url = "https://nodenormalization-sri.renci.org/1.5/get_normalized_nodes"
headers = {"Content-Type": "application/json"}


In [5]:
## POST request to the API
# NOTE(review): `curies_json` is not defined in any cell visible here; it must
# be created by an earlier cell before this one runs -- confirm on a fresh kernel.

# Reset the result first so a failed request cannot leave a stale value from a
# previous run of this cell in the kernel namespace.
normalized_data = None

# (connect, read) timeout in seconds: without a timeout, requests waits
# indefinitely if the service stops responding.
response = requests.post(url, headers=headers, json=curies_json, timeout=(10, 300))

# Check Response status
if response.status_code == 200:
    print("Request was successful! \n")

    # Parse the JSON response body into Python objects
    normalized_data = response.json()
    #print(normalized_data)  # Pretty-print the JSON response (Optional)
else:
    # Report the failure; `normalized_data` stays None in this case
    print(f"Request failed with status code: {response.status_code}")
    print(response.text)

Request was successful! 



In [6]:
### End of Node Normalization ###

## Node Normalization Result
Because some CURIEs in the Node Normalizer result have no name, I added the MicrobiomeKG `name` column (from the TSV file) to `df` for analysis purposes.
The MicrobiomeKG `name` will not be used in the Node Synonymizer step of the pipeline.

In [10]:
# Load the Node Normalizer results from CSV (path is relative to the working directory)
f_path = 'node_norm_with_microKG_names.csv'
df_data = pd.read_csv(f_path)  # comma is read_csv's default separator

In [11]:
## MicroKG ID vs. NN Identifier (ID)
# Element-wise check: True where `CURIE` equals `Identifier`
comparison = df_data['CURIE'].eq(df_data['Identifier'])

# Keep the mismatching rows, restricted to rows with no NaN in any column
false_rows = df_data.loc[
    df_data.notna().all(axis=1)
    & ~comparison
]

false_rows

Unnamed: 0,CURIE,Identifier,Name,Equivalent Identifiers,Equivalent Name,Types,MicroKG_Name
61,UniProtKB:Q9GZT9,NCBIGene:54583,EGLN1,"NCBIGene:54583, ENSEMBL:ENSG00000135766, HGNC:...","EGLN1, No label, EGLN1, No label, EGLN1 gene, ...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",EGLN1_HUMAN Egl nine homolog 1 (sprot)
161,UniProtKB:P98198,NCBIGene:57198,ATP8B2,"NCBIGene:57198, ENSEMBL:ENSG00000143515, HGNC:...","ATP8B2, No label, ATP8B2, No label, ATP8B2 gen...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",AT8B2_HUMAN Phospholipid-transporting ATPase I...
356,UniProtKB:P06239,NCBIGene:3932,LCK,"NCBIGene:3932, ENSEMBL:ENSG00000182866, HGNC:6...","LCK, No label, LCK, No label, LCK gene, A0A0S2...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",LCK_HUMAN Tyrosine-protein kinase Lck (sprot)
396,UniProtKB:Q9UBZ9,NCBIGene:51455,REV1,"NCBIGene:51455, ENSEMBL:ENSG00000135945, HGNC:...","REV1, No label, REV1, No label, REV1 gene, Q49...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",REV1_HUMAN DNA repair protein REV1 (sprot)
619,UniProtKB:Q9Y251,NCBIGene:10855,HPSE,"NCBIGene:10855, ENSEMBL:ENSG00000173083, HGNC:...","HPSE, No label, HPSE, No label, HPSE gene, HPS...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",HPSE_HUMAN Heparanase (sprot)
742,UniProtKB:P45983,NCBIGene:5599,MAPK8,"NCBIGene:5599, ENSEMBL:ENSG00000107643, HGNC:6...","MAPK8, No label, MAPK8, No label, MAPK8 gene, ...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",MK08_HUMAN Mitogen-activated protein kinase 8 ...
1106,UniProtKB:P10398,NCBIGene:369,ARAF,"NCBIGene:369, ENSEMBL:ENSG00000078061, HGNC:64...","ARAF, No label, ARAF, No label, ARAF gene, A0A...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",ARAF_HUMAN Serine/threonine-protein kinase A-R...
1223,UniProtKB:Q9Y4X5,NCBIGene:25820,ARIH1,"NCBIGene:25820, ENSEMBL:ENSG00000166233, HGNC:...","ARIH1, No label, ARIH1, No label, ARIH1 gene, ...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",ARI1_HUMAN E3 ubiquitin-protein ligase ARIH1 (...
1457,UniProtKB:P00568,NCBIGene:203,AK1,"NCBIGene:203, ENSEMBL:ENSG00000106992, HGNC:36...","AK1, No label, AK1, No label, AK1 gene, KAD1_H...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",KAD1_HUMAN Adenylate kinase isoenzyme 1 (sprot)
1721,UniProtKB:Q02318,NCBIGene:1593,CYP27A1,"NCBIGene:1593, ENSEMBL:ENSG00000135929, HGNC:2...","CYP27A1, No label, CYP27A1, No label, CYP27A1 ...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...","CP27A_HUMAN Sterol 26-hydroxylase, mitochondri..."


In [12]:
## Compare NN `Name` to MicroKG `name`
# Element-wise check: True where `Name` equals `MicroKG_Name`
comparison2 = df_data['Name'].eq(df_data['MicroKG_Name'])

# Note: rows with NaN in any column (not only `Name`) are excluded
false = df_data.loc[
    df_data.notna().all(axis=1)
    & ~comparison2
]

false

Unnamed: 0,CURIE,Identifier,Name,Equivalent Identifiers,Equivalent Name,Types,MicroKG_Name
17,EC:3.1.12.1,EC:3.1.12.1,"""5' to 3' exodeoxyribonuclease (nucleoside 3'-...",EC:3.1.12.1,"""5' to 3' exodeoxyribonuclease (nucleoside 3'-...","biolink:MolecularActivity, biolink:Occurrent, ...",\
61,UniProtKB:Q9GZT9,NCBIGene:54583,EGLN1,"NCBIGene:54583, ENSEMBL:ENSG00000135766, HGNC:...","EGLN1, No label, EGLN1, No label, EGLN1 gene, ...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",EGLN1_HUMAN Egl nine homolog 1 (sprot)
161,UniProtKB:P98198,NCBIGene:57198,ATP8B2,"NCBIGene:57198, ENSEMBL:ENSG00000143515, HGNC:...","ATP8B2, No label, ATP8B2, No label, ATP8B2 gen...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",AT8B2_HUMAN Phospholipid-transporting ATPase I...
333,EC:1.1.1.292,EC:1.1.1.292,"""1,5-anhydro-D-fructose reductase (1,5-anhydro...",EC:1.1.1.292,"""1,5-anhydro-D-fructose reductase (1,5-anhydro...","biolink:MolecularActivity, biolink:Occurrent, ...",\
341,EC:1.6.1.1,EC:1.6.1.1,"""NAD(P)(+) transhydrogenase (Si-specific)""",EC:1.6.1.1,"""NAD(P)(+) transhydrogenase (Si-specific)""","biolink:MolecularActivity, biolink:Occurrent, ...",\
...,...,...,...,...,...,...,...
4922,EC:3.5.1.46,EC:3.5.1.46,"""6-aminohexanoate-oligomer exohydrolase""",EC:3.5.1.46,"""6-aminohexanoate-oligomer exohydrolase""","biolink:MolecularActivity, biolink:Occurrent, ...",\
4990,EC:3.5.1.101,EC:3.5.1.101,"""L-proline amide hydrolase""",EC:3.5.1.101,"""L-proline amide hydrolase""","biolink:MolecularActivity, biolink:Occurrent, ...",\
5030,UniProtKB:Q9HBU6,NCBIGene:55500,ETNK1,"NCBIGene:55500, ENSEMBL:ENSG00000139163, HGNC:...","ETNK1, No label, ETNK1, No label, ETNK1 gene, ...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",EKI1_HUMAN Ethanolamine kinase 1 (sprot)
5049,UniProtKB:Q03426,NCBIGene:4598,MVK,"NCBIGene:4598, ENSEMBL:ENSG00000110921, HGNC:7...","MVK, No label, MVK, No label, MVK gene, B2RDU6...","biolink:Gene, biolink:GeneOrGeneProduct, bioli...",KIME_HUMAN Mevalonate kinase (sprot)


In [13]:
# Rows where `Name`, `Identifier`, or both are missing
missing_name_or_identifier = df_data.loc[
    df_data[['Name', 'Identifier']].isna().any(axis=1)
]
missing_name_or_identifier

Unnamed: 0,CURIE,Identifier,Name,Equivalent Identifiers,Equivalent Name,Types,MicroKG_Name
9,NCBIGene:69618870,,,,,,frlB
123,NCBIGene:69622190,,,,,,aftC
157,NCBIGene:56333716,,,,,,I501_RS09505
194,NCBIGene:70044589,,,,,,hpdB
205,NCBIGene:86883341,,,,,,arnB
...,...,...,...,...,...,...,...
5187,GO:0140868,,,,,,"4,4'-diapophytoene desaturase (4,4'-diapolycop..."
5229,NCBIGene:60545508,,,,,,ftsL
5238,NCBIGene:57313172,,,,,,mtnK
5310,NCBIGene:67459097,,,,,,A39M_RS0122745


In [14]:
# Rows where both `Name` and `Identifier` are missing
missing_both = df_data.loc[
    df_data[['Name', 'Identifier']].isna().all(axis=1)
]
missing_both

Unnamed: 0,CURIE,Identifier,Name,Equivalent Identifiers,Equivalent Name,Types,MicroKG_Name
9,NCBIGene:69618870,,,,,,frlB
123,NCBIGene:69622190,,,,,,aftC
157,NCBIGene:56333716,,,,,,I501_RS09505
194,NCBIGene:70044589,,,,,,hpdB
205,NCBIGene:86883341,,,,,,arnB
...,...,...,...,...,...,...,...
5187,GO:0140868,,,,,,"4,4'-diapophytoene desaturase (4,4'-diapolycop..."
5229,NCBIGene:60545508,,,,,,ftsL
5238,NCBIGene:57313172,,,,,,mtnK
5310,NCBIGene:67459097,,,,,,A39M_RS0122745
