In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import ontology_mapper_new

In [2]:
# Notebook-wide pandas display settings: passing None lifts each limit so
# frames are rendered without row, column, or cell-width truncation.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Let python-dotenv populate the environment first, then look up the API key.
# API_KEY is None when BIOPORTAL_API_KEY is not set.
load_dotenv()
API_KEY = os.environ.get("BIOPORTAL_API_KEY")

In [4]:
# Spot-check the mapper on a single keyword; the returned table is the cell's
# displayed output.
ontology_mapper_new.map_ontology(
    API_KEY,
    ["inchi key"],
    topn=1000,
)

Unnamed: 0,keyword,pref_label,synonyms,definition,class
0,inchi key,Inchi Key,(),"(A condensed, fixed-length version of the InChI, designed to be a more accessible way of indexing chemical substances.,)",https://purl.dsmz.de/schema/InchiKey
1,inchi key,inchi key,"(inchi key,)","(InChIKeys consist of 14 characters resulting from a hash of the connectivity information of the InChI, followed by a hyphen, followed by 9 characters resulting from a hash of the remaining layers of the InChI, followed by a single character indication the version of InChI used, another hyphen, followed by single checksum character,)",MI:0970


In [5]:
# Load the tier-2 dictionary CSV. Every column is read as text, and
# keep_default_na=False leaves empty cells as "" instead of converting to NaN.
tier2 = pd.read_csv(
    "../cdes/RADx-rad_tier2_dict_2024-08-22.csv",
    dtype=str,
    keep_default_na=False,
)

In [6]:
# Preview the first five rows of the dictionary.
tier2.head(5)

Unnamed: 0,Id,Label,Section,Cardinality,Terms,Datatype,Unit,Enumeration,Notes,MissingValueCodes,Provenance,SeeAlso
0,protocol_id,Unique name or identifier for biosensor protocol,Technology Metadata,single,,string,,,example: ddPCR_SARS-CoV-2,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
1,technology_platform,Abbreviation or short label for technology,Technology Metadata,single,,string,,,example: ddPCR,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
2,technology_description,Short description of technology,Technology Metadata,single,,string,,,example: Droplet Digital PCR,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
3,technology_reference,"URL of publication, preprint, or website that describes technology",Technology Metadata,single,,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
4,biorecognition_type,Type of molecular entity used for biosensor fabrication,Technology Metadata,multiple,,string,,"""aptamer""=[aptamer] | ""antibody""=[antibody] | ""antigen""=[antigen] | ""molecular beacon""=[molecular beacon] | ""nanobody""=[nanobody] | ""primer""=[primer] | ""receptor""=[receptor] | ""DNA-oligonucleotide""=[DNA-oligonucleotide] | ""analyte binding peptide""=[analyte binding peptide] | ""enzyme-substrate""=[enzyme-substrate]",Multiple entities can be specified as list if biosenor uses a combination of molecular entities,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,


In [7]:
def split_string_variations(input_string):
    """Return every contiguous word run of an underscore-delimited string.

    The input is split on ``_`` and each contiguous run of the resulting
    words is joined with single spaces. For example ``"a_b_c"`` yields
    ``["a", "a b", "a b c", "b", "b c", "c"]``.

    Empty fragments produced by leading, trailing, or repeated underscores
    are discarded, so ``"a__b"`` yields ``["a", "a b", "b"]`` rather than
    blank or whitespace-padded phrases.

    Args:
        input_string: Underscore-delimited string to expand.

    Returns:
        A sorted list of the unique phrases. The list is empty when the
        input contains no characters other than underscores.
    """
    # Drop empty fragments up front so no phrase is blank, padded with
    # spaces, or contains a double space.
    words = [word for word in input_string.split('_') if word]

    # A set comprehension de-duplicates phrases that arise from repeated words.
    phrases = {
        ' '.join(words[start:stop])
        for start in range(len(words))
        for stop in range(start + 1, len(words) + 1)
    }

    return sorted(phrases)

In [8]:
# Expand each Id into its candidate search phrases, one row per (Id, phrase)
# pair; the original row index is repeated for every phrase of the same Id.
ids = (
    tier2[["Id"]]
    .copy()
    .assign(keyword=lambda frame: frame["Id"].apply(split_string_variations))
    .explode("keyword")
)
print(f"Number of kewords: {len(ids)}")
print(f"Number of unique keywords: {ids['keyword'].nunique()}")

# De-duplicated phrases, in order of first appearance.
keywords = ids["keyword"].unique().tolist()
ids.head()

Number of kewords: 7210
Number of unique keywords: 2411


Unnamed: 0,Id,keyword
0,protocol_id,id
0,protocol_id,protocol
0,protocol_id,protocol id
1,technology_platform,platform
1,technology_platform,technology


In [9]:
# Show which Ids produced the phrase 'inchi key'.
ids[ids["keyword"] == "inchi key"]

Unnamed: 0,Id,keyword
14,immobilization_reagent_inchi_key,inchi key
295,substrate_inchi_key,inchi key


In [12]:
# Map the keywords in chunks (create_chunks is called with 500) and write one
# CSV per chunk, so an interrupted run can be resumed without repeating the
# chunks that already finished.
chunks = ontology_mapper_new.create_chunks(keywords, 500)

for i, chunk in enumerate(chunks):
    chunk_path = f"ontology_mappings_{i}.csv"
    # Resume by checking for the chunk's output file instead of a hard-coded
    # start index: a fresh run produces every chunk, a re-run skips the ones
    # already on disk. Delete a file to force that chunk to be fetched again.
    if os.path.exists(chunk_path):
        continue
    print(f"chunk: {i}")
    mappings = ontology_mapper_new.map_ontology(API_KEY, chunk, topn=1000)
    # Keep only rows with a definition: missing values become "", then rows
    # whose definition is "" or an empty tuple are dropped.
    mappings = mappings.fillna("")
    mappings = mappings.query("definition != ''")
    mappings = mappings[mappings['definition'] != ()]
    mappings.to_csv(chunk_path, index=False)

chunk: 3
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
chunk: 4
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to fetch data (status code 404)
ERROR: Unable to

In [15]:
# Reassemble the per-chunk mapping files into one table.
# NOTE(review): the count of 5 is hard-coded and must match the number of
# ontology_mappings_<i>.csv files written by the chunked mapping step.
map_chunks = []
for i in range(5):
    print(i)
    map_chunks.append(
        pd.read_csv(f"ontology_mappings_{i}.csv", keep_default_na=False)
    )

# Concatenate, drop rows that are exact duplicates, and save the merged table.
all_mappings = pd.concat(map_chunks).drop_duplicates()
print(f"Number of mappings: {len(all_mappings)}")
all_mappings.to_csv("ontology_mappings.csv", index=False)
all_mappings

0
1
2
3
4
Number of mappings: 1350


Unnamed: 0,keyword,pref_label,synonyms,definition,class
0,protocol,Protocol,"('Protocol',)","('A rule which guides how an activity should be performed.',)",http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C42651
1,protocol,procedure specification,"('flow specification', 'workflow specification', 'protocol')","('A specification on the order, transitions and trigger of multiple action specifications that are part of some aggregate action specification. [Allotrope]',)",http://purl.allotrope.org/ontologies/result#AFR:0000938
2,protocol,method development,"('U44 grant', 'N43 grant', 'N44 grant', 'R43 grant', 'U43 grant', 'protocol', 'R42 grant', 'R44 grant', 'STTR grant', 'SBIR grant', 'R41 grant')","('development of protocols, procedures, or processes for conducting biomedical research; most often applies to analytical methodology.',)",http://purl.bioontology.org/ontology/CSP/4001-0081
3,protocol,protocol,(),"('A plan specification which has sufficient level of detail and quantitative information to communicate it between investigation agents, so that different investigation agents will reliably be able to independently reproduce the process.',)",OBI:0000272
4,protocol,Protocol,"('Experiment annotation', 'Experiment report', 'Experiment metadata')","('A human-readable collection of information about about how a scientific experiment or analysis was carried out that results in a specific set of data or results used for further analysis or to test a specific hypothesis.',)",http://edamontology.org/data:2531
5,protocol,Protocol,(),"('A set of instructions needed to perform an activity.',)",https://w3id.org/reproduceme#Protocol
6,platform,Technology Platform Version,"('Technology Platform', 'Technology Platform Version', 'Platform Version', 'Platform', 'platform')","('The specific version (manufacturer, model, etc.) of a technology that is used to carry out a laboratory or computational experiment.',)",http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C45378
7,platform,Platform,(),"('A surface onto which instruments are deployed to collect data',)",http://hadatac.org/ont/vstoi#Platform
8,technology,Technology,"('Technology',)","('Technology is the discipline dealing with the art or science of applying scientific knowledge to practical problems.',)",http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C17187
9,technology,Technology,"('Industrial Arts', 'Arts, Industrial')","('The application of scientific knowledge to practical purposes in any field. It includes methods, techniques, and instrumentation.',)",http://purl.bioontology.org/ontology/MESH/D013672


In [14]:
#ontology_mapper_new.map_ontology(API_KEY, ["surface"], topn=100)