In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import ontology_mapper_new

In [2]:
# Notebook display settings: show every row and column, and never truncate
# the text inside a cell.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Load the Tier 2 data dictionary. Every column is read as text and blank
# cells stay empty strings instead of becoming NaN.
tier2 = pd.read_csv(
    "../cdes/RADx-rad_tier2_dict_2024-08-22.csv",
    dtype=str,
    keep_default_na=False,
)

In [4]:
# Preview the first rows of the Tier 2 data dictionary.
tier2.head()

Unnamed: 0,Id,Label,Examples,Section,Cardinality,Terms,Datatype,Unit,Enumeration,Notes,MissingValueCodes,Provenance,SeeAlso
0,protocol_id,Unique name or identifier for biosensor protocol,ddPCR_SARS-CoV-2,Technology Metadata,single,,string,,,example: ddPCR_SARS-CoV-2,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
1,technology_platform,Abbreviation or short label for technology,ddPCR,Technology Metadata,single,,string,,,example: ddPCR,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
2,technology_description,Short description of technology,Droplet Digital PCR,Technology Metadata,single,,string,,,example: Droplet Digital PCR,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
3,technology_reference,"URL of publication, preprint, or website that describes technology",,Technology Metadata,single,,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
4,biorecognition_type,Type of molecular entity used for biosensor fabrication,,Technology Metadata,multiple,,string,,"""aptamer""=[aptamer] | ""antibody""=[antibody] | ""antigen""=[antigen] | ""molecular beacon""=[molecular beacon] | ""nanobody""=[nanobody] | ""primer""=[primer] | ""receptor""=[receptor] | ""DNA-oligonucleotide""=[DNA-oligonucleotide] | ""analyte binding peptide""=[analyte binding peptide] | ""enzyme-substrate""=[enzyme-substrate]",Multiple entities can be specified as list if biosenor uses a combination of molecular entities,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,


In [5]:
def split_string_variations(input_string):
    """Return every contiguous run of underscore-separated tokens as a phrase.

    The input is split on "_" and each contiguous slice of the resulting
    tokens is joined with single spaces, e.g. "protocol_id" yields
    "id", "protocol" and "protocol id".

    Args:
        input_string: Identifier whose tokens are separated by underscores.

    Returns:
        Alphabetically sorted list of the distinct phrases. Empty tokens
        (from leading, trailing, or doubled underscores) are not filtered.
    """
    tokens = input_string.split('_')
    token_count = len(tokens)

    # A set comprehension de-duplicates phrases that arise from repeated tokens.
    phrases = {
        ' '.join(tokens[start:stop])
        for start in range(token_count)
        for stop in range(start + 1, token_count + 1)
    }

    return sorted(phrases)

In [6]:
# Expand every Id into its candidate keyword phrases, one row per
# (Id, keyword) pair.
ids = tier2[["Id"]].copy()
ids["keyword"] = ids["Id"].apply(split_string_variations)
ids = ids.explode("keyword")
print(f"Number of keywords: {ids.shape[0]}")
print(f"Number of unique keywords: {ids['keyword'].nunique()}")
# Distinct keyword phrases across all Ids.
keywords = list(ids["keyword"].unique())
ids.head()

Number of kewords: 7210
Number of unique keywords: 2411


Unnamed: 0,Id,keyword
0,protocol_id,id
0,protocol_id,protocol
0,protocol_id,protocol id
1,technology_platform,platform
1,technology_platform,technology


In [7]:
# Spot check: list every generated keyword that contains "inchi key".
ids[ids["keyword"].str.contains("inchi key")]

Unnamed: 0,Id,keyword
14,immobilization_reagent_inchi_key,immobilization reagent inchi key
14,immobilization_reagent_inchi_key,inchi key
14,immobilization_reagent_inchi_key,reagent inchi key
295,substrate_inchi_key,inchi key
295,substrate_inchi_key,substrate inchi key


In [8]:
# Load the ontology mapping candidates. Default NA parsing is left on here,
# so blank cells (e.g. an empty `use` flag) are read as NaN.
terms = pd.read_csv("../ontology_mappings_2024-09-24.csv")
terms.head()

Unnamed: 0,use,keyword,pref_label,synonyms,definition,class
0,,protocol,Protocol,"('Protocol',)","('A rule which guides how an activity should be performed.',)",http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C42651
1,,protocol,procedure specification,"('flow specification', 'workflow specification', 'protocol')","('A specification on the order, transitions and trigger of multiple action specifications that are part of some aggregate action specification. [Allotrope]',)",http://purl.allotrope.org/ontologies/result#AFR:0000938
2,,protocol,method development,"('U44 grant', 'N43 grant', 'N44 grant', 'R43 grant', 'U43 grant', 'protocol', 'R42 grant', 'R44 grant', 'STTR grant', 'SBIR grant', 'R41 grant')","('development of protocols, procedures, or processes for conducting biomedical research; most often applies to analytical methodology.',)",http://purl.bioontology.org/ontology/CSP/4001-0081
3,,protocol,protocol,(),"('A plan specification which has sufficient level of detail and quantitative information to communicate it between investigation agents, so that different investigation agents will reliably be able to independently reproduce the process.',)",OBI:0000272
4,y,protocol,Protocol,"('Experiment report', 'Experiment metadata', 'Experiment annotation')","('A human-readable collection of information about about how a scientific experiment or analysis was carried out that results in a specific set of data or results used for further analysis or to test a specific hypothesis.',)",http://edamontology.org/data:2531


In [9]:
# Check whether the mappings file has any candidate for the keyword "inchi key".
terms.query("keyword == 'inchi key'")

Unnamed: 0,use,keyword,pref_label,synonyms,definition,class


In [10]:
# TODO: "inchi key" has no match in the ontology mappings file. The lines below are an unfinished draft of manual mappings (not valid Python yet).
# extra_mappings = [{"use": "y", "keyword": "inchi key", "use": "y", "pref_label": "", "synonyms": "", "definition": "An InChIKey molecular structure is a molecular structure specified as an InChIKey (hashed InChI), which is a fixed length (25 character) condensed digital representation of an InChI chemical structure specification. It uniquely identifies a chemical compound. An InChIKey identifier is not human- nor machine-readable but is more suitable for web searches than an InChI chemical structure specification. [edamontology.org]", "class": ""},
#                   {"use": "y", "immobilization", "keyword": "inchi key", "use": "y", "pref_label": "", "synonyms": "", "definition": "", "class": ""}

In [11]:
# Keep only the rows that carry a `use` flag (unflagged candidates are NaN).
# The null check is limited to the columns the following cells rely on:
# `use` (the flag), `keyword` (the merge key) and `class` (joined into Terms).
# A bare dropna() would also discard a flagged row just because an optional
# field such as pref_label, synonyms or definition is empty.
mapped_terms = terms.dropna(subset=["use", "keyword", "class"])
print(f"Number of mapped terms: {mapped_terms.shape[0]}")
mapped_terms.head()

Number of mapped terms: 385


Unnamed: 0,use,keyword,pref_label,synonyms,definition,class
4,y,protocol,Protocol,"('Experiment report', 'Experiment metadata', 'Experiment annotation')","('A human-readable collection of information about about how a scientific experiment or analysis was carried out that results in a specific set of data or results used for further analysis or to test a specific hypothesis.',)",http://edamontology.org/data:2531
11,y,technology platform,Technology Platform Version,"('Technology Platform', 'Technology Platform Version', 'Platform Version', 'Platform', 'platform')","('The specific version (manufacturer, model, etc.) of a technology that is used to carry out a laboratory or computational experiment.',)",http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C45378
12,y,description,Description,"('Description', 'Descriptive', 'description', 'DESCR', 'Desc', 'Descriptor')","('A written or verbal account, representation, statement, or explanation of something.',)",http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25365
14,y,description,Description,(),"('The text which is used to provide information about a thing.',)",https://w3id.org/reproduceme#Description
18,y,reference,Citation,"('Reference', 'Bibliographic reference')","('Bibliographic data that uniquely identifies a scientific article, book or other published material.', 'A bibliographic reference might include information such as authors, title, journal name, date and (possibly) a link to the abstract or full-text of the article if available.')",http://edamontology.org/data:0970


In [12]:
# Inner join: keep only the (Id, keyword) pairs whose keyword has a mapped
# term, and attach that term's columns.
ids = pd.merge(ids, mapped_terms, on="keyword", how="inner")
print(f"Number of matched terms: {ids.shape[0]}")
ids.head()

Number of matched terms: 2424


Unnamed: 0,Id,keyword,use,pref_label,synonyms,definition,class
0,protocol_id,protocol,y,Protocol,"('Experiment report', 'Experiment metadata', 'Experiment annotation')","('A human-readable collection of information about about how a scientific experiment or analysis was carried out that results in a specific set of data or results used for further analysis or to test a specific hypothesis.',)",http://edamontology.org/data:2531
1,technology_platform,technology platform,y,Technology Platform Version,"('Technology Platform', 'Technology Platform Version', 'Platform Version', 'Platform', 'platform')","('The specific version (manufacturer, model, etc.) of a technology that is used to carry out a laboratory or computational experiment.',)",http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C45378
2,technology_description,description,y,Description,"('Description', 'Descriptive', 'description', 'DESCR', 'Desc', 'Descriptor')","('A written or verbal account, representation, statement, or explanation of something.',)",http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25365
3,technology_description,description,y,Description,(),"('The text which is used to provide information about a thing.',)",https://w3id.org/reproduceme#Description
4,technology_reference,reference,y,Citation,"('Reference', 'Bibliographic reference')","('Bibliographic data that uniquely identifies a scientific article, book or other published material.', 'A bibliographic reference might include information such as authors, title, journal name, date and (possibly) a link to the abstract or full-text of the article if available.')",http://edamontology.org/data:0970


In [13]:
# Shorten full class IRIs to compact, prefixed identifiers.
# The keys are regular expressions (regex=True below), so the dots that are
# part of the literal URLs are escaped to stop them matching any character.
replacements = {
    r"http://ncicb\.nci\.nih\.gov/xml/owl/EVS/Thesaurus\.owl#": "Thesaurus:",
    # TODO: the original author noted this pattern does not always work.
    # It only rewrites values containing the literal "#AFR:" fragment; a class
    # written in any other form is left unchanged.
    # NOTE(review): AFR IRIs are given the same "af-p:" prefix as AFP IRIs —
    # confirm this is intended.
    r"http://purl\.allotrope\.org/ontologies/.*#AFR:": "af-p:",
    r"http://purl\.allotrope\.org/ontologies/process#AFP:": "af-p:",
}
# NOTE(review): classes already stored as "NCIT:..." keep that prefix, so the
# result mixes "NCIT:" and "Thesaurus:" identifiers — confirm this is intended.
ids["class"] = ids["class"].replace(replacements, regex=True)
ids.head(100)

Unnamed: 0,Id,keyword,use,pref_label,synonyms,definition,class
0,protocol_id,protocol,y,Protocol,"('Experiment report', 'Experiment metadata', 'Experiment annotation')","('A human-readable collection of information about about how a scientific experiment or analysis was carried out that results in a specific set of data or results used for further analysis or to test a specific hypothesis.',)",http://edamontology.org/data:2531
1,technology_platform,technology platform,y,Technology Platform Version,"('Technology Platform', 'Technology Platform Version', 'Platform Version', 'Platform', 'platform')","('The specific version (manufacturer, model, etc.) of a technology that is used to carry out a laboratory or computational experiment.',)",Thesaurus:C45378
2,technology_description,description,y,Description,"('Description', 'Descriptive', 'description', 'DESCR', 'Desc', 'Descriptor')","('A written or verbal account, representation, statement, or explanation of something.',)",Thesaurus:C25365
3,technology_description,description,y,Description,(),"('The text which is used to provide information about a thing.',)",https://w3id.org/reproduceme#Description
4,technology_reference,reference,y,Citation,"('Reference', 'Bibliographic reference')","('Bibliographic data that uniquely identifies a scientific article, book or other published material.', 'A bibliographic reference might include information such as authors, title, journal name, date and (possibly) a link to the abstract or full-text of the article if available.')",http://edamontology.org/data:0970
5,biorecognition_type,type,y,Type,"('Kind', 'Type')","('Something distinguishable as an identifiable class based on common qualities.',)",NCIT:C25284
6,target_analyte_type,analyte,y,Analyte,"('Analyte',)","('The sample or material being subjected to analysis.',)",Thesaurus:C128639
7,target_analyte_type,analyte type,y,Analyte Type,"('analyte type', 'analyte_type', 'Analyte Type')","('The type of biospecimen subjected to analysis.',)",Thesaurus:C156434
8,target_analyte_type,type,y,Type,"('Kind', 'Type')","('Something distinguishable as an identifiable class based on common qualities.',)",NCIT:C25284
9,target_analyte_name,analyte,y,Analyte,"('Analyte',)","('The sample or material being subjected to analysis.',)",Thesaurus:C128639


In [14]:
def remove_substrings(keywords):
    """Drop keywords that are sub-phrases of a longer keyword in the same list.

    Keywords are phrases of space-separated tokens (e.g. "acquisition time").
    Candidates are visited longest first; one is kept only if it does not
    occur as a run of whole words inside a keyword that was already kept.
    Matching is done on word boundaries, so "time" is dropped next to
    "acquisition time" but "id" is still kept next to "lipid".

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        List of the surviving keywords, longest first (ties keep their input
        order). Exact duplicates collapse to a single entry.
    """
    kept = []

    # Longest first, so every possible container is seen before its parts.
    for keyword in sorted(keywords, key=len, reverse=True):
        # Padding both sides with a space turns the plain substring test into
        # a whole-word test.
        padded = f" {keyword} "
        if not any(padded in f" {longer} " for longer in kept):
            kept.append(keyword)

    return kept

In [15]:
# For each Id, gather its matched keywords into a list and strip the ones
# contained in a longer keyword of the same Id.
groups = (
    ids.groupby("Id")["keyword"]
    .apply(list)
    .apply(remove_substrings)
    .reset_index()
)

In [16]:
# Function to remove substrings and their corresponding classes
def remove_substrings_with_classes(keywords, classes):
    """Filter parallel keyword/class lists down to the longest phrases.

    Keywords and classes are paired by position and visited from the longest
    keyword to the shortest. A pair is kept only if its keyword does not occur
    as a run of whole words inside a keyword that was already kept. Matching
    is done on word boundaries, so "time" is dropped next to
    "acquisition time" but "id" is still kept next to "lipid".

    Args:
        keywords: Keyword phrases (space-separated tokens).
        classes: Ontology class identifiers, aligned with ``keywords``.

    Returns:
        pandas Series with two elements: the list of kept keywords and the
        list of their classes, in the same order. When a keyword occurs more
        than once, only its first pair (in input order) is kept.
    """
    # Longest keyword first; the sort is stable, so ties keep input order.
    pairs = sorted(zip(keywords, classes), key=lambda pair: len(pair[0]), reverse=True)
    result_keywords = []
    result_classes = []

    for keyword, cls in pairs:
        # Padding both sides with a space turns the plain substring test into
        # a whole-word test.
        padded = f" {keyword} "
        if not any(padded in f" {longer} " for longer in result_keywords):
            result_keywords.append(keyword)
            result_classes.append(cls)

    return pd.Series([result_keywords, result_classes])

# Collect each Id's keywords and classes as parallel lists, then filter every
# row down to its longest keywords and their classes.
grouped = ids.groupby("Id").agg({"keyword": list, "class": list})
grouped = grouped.apply(
    lambda row: remove_substrings_with_classes(row['keyword'], row['class']),
    axis=1,
)

# Name the two result columns and turn the Id index back into a column.
grouped.columns = ['filtered_keywords', 'filtered_classes']
grouped = grouped.reset_index()

In [17]:
# Display the filtered keywords and classes for every Id.
grouped

Unnamed: 0,Id,filtered_keywords,filtered_classes
0,accumulation_time,"[accumulation, time]","[NCIT:C120860, Thesaurus:C25207]"
1,acquisition_time,[acquisition time],[af-p:0001158]
2,acquisition_time_unit,"[acquisition time, time unit]","[af-p:0001158, Thesaurus:C42574]"
3,adsorption_time_max,"[adsorption, time]","[Thesaurus:C157206, Thesaurus:C25207]"
4,adsorption_time_min,"[adsorption, time]","[Thesaurus:C157206, Thesaurus:C25207]"
5,adsorption_time_unit,"[adsorption, time unit]","[Thesaurus:C157206, Thesaurus:C42574]"
6,age_range_max,"[range, age]","[Thesaurus:C38013, Thesaurus:C25150]"
7,age_range_min,"[range, age]","[Thesaurus:C38013, Thesaurus:C25150]"
8,aggregating_site,[site],[Thesaurus:C37901]
9,analyte_binding_peptide_name,"[analyte, binding, peptide]","[Thesaurus:C128639, GO:0005488, Thesaurus:C735]"


In [18]:
# Terms is the space-separated list of the classes that survived filtering.
grouped["Terms"] = grouped["filtered_classes"].map(" ".join)
# Only Id and Terms are needed from here on.
grouped = grouped.loc[:, ["Id", "Terms"]].copy()
grouped.head(25)

Unnamed: 0,Id,Terms
0,accumulation_time,NCIT:C120860 Thesaurus:C25207
1,acquisition_time,af-p:0001158
2,acquisition_time_unit,af-p:0001158 Thesaurus:C42574
3,adsorption_time_max,Thesaurus:C157206 Thesaurus:C25207
4,adsorption_time_min,Thesaurus:C157206 Thesaurus:C25207
5,adsorption_time_unit,Thesaurus:C157206 Thesaurus:C42574
6,age_range_max,Thesaurus:C38013 Thesaurus:C25150
7,age_range_min,Thesaurus:C38013 Thesaurus:C25150
8,aggregating_site,Thesaurus:C37901
9,analyte_binding_peptide_name,Thesaurus:C128639 GO:0005488 Thesaurus:C735


In [19]:
# Swap the dictionary's existing Terms column for the newly derived one.
# Ids without any mapped term get an empty string.
tier2 = (
    tier2.drop(columns="Terms")
    .merge(grouped, on="Id", how="left")
    .fillna("")
)

In [20]:
# Preview the dictionary with the new Terms column (appended as the last column by the merge).
tier2.head(25)

Unnamed: 0,Id,Label,Examples,Section,Cardinality,Datatype,Unit,Enumeration,Notes,MissingValueCodes,Provenance,SeeAlso,Terms
0,protocol_id,Unique name or identifier for biosensor protocol,ddPCR_SARS-CoV-2,Technology Metadata,single,string,,,example: ddPCR_SARS-CoV-2,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,http://edamontology.org/data:2531
1,technology_platform,Abbreviation or short label for technology,ddPCR,Technology Metadata,single,string,,,example: ddPCR,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Thesaurus:C45378
2,technology_description,Short description of technology,Droplet Digital PCR,Technology Metadata,single,string,,,example: Droplet Digital PCR,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Thesaurus:C25365
3,technology_reference,"URL of publication, preprint, or website that describes technology",,Technology Metadata,single,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,http://edamontology.org/data:0970
4,biorecognition_type,Type of molecular entity used for biosensor fabrication,,Technology Metadata,multiple,string,,"""aptamer""=[aptamer] | ""antibody""=[antibody] | ""antigen""=[antigen] | ""molecular beacon""=[molecular beacon] | ""nanobody""=[nanobody] | ""primer""=[primer] | ""receptor""=[receptor] | ""DNA-oligonucleotide""=[DNA-oligonucleotide] | ""analyte binding peptide""=[analyte binding peptide] | ""enzyme-substrate""=[enzyme-substrate]",Multiple entities can be specified as list if biosenor uses a combination of molecular entities,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,NCIT:C25284
5,target_analyte_type,Type of analyte(s) or target(s) to be detected,virus,Technology Metadata,multiple,string,,"""antigen""=[antigen] | ""viral RNA""=[viral RNA] | ""virus""=[virus] | ""antibody""=[antibody] | ""human IgA""=[human IgA] | ""human RBD IgA""=[human RBD IgA] | ""human IgG""=[human IgG] | ""human RBD IgG""=[human RBD IgG] | ""human IgM""=[human IgM] | ""human RBD IgM""=[human RBD IgM] | ""neutralizing antibody""=[neutralizing antibody] | ""EV RNA""=[EV RNA] | ""human microRNA""=[human microRNA] | ""VOC""=[VOC] | ""solid binding peptide""=[solid binding peptide] | ""viral protein""=[viral protein] | ""control""=[control]",example: virus,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Thesaurus:C156434
6,target_analyte_name,Name of target molecule to be detected,Nucleoprotein,Technology Metadata,single,string,,,example: Nucleoprotein,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,af-p:0001604
7,target_analyte_id,"Unique identifer for target molecule or biological entity to be detected (e.g., UniProt Id for a protein, NCBI taxonomy Id for viruses)",P0DTC9,Technology Metadata,single,string,,,example: P0DTC9,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Thesaurus:C128639
8,signal_detection,Type of signal generated by the methods or assay,amerometric,Technology Metadata,single,string,,"""amperometric""=[amperometric] | ""chemiluminescent""=[chemiluminescent] | ""chromatographic""=[chromatographic] | ""colorimetric""=[colorimetric] | ""displacement""=[displacement] | ""electrical resistance""=[electrical resistance] | ""fluorogenic""=[fluorogenic] | ""frequency""=[frequency] | ""gas-chromatographic""=[gas-chromatographic] | ""impedimetric""=[impedimetric] | ""MS chromatogram""=[MS chromatogram] | ""olfactory perception""=[olfactory perception] | ""PCR""=[PCR] | ""Raman scattering intensity""=[Raman scattering intensity] | ""scattering angle""=[scattering angle] | """"=[] | ""voltammetric""=[voltammetric]",example: amerometric,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,http://edamontology.org/operation:2423 Thesaurus:C43382
9,surface,Type of plate or electrode used in the methods or assay,,Technology Metadata/Immobilization,single,string,,,examples: polystyrene | gold electrode | carbon electrode,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,,Thesaurus:C25245


In [21]:
# Put the columns in the output order, with Terms after Cardinality.
tier2 = tier2[
    [
        "Id",
        "Label",
        "Examples",
        "Section",
        "Cardinality",
        "Terms",
        "Datatype",
        "Unit",
        "Enumeration",
        "Notes",
        "MissingValueCodes",
        "Provenance",
        "SeeAlso",
    ]
]

In [22]:
# Display the final data dictionary (all rows, since display.max_rows is None).
tier2

Unnamed: 0,Id,Label,Examples,Section,Cardinality,Terms,Datatype,Unit,Enumeration,Notes,MissingValueCodes,Provenance,SeeAlso
0,protocol_id,Unique name or identifier for biosensor protocol,ddPCR_SARS-CoV-2,Technology Metadata,single,http://edamontology.org/data:2531,string,,,example: ddPCR_SARS-CoV-2,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
1,technology_platform,Abbreviation or short label for technology,ddPCR,Technology Metadata,single,Thesaurus:C45378,string,,,example: ddPCR,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
2,technology_description,Short description of technology,Droplet Digital PCR,Technology Metadata,single,Thesaurus:C25365,string,,,example: Droplet Digital PCR,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
3,technology_reference,"URL of publication, preprint, or website that describes technology",,Technology Metadata,single,http://edamontology.org/data:0970,string,,,,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
4,biorecognition_type,Type of molecular entity used for biosensor fabrication,,Technology Metadata,multiple,NCIT:C25284,string,,"""aptamer""=[aptamer] | ""antibody""=[antibody] | ""antigen""=[antigen] | ""molecular beacon""=[molecular beacon] | ""nanobody""=[nanobody] | ""primer""=[primer] | ""receptor""=[receptor] | ""DNA-oligonucleotide""=[DNA-oligonucleotide] | ""analyte binding peptide""=[analyte binding peptide] | ""enzyme-substrate""=[enzyme-substrate]",Multiple entities can be specified as list if biosenor uses a combination of molecular entities,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
5,target_analyte_type,Type of analyte(s) or target(s) to be detected,virus,Technology Metadata,multiple,Thesaurus:C156434,string,,"""antigen""=[antigen] | ""viral RNA""=[viral RNA] | ""virus""=[virus] | ""antibody""=[antibody] | ""human IgA""=[human IgA] | ""human RBD IgA""=[human RBD IgA] | ""human IgG""=[human IgG] | ""human RBD IgG""=[human RBD IgG] | ""human IgM""=[human IgM] | ""human RBD IgM""=[human RBD IgM] | ""neutralizing antibody""=[neutralizing antibody] | ""EV RNA""=[EV RNA] | ""human microRNA""=[human microRNA] | ""VOC""=[VOC] | ""solid binding peptide""=[solid binding peptide] | ""viral protein""=[viral protein] | ""control""=[control]",example: virus,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
6,target_analyte_name,Name of target molecule to be detected,Nucleoprotein,Technology Metadata,single,af-p:0001604,string,,,example: Nucleoprotein,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
7,target_analyte_id,"Unique identifer for target molecule or biological entity to be detected (e.g., UniProt Id for a protein, NCBI taxonomy Id for viruses)",P0DTC9,Technology Metadata,single,Thesaurus:C128639,string,,,example: P0DTC9,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
8,signal_detection,Type of signal generated by the methods or assay,amerometric,Technology Metadata,single,http://edamontology.org/operation:2423 Thesaurus:C43382,string,,"""amperometric""=[amperometric] | ""chemiluminescent""=[chemiluminescent] | ""chromatographic""=[chromatographic] | ""colorimetric""=[colorimetric] | ""displacement""=[displacement] | ""electrical resistance""=[electrical resistance] | ""fluorogenic""=[fluorogenic] | ""frequency""=[frequency] | ""gas-chromatographic""=[gas-chromatographic] | ""impedimetric""=[impedimetric] | ""MS chromatogram""=[MS chromatogram] | ""olfactory perception""=[olfactory perception] | ""PCR""=[PCR] | ""Raman scattering intensity""=[Raman scattering intensity] | ""scattering angle""=[scattering angle] | """"=[] | ""voltammetric""=[voltammetric]",example: amerometric,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,
9,surface,Type of plate or electrode used in the methods or assay,,Technology Metadata/Immobilization,single,Thesaurus:C25245,string,,,examples: polystyrene | gold electrode | carbon electrode,"""-9960""=[Not Entered By Originator]",RADx-rad DCC,


In [23]:
# Write the updated dictionary to a new, dated CSV file; the row index is not written.
tier2_name = "../cdes/RADx-rad_tier2_dict_2024-09-30.csv"
tier2.to_csv(tier2_name, index=False)