In [1]:
from das.distributed_atom_space import DistributedAtomSpace, QueryOutputFormat
from das.pattern_matcher.pattern_matcher import PatternMatchingAnswer, OrderedAssignment, UnorderedAssignment, CompositeAssignment, Node, Link, Variable, Not, And, Or
from das.database.db_interface import WILDCARD
import warnings
import time
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Single DistributedAtomSpace instance shared by all helpers in this notebook.
das = DistributedAtomSpace()
# Database handle of the atom space; passed to pattern-matcher queries and
# used for node-name lookups by the helpers below.
db = das.db
print("Atoms (nodes, links): " + str(das.count_atoms()))
# Reference counts (nodes, links) -- presumably observed with the 2023_02 and
# 2023_04 data releases; TODO confirm.
# 2023_02 :-->  ( 2584425,  26915329)
# 2023_04 :-->  (11004962, 176057040)


Log initialized. Log file: /tmp/das.log
Atoms (nodes, links): (4381122, 9836656)


In [12]:
class WallClock:
    """Minimal stopwatch that also counts how many work items ("epochs") were timed.

    Typical use: ``start()``, ``epoch()`` once per processed item, ``stop()``,
    then ``print()`` to report the total time and the average per item.
    """

    def __init__(self):
        self.start_time = None  # time.perf_counter() reading taken by start()
        self.wall_time = None   # seconds elapsed between start() and stop()
        self.epochs = 0         # number of items counted through epoch()

    def start(self):
        """Record the starting instant."""
        self.start_time = time.perf_counter()

    def stop(self):
        """Store the time elapsed since ``start()`` in ``wall_time``.

        Raises:
            RuntimeError: if ``start()`` was never called.
        """
        if self.start_time is None:
            raise RuntimeError("WallClock.stop() called before start()")
        self.wall_time = time.perf_counter() - self.start_time

    def epoch(self, n=1):
        """Count ``n`` more processed items."""
        self.epochs += n

    def print(self, text="query"):
        """Print the measured time, plus the average per epoch when epochs were counted.

        Args:
            text: label used for one epoch in the "milliseconds per ..." part.

        Raises:
            RuntimeError: if ``stop()`` has not been called yet.
        """
        if self.wall_time is None:
            raise RuntimeError("WallClock.print() called before stop()")
        if self.wall_time >= 1:
            total_time = f"{self.wall_time:.3f} seconds"
        else:
            total_time = f"{(self.wall_time * 1000):.0f} milliseconds"
        if self.epochs == 0:
            # No per-epoch average to report: print the total alone instead of
            # a dangling pair of empty parentheses.
            print(total_time)
            return
        time_per_epoch = f"{((self.wall_time * 1000) / self.epochs):.3f} milliseconds per {text}"
        print(f"{total_time} ({time_per_epoch})")
            
        
def print_ordered_assignment(assignment):
    """Print one ``key: node name`` line for each entry of ``assignment.mapping``.

    Does nothing when ``assignment`` is None. Node names are resolved through
    the notebook-level ``db`` handle.
    """
    if assignment is None:
        return
    for variable, handle in assignment.mapping.items():
        node_name = db.get_node_name(handle)
        print(f"{variable}: {node_name}")

def print_unordered_assignment(assignment):
    """Print an unordered assignment as ``[symbols] = [node names]``.

    ``assignment.symbols`` and ``assignment.values`` are read as
    key -> repetition-count mappings: every key is repeated ``count`` times,
    then symbols and values are paired positionally (the longer list is
    truncated by ``zip``). Does nothing when ``assignment`` is None.
    """
    if assignment is None:
        return
    # Expand the count mappings into flat lists.
    expanded_symbols = [
        symbol
        for symbol in assignment.symbols
        for _ in range(assignment.symbols[symbol])
    ]
    expanded_values = [
        handle
        for handle in assignment.values
        for _ in range(assignment.values[handle])
    ]
    pairs = list(zip(expanded_symbols, expanded_values))
    mapping_keys = [symbol for symbol, _ in pairs]
    mapping_values = [db.get_node_name(handle) for _, handle in pairs]
    print(f"{mapping_keys} = {mapping_values}")

def print_elapsed_time(start):
    """Print the time elapsed since ``start`` (a ``time.perf_counter()`` reading).

    One second or more is shown in seconds with three decimals; anything
    shorter is shown in whole milliseconds.
    """
    elapsed = time.perf_counter() - start
    message = (
        f"{elapsed:.3f} seconds"
        if elapsed >= 1
        else f"{(elapsed * 1000):.0f} milliseconds"
    )
    print(message)
        
def query(query_obj, log = False, detailed_log = False):
    """Run a pattern-matcher query against ``db`` and return its assignments.

    Args:
        query_obj: expression object exposing ``matched(db, answer)``.
        log: when True, print the elapsed time, the match flag and, on a
            match, the number of answers.
        detailed_log: when True (only allowed together with ``log``), also
            print every assignment, each followed by an empty line.

    Returns:
        The ``assignments`` attribute of the PatternMatchingAnswer filled in
        by the query.
    """
    # detailed_log is only meaningful on top of log.
    assert log or (not detailed_log)
    answer = PatternMatchingAnswer()
    started_at = time.perf_counter()
    matched = query_obj.matched(db, answer)
    if not log:
        return answer.assignments
    print_elapsed_time(started_at)
    print(matched)
    if not matched:
        return answer.assignments
    print(f"{len(answer.assignments)} answers")
    if detailed_log:
        for assignment in answer.assignments:
            kind = type(assignment)
            if kind is OrderedAssignment:
                print_ordered_assignment(assignment)
            elif kind is UnorderedAssignment:
                print_unordered_assignment(assignment)
            elif kind is CompositeAssignment:
                # A composite carries one ordered part and several unordered parts.
                print_ordered_assignment(assignment.ordered_mapping)
                for unordered in assignment.unordered_mappings:
                    print_unordered_assignment(unordered)
            print("")
    return answer.assignments

def get_mappings(q, variable_name):
    """Run query ``q`` and return the node names bound to ``variable_name``.

    Every assignment returned by ``query`` is expected to expose a
    ``mapping`` dict; the handle stored under ``variable_name`` is resolved
    to a node name through ``das.get_node_name``.

    Returns:
        A list with one node name per assignment (empty when nothing matched).
    """
    node_names = []
    for assignment in query(q):
        handle = assignment.mapping[variable_name]
        node_names.append(das.get_node_name(handle))
    return node_names
    

def get_feature_node_handle(name):
    """Return the handle of the first feature node whose ``Schema:feature_name`` is ``name``.

    Looks up the Execution links shaped ``(Schema:feature_name, <any>, Verbatim name)``
    and returns the second target of the first link found.

    Args:
        name: feature name, used as the name of a "Verbatim" node.

    Returns:
        The handle of the matched node, or None when no link matches.
    """
    verbatim_node = das.get_node("Verbatim", name)
    schema_node = das.get_node("Schema", "Schema:feature_name")
    links = das.get_links("Execution", None, [schema_node, WILDCARD, verbatim_node])
    # To be replaced by:
    # assert len(links) > 0
    if len(links) == 0:
        return None
    # Only the first matching link is used; see get_all_feature_node_handles
    # for names that map to several features.
    link = das.get_atom(links[0], output_format=QueryOutputFormat.ATOM_INFO)
    return link["targets"][1]


def build_feature_node(name):
    """Build a "feature" Node object, usable to compose queries, from a feature name.

    Args:
        name: feature name as stored under ``Schema:feature_name``.

    Returns:
        ``Node("feature", <node name>)`` for the first matching feature, or
        None when get_feature_node_handle finds nothing.
    """
    feature_node_handle = get_feature_node_handle(name)
    if feature_node_handle is None:
        return None
    feature_node = das.get_atom(feature_node_handle, output_format=QueryOutputFormat.ATOM_INFO)
    return Node("feature", feature_node["name"])


def get_feature_fb_id(symbol):
    """Get the FB id (``Schema:feature_uniquename`` value) of a feature given its symbol.

    Args:
        symbol: feature symbol; parentheses are rewritten as square brackets
            before the lookup (e.g. "Su(var)205" -> "Su[var]205").

    Returns:
        The name of the node bound to the uniquename, or None when no
        feature node can be built for ``symbol``.

    Raises:
        AssertionError: if the feature does not have exactly one
            ``Schema:feature_uniquename`` link.
    """
    feature_node = build_feature_node(symbol.replace("(", "[").replace(")", "]"))
    # Identity test: ``== None`` would call whatever __eq__ the project Node defines.
    if feature_node is None:
        return None
    uniquename_var = Variable("v1")
    schema = Node("Schema", "Schema:feature_uniquename")
    # searches for an Execution link to the schema "feature_uniquename", node of type "feature"
    # and value "symbol", and any ***atom***??? represented by Variable "v1"
    q = Link("Execution", ordered=True, targets=[schema, feature_node, uniquename_var])
    assignments = query(q)
    assert len(assignments) == 1, (
        f"expected exactly one feature_uniquename for {symbol!r}, got {len(assignments)}"
    )
    id_handle = assignments.pop().mapping['v1'] # handle of "Verbatim" node
    return db.get_node_name(id_handle)


"""
    Get all handles gene node given a gene name (symbol).
    
    Some genes have more than  one uniquename. Eg "AGO2" in the next cell
    Unfortunately, for some DAS setups, the uniquename retrieved by get_feature_fb_id doesn't work. 
    To ensure correct retrieval, the way is to get all FBgn# and test each one... :/
"""    
def get_all_feature_node_handles(name):
    """Return the handles of every feature node whose ``Schema:feature_name`` is ``name``.

    Some names map to more than one feature (e.g. "AGO2"), so every
    Execution link shaped ``(Schema:feature_name, <any>, Verbatim name)`` is
    visited and the second target of each one is collected.

    Args:
        name: feature name, used as the name of a "Verbatim" node.

    Returns:
        A list of handles (one per matching link), or None when no link matches.
    """
    verbatim_node = das.get_node("Verbatim", name)
    schema_node = das.get_node("Schema", "Schema:feature_name")
    links = das.get_links("Execution", None, [schema_node, WILDCARD, verbatim_node])
    # To be replaced by:
    # assert len(links) > 0
    if len(links) == 0:
        return None
    handles = []
    for link in links:
        atom_link = das.get_atom(link, output_format=QueryOutputFormat.ATOM_INFO)
        handles.append(atom_link["targets"][1])
    return handles



"""
    Build a Node object to be used to compose queries. 
    
    Some genes have more than  one uniquename. Eg "AGO2" in the next cell
    Unfortunately, for some DAS setups, the uniquename retrieved by get_feature_fb_id doesn't work. 
    To ensure correct retrieval, the way is to get all FBgn# and test each one... :/
"""
def build_all_feature_nodes(name):
    """Build one "feature" Node object per feature whose name is ``name``.

    Args:
        name: feature name as stored under ``Schema:feature_name``.

    Returns:
        A list of ``Node("feature", <node name>)`` objects usable to compose
        queries, or None when get_all_feature_node_handles finds nothing.
    """
    feature_node_handles = get_all_feature_node_handles(name)
    if feature_node_handles is None:
        return None
    nodes = []
    for feature_node_handle in feature_node_handles:
        feature_node = das.get_atom(feature_node_handle, output_format=QueryOutputFormat.ATOM_INFO)
        nodes.append(Node("feature", feature_node["name"]))
    return nodes
    

"""
    Some genes have more than  one uniquename. Eg "AGO2" in the next cell
    Unfortunately, for some DAS setups, the uniquename retrieved by get_feature_fb_id doesn't work. 
    To ensure correct retrieval, the way is to get all FBgn# and test each one... :/
"""
def get_all_feature_fb_id(symbol):
    """Return the ``Schema:feature_uniquename`` value of every feature named ``symbol``.

    Args:
        symbol: feature symbol; parentheses are rewritten as square brackets
            before the lookup.

    Returns:
        A list with one uniquename per matching feature node, or None when
        no feature node is found.

    Raises:
        AssertionError: if any matched feature does not have exactly one
            ``Schema:feature_uniquename`` link.
    """
    nodes = build_all_feature_nodes(symbol.replace("(", "[").replace(")", "]"))
    if nodes is None:
        return None
    fb_ids = []
    for node in nodes:
        uniq_var = Variable("v1")
        schema = Node("Schema", "Schema:feature_uniquename")
        q = Link("Execution", ordered=True, targets=[schema, node, uniq_var])
        assignments = query(q)
        assert len(assignments) == 1, (
            f"expected exactly one feature_uniquename per feature of {symbol!r}, "
            f"got {len(assignments)}"
        )
        id_handle = assignments.pop().mapping['v1'] # handle of "Verbatim" node
        fb_ids.append(db.get_node_name(id_handle))
    return fb_ids


In [13]:
# Print the FBgn# (uniquename) of each of the 20 NetAct TFs.
# In this small "GeneGroup DAS" "Nipped-B" and "Rbf" are not in the "feature_uniquename" schema
# get_feature_fb_id returns None for a symbol it cannot resolve, hence the str() below.
for name in ["Su(var)205", "Top3beta", "Mef2", "Clk", "Dref", "TfIIB", "Myc", "AGO2", "Nipped-B", 
             "Cp190", "TfIIA-L", "Trl", "ash1", "Raf", "Abd-B", "Orc2", "Rbf", "mof", "msl-1", "Hmr"]:
    fb_id = get_feature_fb_id(name)
    print(str(fb_id) + "  ::---->    " + name)

# Symbols that map to several features: list every uniquename found for each.
print("\n\nAll FBgn# for gene Argonaut:")
fb_ids = get_all_feature_fb_id("AGO2")
print (fb_ids)
print("\n\nAll FBgn# for gene cap-n-collar-RG:")
fb_ids = get_all_feature_fb_id("cnc-RG")
print (fb_ids)
print("\n\nAll FBgn# for gene Diedel:")
fb_ids = get_all_feature_fb_id("Diedel")
print (fb_ids)

FBgn0003607  ::---->    Su(var)205
FBgn0026015  ::---->    Top3beta
FBgn0011656  ::---->    Mef2
FBgn0023076  ::---->    Clk
FBgn0015664  ::---->    Dref
FBgn0004915  ::---->    TfIIB
FBgn0262656  ::---->    Myc
FBgn0087035  ::---->    AGO2
FBgn0026401  ::---->    Nipped-B
FBgn0000283  ::---->    Cp190
FBgn0011289  ::---->    TfIIA-L
FBgn0013263  ::---->    Trl
FBgn0005386  ::---->    ash1
FBgn0003079  ::---->    Raf
FBgn0000015  ::---->    Abd-B
FBgn0015270  ::---->    Orc2
FBgn0015799  ::---->    Rbf
FBgn0014340  ::---->    mof
FBgn0005617  ::---->    msl-1
FBgn0001206  ::---->    Hmr


All FBgn# for gene Argonaut:
['FBgn0087035', 'FBgn0046812']


All FBgn# for gene cap-n-collar-RG:
['FBtr0306750', 'FBtr0084394']


All FBgn# for gene Diedel:
['Q9VAK8-IPR025061-PF13164-FBpp0084794-1-tr', 'FBgn0039666', 'Q9VWM9-IPR025061-PF13164-FBpp0310706-1-tr', 'A0A0B4K6M8-IPR025061-PF13164-FBpp0297389-1-tr', 'Q9VWM9-IPR025061-PF13164-FBpp0310706-1-tr-exon0', 'A0A0B4K6M8-IPR025061-PF13164-FBpp029738

In [28]:
# General building block

def get_feature_entities_list(feature_identifier, another_entity_table_column_name, feature_identifier_type="symbol"):
    """
        Gets a ***list of entities*** stored under the Schema node named
        "Schema:<another_entity_table_column_name>" for a given feature.
        Most times the returned list will contain only one element.

        Args:
            feature_identifier (str): string that identifies a feature. Most frequently it will be a gene symbol
                                      or a uniquename (FBid).
            another_entity_table_column_name (str): name of the Schema node (without the "Schema:" prefix) of the
                                                    second element of the column pair that is related to
                                                    feature_identifier. Parentheses are rewritten as square brackets.
            feature_identifier_type (str, optional): how feature_identifier must be interpreted:
                                                     "symbol" (default): a gene symbol, first translated to its uniquename.
                                                     "FBid": a current uniquename. Eg: FBgn0000166, FBpp0310706.
                                                     "self": use feature_identifier verbatim.

        Returns:
            A list of feature-related entities (strings), or None when feature_identifier_type is "symbol"
            and the symbol cannot be resolved.

        Raises:
            ValueError: if feature_identifier_type is not one of "symbol", "FBid" or "self".
    """
    if feature_identifier_type == "symbol":
        return _get_feature_entities_list(feature_identifier, another_entity_table_column_name)
    if feature_identifier_type not in ("FBid", "self"):
        # Fail with an explicit message instead of falling through to an
        # unbound query variable.
        raise ValueError(
            f"unknown feature_identifier_type: {feature_identifier_type!r} "
            '(expected "symbol", "FBid" or "self")'
        )
    n1 = Node("Verbatim", feature_identifier)
    v1 = Variable("v1")
    s = Node("Schema", f"Schema:{another_entity_table_column_name.replace('(', '[').replace(')', ']')}")
    q1 = Link("Execution", ordered=True, targets=[s, n1, v1])
    return get_mappings(q1, "v1")

In [29]:
# Look the next cell for examples of using this query
def _get_feature_entities_list(gene_symbol, another_entity_table_column_name):
    """Return the entities linked to a gene symbol under a given Schema column.

    The symbol is first translated to its uniquename with get_feature_fb_id;
    the uniquename is then looked up under
    "Schema:<another_entity_table_column_name>" (parentheses rewritten as
    square brackets).

    Returns:
        A list of node names, or None when the symbol cannot be resolved.
    """
    fb_id = get_feature_fb_id(gene_symbol)
    if fb_id is None:
        return None
    n1 = Node("Verbatim", fb_id)
    v1 = Variable("v1")
    s = Node("Schema", f"Schema:{another_entity_table_column_name.replace('(', '[').replace(')', ']')}")

    q1 = Link("Execution", ordered=True, targets=[s, n1, v1])
    return get_mappings(q1, "v1")

In [35]:
    symbols_list = ["Top3beta", "Mef2", "Clk", "Myc", "Abd-B"]
symbols_list = ["Su(var)205", "Top3beta", "Mef2", "Clk", "Dref", "TfIIB", "Myc", "AGO2", "Nipped-B", 
                "Cp190", "TfIIA-L","Trl", "ash1", "Raf", "Abd-B", "Orc2", "Rbf", "mof", "msl-1", "Hmr"]
symbols_list = ["Top3beta"]               
for gene_symbol in symbols_list:
    print(f'\n\nGene {gene_symbol}:')
    print(f'{gene_symbol} (fbgn_fbtr_fbpp_expanded) organism: {get_feature_entities_list(gene_symbol, "fbgn_fbtr_fbpp_expanded_organism")}')
    print(f'{gene_symbol} (Dmel_enzyme_data) GO terms: {get_feature_entities_list(gene_symbol, "Dmel_enzyme_data_gene_group_GO_id(s)")}') 
    print(f'{gene_symbol} (Dmel_enzyme_data) gene group id (FBgg#): {get_feature_entities_list(gene_symbol, "Dmel_enzyme_data_gene_group_id")}') 
    print(f'{gene_symbol} (Dmel_enzyme_data) gene_group_EC_number(s): {get_feature_entities_list(gene_symbol, "Dmel_enzyme_data_gene_group_EC_number(s)")}') 
    print(f'{gene_symbol} (Dmel_enzyme_data) gene_EC_number(s): {get_feature_entities_list(gene_symbol, "Dmel_enzyme_data_gene_EC_number(s)")}')     
    print(f'{gene_symbol} (Dmel_enzyme_data) gene_name: {get_feature_entities_list(gene_symbol, "Dmel_enzyme_data_gene_name")}')     
    
    print(f'{gene_symbol} (gene_map_table) recombination_loc: {get_feature_entities_list(gene_symbol, "gene_map_table_recombination_loc")}') 
    print(f'{gene_symbol} (gene_map_table) cytogenetic_loc: {get_feature_entities_list(gene_symbol, "gene_map_table_cytogenetic_loc")}') 

    print(f'{gene_symbol} (best_gene_summary) Summary source: {get_feature_entities_list(gene_symbol, "best_gene_summary_Summary_Source")}') 
    print(f'{gene_symbol} (best_gene_summary) Summary: {get_feature_entities_list(gene_symbol, "best_gene_summary_Summary")}') 



Gene Top3beta:
Top3beta (fbgn_fbtr_fbpp_expanded) organism: ['Dmel']
Top3beta (Dmel_enzyme_data) GO terms: ['GO:0003917']
Top3beta (Dmel_enzyme_data) gene group id (FBgg#): ['FBgg0001554']
Top3beta (Dmel_enzyme_data) gene_group_EC_number(s): ['EC 5.6.2.1']
Top3beta (Dmel_enzyme_data) gene_EC_number(s): ['EC 5.6.2.1']
Top3beta (Dmel_enzyme_data) gene_name: ['Topoisomerase 3beta']
Top3beta (gene_map_table) recombination_loc: ['1-16']
Top3beta (gene_map_table) cytogenetic_loc: ['5E5-5E5']
Top3beta (dmel_human_orthologs_disease) Human_gene_symbol: []
Top3beta (dmel_human_orthologs_disease) Dmel_gene_ID: ['FBgn0026015']
Top3beta (dmel_human_orthologs_disease) Human_gene_symbol: ['TOP3B']
Top3beta (dmel_human_orthologs_disease) Human_gene_symbol: ['TOP3A']
Top3beta (dmel_human_orthologs_disease) Dmel_gene_symbol: ['Top3beta']
Top3beta (best_gene_summary) Summary source: ['FlyBase Gene Snapshot']
Top3beta (best_gene_summary) Summary: ['Topoisomerase 3beta [Top3beta] encodes an RNA topoisome

In [78]:
# Allele / diseases related gene data:
# Alternative symbol lists; only the LAST uncommented assignment takes effect.
# NOTE(review): the recorded run of this cell aborted with
# "ValueError: Invalid handle"; the root cause is not visible in this cell.
symbols_list = ["Top3beta", "Mef2", "Clk", "Myc", "Abd-B"]
symbols_list = ["Su(var)205", "Top3beta", "Mef2", "Clk", "Dref", "TfIIB", "Myc", "AGO2", "Nipped-B", 
                "Cp190", "TfIIA-L","Trl", "ash1", "Raf", "Abd-B", "Orc2", "Rbf", "mof", "msl-1", "Hmr"]
symbols_list = ["Top3beta"]
symbols_list = ["Raf"]
#symbols_list = ["Rbf"]
for gene_symbol in symbols_list:
    print(f'\n\nGene {gene_symbol}:')
    # get_feature_entities_list returns None when the symbol cannot be
    # resolved; fall back to an empty list so the loop is skipped instead of
    # raising TypeError.
    alleles_ids = get_feature_entities_list(gene_symbol, "fbal_to_fbgn_AlleleID") or []
    
    # relating another allele info... interaction FBrf#: Top3β[hs.PP] is a non-suppressor of lethal | recessive phenotype of Top3α[191]
    for allele_id in alleles_ids: 
        print(f'{allele_id} (allele_genetic_interactions) Allele symbol: '
              f'{get_feature_entities_list(allele_id, "allele_genetic_interactions_allele_symbol", feature_identifier_type="FBid")}')
        print(f'{allele_id} (allele_genetic_interactions) interaction: '
             f'{get_feature_entities_list(allele_id, "allele_genetic_interactions_interaction", feature_identifier_type="FBid")}')
        
    print(f'{gene_symbol} (dmel_human_orthologs_disease) OMIM_Phenotype_IDs[name]: {get_feature_entities_list(gene_symbol, "dmel_human_orthologs_disease_OMIM_Phenotype_IDs[name]")}') 
    # THIS WOULD RETURN [] BECAUSE THERE IS NO PAIR FROM "feature_uniquename" TO "dmel_human_orthologs_disease_Human_gene_symbol"
    # BUT THERE IS FROM "dmel_human_orthologs_disease_Dmel_gene_ID" TO "dmel_human_orthologs_disease_Human_gene_symbol" THAT SHOULD BE USED
    print(f'{gene_symbol} (dmel_human_orthologs_disease) Human_gene_symbol: {get_feature_entities_list(gene_symbol, "dmel_human_orthologs_disease_Human_gene_symbol")}') 
    #print(f'{gene_symbol} (dmel_human_orthologs_disease) Dmel_gene_ID: {get_feature_entities_list(gene_symbol, "dmel_human_orthologs_disease_Dmel_gene_ID", feature_identifier_type="self")}')     
    human_HGNC_ids = get_feature_entities_list(gene_symbol, "dmel_human_orthologs_disease_Human_gene_HGNC_ID")
    print(f'{gene_symbol} (dmel_human_orthologs_disease) Human_gene_HGNC_ID: {human_HGNC_ids}')
    # Same None guard as above; the raw value is still what gets printed.
    for hgnc_id in human_HGNC_ids or []:
        print(f'{gene_symbol} (dmel_human_orthologs_disease) Human_gene_symbol: {get_feature_entities_list(hgnc_id, "dmel_human_orthologs_disease_Human_gene_symbol", feature_identifier_type="self")}') 
    print(f'{gene_symbol} (dmel_human_orthologs_disease) Dmel_gene_symbol: {get_feature_entities_list(gene_symbol, "dmel_human_orthologs_disease_Dmel_gene_symbol")}')     
    print(f'{gene_symbol} (dmel_human_orthologs_disease) Human_gene_OMIM_ID: {get_feature_entities_list(gene_symbol, "dmel_human_orthologs_disease_Human_gene_OMIM_ID")}') 




Gene Raf:
FBal0294413 (allele_genetic_interactions) Allele symbol: []
FBal0294413 (allele_genetic_interactions) interaction: []
FBal0013773 (allele_genetic_interactions) Allele symbol: []
FBal0013773 (allele_genetic_interactions) interaction: []
FBal0374656 (allele_genetic_interactions) Allele symbol: []
FBal0374656 (allele_genetic_interactions) interaction: []
FBal0245477 (allele_genetic_interactions) Allele symbol: ['Raf[JF01185]']
FBal0245477 (allele_genetic_interactions) interaction: ['Raf[JF01185], Scer\\GAL4[tin.CΔ4] is a non-suppressor of adult heart phenotype of Scer\\GAL4[tin.CΔ4], yki[S168A.UAS.Tag:V5]']
FBal0103602 (allele_genetic_interactions) Allele symbol: ['Raf[LE78]']
FBal0103602 (allele_genetic_interactions) interaction: ['Raf[LE78] is a non-suppressor of phenotype of cic[1]']
FBal0127909 (allele_genetic_interactions) Allele symbol: []
FBal0127909 (allele_genetic_interactions) interaction: []
FBal0372506 (allele_genetic_interactions) Allele symbol: []
FBal0372506 (al

ValueError: Invalid handle: 5985381cefe9a1115d36d5f933b1a3e7

In [75]:
# Evaluation "Predicate has_name" (List "Enzyme EC 1.1.1.1" "Concept alcohol dehydrogenase"))

# (Execution "Schema Schema:grp_uniquename" "grp 162" "Verbatim FBgg0000201")
# (Execution "Schema Schema:public.cvterm" "grp 162" "Concept Concept:public.cvterm_505")
# (Execution "Schema Schema:grp_uniquename" "grp 163" "Verbatim FBgg0000202")
# (Execution "Schema Schema:public.cvterm" "grp 163" "Concept Concept:public.cvterm_505")
# (Execution "Schema Schema:grp_is_analysis" "grp 163" "Concept Concept:False")

# (Evaluation "Predicate has_name" (List "Enzyme EC 1.1.1.7" "Concept propanediol-phosphate dehydrogenase"))
#parent_query = Link("Evaluation", ordered=True, targets=[group_node, parent_var])

def get_EC_name(ec_number):
    """Return the names bound by the "has_name" predicate to an EC number.

    Numbers starting with "EC" are looked up as "Enzyme" nodes, anything
    else as "EnzymeOntology" nodes.

    Returns:
        A list of node names (possibly empty).
    """
    # NOTE(review): the sample expressions in the comments of this cell nest
    # the enzyme and its name inside a List link, while this query passes them
    # as direct Evaluation targets; the recorded run returned [] for
    # "EC 5.6.2.1". Confirm the expected link shape.
    node_type = "Enzyme" if ec_number.startswith("EC") else "EnzymeOntology"
    enzyme_node = Node(node_type, ec_number)
    name_var = Variable("v1")
    has_name_predicate = Node("Predicate", "has_name")
    name_query = Link("Evaluation", ordered=True, targets=[has_name_predicate, enzyme_node, name_var])
    return get_mappings(name_query, "v1")


In [76]:
import json

# GO dictionary for knowing correct GO nodes types.
# The path is relative to the notebook's working directory; the recorded run
# of this cell failed with FileNotFoundError because the file was missing.
# go_plus_dict is indexed by GO term id in get_GO_term_name.
with open("dict_data/go-namespace.json", "r") as ns:
    go_plus_dict = json.load(ns)

   
def get_GO_term_name(go_term):
    """Return the first name bound by the "has_name" predicate to a GO term.

    The node type of the term is read from the module-level ``go_plus_dict``
    (GO id -> namespace).

    Args:
        go_term: Gene Ontology id; must start with "GO".

    Returns:
        The first node name found for the term.

    Raises:
        ValueError: if ``go_term`` does not start with "GO".
        KeyError: if ``go_term`` is not a key of ``go_plus_dict``.
        IndexError: if no name is found for the term.
    """
    if not go_term.startswith('GO'):
        # Without a namespace there is no node type to query; fail explicitly
        # instead of hitting an unbound local further down.
        raise ValueError(f"not a Gene Ontology id: {go_term!r}")
    # finds go_term type (BP, MF, CC)
    go_namespace = go_plus_dict[go_term]
    go_node = Node(go_namespace, go_term)
    v1 = Variable("v1")
    has_name_pred_node = Node("Predicate", "has_name")

    q1 = Link("Evaluation", ordered=True, targets=[has_name_pred_node, go_node, v1])
    return get_mappings(q1, "v1")[0]

FileNotFoundError: [Errno 2] No such file or directory: 'dict_data/go-namespace.json'

In [79]:
#  GO terms / predicates
# ExplorEnz number / names

# Alternative symbol lists; only the LAST assignment takes effect.
symbols_list = ["Top3beta", "Mef2", "Clk", "Myc", "Abd-B"]
symbols_list = ["Su(var)205", "Top3beta", "Mef2", "Clk", "Dref", "TfIIB", "Myc", "AGO2", "Nipped-B", 
                "Cp190", "TfIIA-L","Trl", "ash1", "Raf", "Abd-B", "Orc2", "Rbf", "mof", "msl-1", "Hmr"]
symbols_list = ["Top3beta"]
for gene_symbol in symbols_list:
    print(f'\n\nGene {gene_symbol}:') 
 
    print(f'{gene_symbol} (Dmel_enzyme_data) gene_group_EC_number(s): {get_feature_entities_list(gene_symbol, "Dmel_enzyme_data_gene_group_EC_number(s)")}')
    # get_feature_entities_list returns None for a symbol it cannot resolve;
    # treat that as "no entries" so the loops below are skipped instead of
    # raising TypeError.
    ec_numbers = get_feature_entities_list(gene_symbol, "Dmel_enzyme_data_gene_EC_number(s)") or []
    for ec_number in ec_numbers:
        print(f'{gene_symbol} (Dmel_enzyme_data) gene_EC_number(s)/(explorEnz has_name): {ec_number} / {get_EC_name(ec_number)}')
        
    go_terms = get_feature_entities_list(gene_symbol, "Dmel_enzyme_data_gene_group_GO_id(s)") or []
    for go_term in go_terms:
        #print(f'{gene_symbol} (Dmel_enzyme_data) GO term: {get_GO_term_name(go_term)}')    
        print(f'{gene_symbol} (Dmel_enzyme_data) GO term: {go_term}')    



Gene Top3beta:
Top3beta (Dmel_enzyme_data) gene_group_EC_number(s): ['EC 5.6.2.1']
Top3beta (Dmel_enzyme_data) gene_EC_number(s)/(explorEnz has_name): EC 5.6.2.1 / []
Top3beta (Dmel_enzyme_data) GO term: GO:0003917


In [14]:
"""
About HGNC groups. 
The "HGNC_family_ID" (eg 705) is a group id; not a gene id (which is like HGNC:705)
Flybase assigned 705 to the PTP (PROTEIN TYROSINE PHOSPHATASES) group that corresponds to this: 
https://www.genenames.org/data/genegroup/#!/group/705 (Class I classical Cys-based phosphatases)

So, the get_groups_HGNC_ids function returns a list of pairs (id, id_URL), because FlyBase states that more than
one HGNC id may exist for a given FlyBase group, and the table "gene_groups_HGNC_fb*.tsv" reflects that.

get_groups_HGNC_ids method output:

For the current example (Flybase PTP group) it is:

[(705, https://www.genenames.org/data/genegroup/#!/group/705)]


For multiple HGNC ids with input of "FBgg0000112":

FB_group_id	FB_group_symbol	FB_group_name	HGNC_family_ID
FBgg0000112	INX	            INNEXINS	    314
FBgg0000112	INX	            INNEXINS	    288

it outputs:

[ 
    (314, https://www.genenames.org/data/genegroup/#!/group/314), 
    (288, https://www.genenames.org/data/genegroup/#!/group/288)
]
"""
# 1
# Gets the group ***symbol*** from the "group_id" (FBgg#) as stored in the "Schema Node" whose name is "table_name".
def get_group_symbol(group_id, table_name):
    """Return the first group symbol stored for ``group_id`` under
    "Schema:<table_name>_FB_group_symbol".

    Raises:
        IndexError: if no symbol is found.
    """
    group_node = Node("Verbatim", group_id)
    symbol_var = Variable("v1")
    schema_node = Node("Schema", f"Schema:{table_name}_FB_group_symbol")
    symbol_query = Link("Execution", ordered=True, targets=[schema_node, group_node, symbol_var])
    symbols = get_mappings(symbol_query, "v1")
    return symbols[0]

# 2
# Gets the group ***name*** from the "group_id" (FBgg#) as stored in the "Schema Node" whose name is "table_name".
def get_group_name(group_id, table_name):
    """Return the first group name stored for ``group_id`` under
    "Schema:<table_name>_FB_group_name".

    Raises:
        IndexError: if no name is found.
    """
    group_node = Node("Verbatim", group_id)
    name_var = Variable("v1")
    schema_node = Node("Schema", f"Schema:{table_name}_FB_group_name")
    name_query = Link("Execution", ordered=True, targets=[schema_node, group_node, name_var])
    names = get_mappings(name_query, "v1")
    return names[0]

# 3
# Gets a ***list*** of HGNC ids for the group designated by "group_id".
def get_groups_HGNC_ids(group_id):
    """Return the HGNC family ids of a group as ``(id, URL)`` pairs.

    The ids are read from "Schema:gene_groups_HGNC_HGNC_family_ID"; each URL
    is the genenames.org gene-group page built from the id.

    Returns:
        A list of ``(id, url)`` tuples (empty when the group has no HGNC id).
    """
    group_node = Node("Verbatim", group_id)
    family_var = Variable("v1")
    schema_node = Node("Schema", "Schema:gene_groups_HGNC_HGNC_family_ID")
    family_query = Link("Execution", ordered=True, targets=[schema_node, group_node, family_var])
    return [
        (family_id, f"https://www.genenames.org/data/genegroup/#!/group/{family_id}")
        for family_id in get_mappings(family_query, "v1")
    ]

# 4
# Gets a ***list of groups ids*** (FBgg#) from the "group_symbol" as stored in the "Schema Node" whose name is "table_name".    
def get_groups_ids(gene_symbol, table_name):
    """Return the group ids (FBgg#) stored for a gene under
    "Schema:<table_name>_FB_group_id".

    The gene symbol is first translated to its uniquename with
    get_feature_fb_id.

    Returns:
        A list of group ids, or None when the symbol cannot be resolved.
    """
    fb_id = get_feature_fb_id(gene_symbol)
    if fb_id is None:
        return None
    n1 = Node("Verbatim", fb_id)
    v1 = Variable("v1")
    s = Node("Schema", f"Schema:{table_name}_FB_group_id")
    q1 = Link("Execution", ordered=True, targets=[s, n1, v1])
    return get_mappings(q1, "v1")

# 5
# gets group id(s) (FBgg#) of the gene designated by "gene_symbol"
def get_gene_groups_ids(gene_symbol):
    """Return the group ids of ``gene_symbol`` from the "gene_group_data" table (see get_groups_ids)."""
    return get_groups_ids(gene_symbol, "gene_group_data")

# 6
# gets PATHWAY group id(s) (FBgg#) of the gene designated by "gene_symbol"
def get_pathway_groups_ids(gene_symbol):
    """Return the group ids of ``gene_symbol`` from the "pathway_group_data" table (see get_groups_ids)."""
    return get_groups_ids(gene_symbol, "pathway_group_data")    
    

# 7
# Gets a ***list of groups symbols*** from the "generoup_symbol" as stored in the "Schema Node" whose name is "table_name".    
def get_groups_symbols(gene_symbol, table_name):
    """Return the symbols of every group the gene belongs to.

    For each group id returned by get_groups_ids, the symbols stored under
    "Schema:<table_name>_FB_group_symbol" are collected in order.

    Returns:
        A flat list of group symbols, or None when the gene symbol cannot be
        resolved.
    """
    gene_groups_ids = get_groups_ids(gene_symbol, table_name)
    if gene_groups_ids is None:
        return None
    gg_symbols = []
    for gg_id in gene_groups_ids:
        n1 = Node("Verbatim", gg_id)
        v1 = Variable("v1")
        s = Node("Schema", f"Schema:{table_name}_FB_group_symbol")
        q1 = Link("Execution", ordered=True, targets=[s, n1, v1])
        gg_symbols.extend(get_mappings(q1, "v1"))
    return gg_symbols

# 8
def get_gene_groups_symbols(gene_symbol):
    """Return the group symbols of ``gene_symbol`` from the "gene_group_data" table (see get_groups_symbols)."""
    return get_groups_symbols(gene_symbol, "gene_group_data")

# 9
def get_pathway_groups_symbols(gene_symbol):
    """Return the group symbols of ``gene_symbol`` from the "pathway_group_data" table (see get_groups_symbols)."""
    return get_groups_symbols(gene_symbol, "pathway_group_data")


# 10
# get group name(s) of gene designated by gene_symbol
def get_groups_names(gene_symbol, table_name):
    """Return the names of every group the gene belongs to.

    For each group id returned by get_groups_ids, the names stored under
    "Schema:<table_name>_FB_group_name" are collected in order.

    Returns:
        A flat list of group names, or None when the gene symbol cannot be
        resolved.
    """
    gene_groups_ids = get_groups_ids(gene_symbol, table_name)
    if gene_groups_ids is None:
        return None
    gg_names = []
    for gg_id in gene_groups_ids:
        n1 = Node("Verbatim", gg_id)
        v1 = Variable("v1")
        s = Node("Schema", f"Schema:{table_name}_FB_group_name")
        q1 = Link("Execution", ordered=True, targets=[s, n1, v1])
        gg_names.extend(get_mappings(q1, "v1"))
    return gg_names

# 11
def get_gene_groups_names(gene_symbol):
    """Return the group names of ``gene_symbol`` from the "gene_group_data" table (see get_groups_names)."""
    return get_groups_names(gene_symbol, "gene_group_data")

# 12
def get_pathway_groups_names(gene_symbol):
    """Return the group names of ``gene_symbol`` from the "pathway_group_data" table (see get_groups_names)."""
    return get_groups_names(gene_symbol, "pathway_group_data")

# 13
# In fact, a group could have more than one parent! E.g: FBgg0000275
def get_parent_groups_ids(group_id):
    """Return the ids of the parent groups of ``group_id``.

    Parents are the second targets of the ordered "Inheritance" links whose
    first target is the "Verbatim" node of ``group_id``. A group may have
    more than one parent.

    Returns:
        A list of parent ids (empty when the group has no parent).
    """
    child_node = Node("Verbatim", group_id)
    parent_slot = Variable("v1")
    inheritance_query = Link("Inheritance", ordered=True, targets=[child_node, parent_slot])
    parent_ids = get_mappings(inheritance_query, "v1")
    return parent_ids


# 14
def get_parent_groups_symbols(group_id, table_name):
    """Return the symbols of the parent groups of ``group_id``.

    For each parent id returned by get_parent_groups_ids, the values stored
    under "Schema:<table_name>_Parent_FB_group_symbol" are collected in order.

    Returns:
        A flat list of symbols (empty when there is no parent).
    """
    collected = []
    for parent_id in get_parent_groups_ids(group_id):
        parent_node = Node("Verbatim", parent_id)
        symbol_var = Variable("v1")
        schema_node = Node("Schema", f"Schema:{table_name}_Parent_FB_group_symbol")
        symbol_query = Link("Execution", ordered=True, targets=[schema_node, parent_node, symbol_var])
        collected += get_mappings(symbol_query, "v1")
    return collected

# 15    
# In fact, a group could have more than one parent! E.g: FBgg0000275
def get_parent_gene_groups_symbols(group_id):
    """Return the parent-group symbols of ``group_id`` from the "gene_group_data" table (see get_parent_groups_symbols)."""
    return get_parent_groups_symbols(group_id, "gene_group_data")        

# 16
# In fact, a group could have more than one parent! E.g: FBgg0000275
def get_parent_pathway_group_symbols(group_id):
    """Return the parent-group symbols of ``group_id`` from the "pathway_group_data" table (see get_parent_groups_symbols)."""
    return get_parent_groups_symbols(group_id, "pathway_group_data")  



# 17
# In fact, a group could have more than one parent. E.g: FBgg0000275
# In the precomputed tables there are only group PARENT id and symbol...
# So, the parent group names should be retrieved as the regular groups names.
def get_parent_groups_names(group_id, table_name):
    """Return the names of the parent groups of ``group_id``.

    Each parent id is looked up under "Schema:<table_name>_FB_group_name",
    the same schema used for regular group names.

    Returns:
        A flat list of names (empty when there is no parent).
    """
    collected = []
    for parent_id in get_parent_groups_ids(group_id):
        parent_node = Node("Verbatim", parent_id)
        name_var = Variable("v1")
        schema_node = Node("Schema", f"Schema:{table_name}_FB_group_name")
        name_query = Link("Execution", ordered=True, targets=[schema_node, parent_node, name_var])
        collected += get_mappings(name_query, "v1")
    return collected

# 18
# In fact, a group could have more than one parent! E.g: FBgg0000275
def get_parent_gene_groups_names(group_id):
    """Return the parent-group names of ``group_id`` from the "gene_group_data" table (see get_parent_groups_names)."""
    return get_parent_groups_names(group_id, "gene_group_data")

# 19
# In fact, a group could have more than one parent! E.g: FBgg0000275
def get_parent_pathway_groups_names(group_id):
    """Return the parent-group names of ``group_id`` from the "pathway_group_data" table (see get_parent_groups_names)."""
    return get_parent_groups_names(group_id, "pathway_group_data")



# 20
def get_groups_members(gene_symbol, table_name):
    """List the member genes of every group the given gene belongs to.

    For each group id of the gene, the member gene ids stored under
    "Schema:<table_name>_Group_member_FB_gene_id" are de-duplicated and
    sorted, and each id is translated to the first symbol stored under
    "Schema:<table_name>_Group_member_FB_gene_symbol" (square brackets
    rewritten as parentheses).

    Returns:
        A list with one tuple per group:
        ``(group id, group symbol, group name, member count, sorted member symbols)``,
        or None when the gene symbol cannot be resolved.

    Raises:
        IndexError: if a member id has no symbol, or a group has no
            symbol/name.
    """
    gene_groups_ids = get_groups_ids(gene_symbol, table_name)
    if gene_groups_ids is None:
        return None
    groups_members = []
    fb_id_schema_name = f"Schema:{table_name}_Group_member_FB_gene_id"
    gene_symbol_schema_name = f"Schema:{table_name}_Group_member_FB_gene_symbol"
    for gg_id in gene_groups_ids:
        member_symbols = []
        group_node = Node("Verbatim", gg_id)
        member_id_var = Variable("v1")
        # gets all gene ids (FBgn#) of group given by gg_id
        id_schema = Node("Schema", fb_id_schema_name)
        ids_query = Link("Execution", ordered=True, targets=[id_schema, group_node, member_id_var])
        gene_ids = sorted(set(get_mappings(ids_query, "v1")))
        for gene_id in gene_ids:
            member_node = Node("Verbatim", gene_id)
            member_symbol_var = Variable("v2")
            symbol_schema = Node("Schema", gene_symbol_schema_name)
            symbol_query = Link("Execution", ordered=True, targets=[symbol_schema, member_node, member_symbol_var])
            member_symbols.append(get_mappings(symbol_query, "v2")[0].replace("[", "(").replace("]", ")"))
        groups_members.append((
            gg_id,
            get_group_symbol(gg_id, table_name),
            get_group_name(gg_id, table_name),
            len(member_symbols),
            sorted(member_symbols),
        ))
    return groups_members

# 21    
# Returns list(s) of gene symbols that are members of the same group(s) of 
# the gene desinated by gene_symbol
def get_gene_groups_members(gene_symbol):
    """Return the members of the groups of ``gene_symbol`` from the "gene_group_data" table (see get_groups_members)."""
    return get_groups_members(gene_symbol, "gene_group_data")

# 22
# same as get_gene_group_members() for pathway groups
def get_pathway_groups_members(gene_symbol):
    """Return the members of the groups of ``gene_symbol`` from the "pathway_group_data" table (see get_groups_members)."""
    return get_groups_members(gene_symbol, "pathway_group_data")



# 23
"""
    Level zero holds the groups ids that the gene belongs to.
    Upper levels hold the hierarchy itself.
    An empty list signals that the hierarchy finished in the previous level (exception to zero
    that means there is no hierarchy at all).
"""
def get_group_hierarchy(gene_symbol, table_name):
    """Walk up the group hierarchy of a gene, level by level.

    Level zero holds the group ids the gene belongs to. For every group of
    a level, one ``(level + 1, parents)`` entry is appended; an empty
    ``parents`` list marks the end of that branch (at level zero an empty
    list means there is no hierarchy at all).

    Returns:
        A list of ``(level, group ids)`` tuples, or None when the gene
        symbol cannot be resolved.
    """
    gene_group_ids = get_groups_ids(gene_symbol, table_name)
    if gene_group_ids is None:
        return None
    level = 0
    group_hierarchy = [(level, gene_group_ids)]
    while gene_group_ids != []:
        next_level_ids = []
        for gg_id in gene_group_ids:
            parents = get_parent_groups_ids(gg_id)
            # Appended even when empty: an empty list marks the end of the branch.
            group_hierarchy.append((level + 1, parents))
            next_level_ids.extend(parents)
        level += 1
        gene_group_ids = next_level_ids
    return group_hierarchy

# 24
def get_gene_group_hierarchy(gene_symbol):
    """
    Return the gene-group hierarchy of the gene designated by gene_symbol.

    Level zero holds the groups ids that the gene belongs to.
    Upper levels hold the hierarchy itself.
    An empty list signals that the hierarchy finished in the previous level (exception to zero
    that means there is no hierarchy at all).

    Delegates to get_group_hierarchy() with the "gene_group_data" table and returns
    its result unchanged.
    """
    return get_group_hierarchy(gene_symbol, "gene_group_data")

# 25
def get_pathway_group_hierarchy(gene_symbol):
    """
    Return the pathway-group hierarchy of the gene designated by gene_symbol.

    Level zero holds the groups ids that the gene belongs to.
    Upper levels hold the hierarchy itself.
    An empty list signals that the hierarchy finished in the previous level (exception to zero
    that means there is no hierarchy at all).

    Delegates to get_group_hierarchy() with the "pathway_group_data" table and returns
    its result unchanged.
    """
    return get_group_hierarchy(gene_symbol, "pathway_group_data")


In [21]:
# Get the uniquename of every feature in the knowledge base, using the
# "Schema:feature_uniquename" schema node.
# v1 and v2 are the two free targets of the Execution link; the next cell reads v1 as the
# feature's key node and v2 as the node holding its uniquename.
v1 = Variable("v1")
v2 = Variable("v2")
s = Node("Schema", "Schema:feature_uniquename")
q1 = Link("Execution", ordered=True, targets=[s, v1, v2])
# NOTE(review): the meaning of the second argument (True) is defined by query(), which is not
# visible in this cell -- confirm against its definition.
assignments = query(q1, True)  # kept as a notebook global: the next cell iterates over it

6.781 seconds
True
736654 answers


In [9]:
# For every answer of the previous cell's query, look up the feature's name and print
# "uniquename (FB id) -------------> name" when the name is in a fixed watch list.
# Depends on the notebook global `assignments` bound by the previous cell.
# One query is issued per answer; the recorded run took about 1010 seconds in total.
clock = WallClock()
clock.start()
for assignment in assignments:
    # Uncomment the two lines below to stop after roughly 100 answers while debugging.
    #if clock.epochs > 100:
        #break    
    pkey_handle = assignment.mapping["v1"] # handle of the node bound to v1 (queried below as a "feature" node)
    unique_name_handle = assignment.mapping["v2"] # handle of a "Verbatim" node
    pkey = db.get_node_name(pkey_handle) # sequential integer used as PK in the DB table
    unique_name = db.get_node_name(unique_name_handle) # FB id of the gene
    # Second query: the name linked to this feature through "Schema:feature_name".
    v1 = Variable("v1")
    s = Node("Schema", "Schema:feature_name")
    q = Link("Execution", ordered=True, targets=[s, Node("feature", pkey), v1])
    assignment2 = query(q)
    assert len(assignment2) == 1 # There's only one link between the gene and its name
    name_handle = assignment2.pop().mapping['v1'] # handle of "Verbatim" node
    name = db.get_node_name(name_handle) # gene's name
    # Watch list of gene names to report.
    # NOTE(review): the recorded output has no line for "Su[var]205" and two lines for "AGO2" -- confirm this is expected.
    if name in ["Su[var]205", "Top3beta", "Mef2", "Clk", "Dref", "TfIIB", "Myc", "AGO2", "Nipped-B", "Cp190", "TfIIA-L",
               "Trl", "ash1", "Raf", "Abd-B", "Orc2", "Rbf", "mof", "msl-1", "Hmr"]:
        print(f"{unique_name} -------------> {name}")

    clock.epoch()  # one epoch per processed answer, used for the per-query average
clock.stop()
clock.print()  # prints the total wall time and the average time per epoch

FBgn0000015 -------------> Abd-B
FBgn0023076 -------------> Clk
FBgn0015799 -------------> Rbf
FBgn0026015 -------------> Top3beta
FBgn0005386 -------------> ash1
FBgn0013263 -------------> Trl
FBgn0026401 -------------> Nipped-B
FBgn0015664 -------------> Dref
FBgn0015270 -------------> Orc2
FBgn0001206 -------------> Hmr
FBgn0262656 -------------> Myc
FBgn0003079 -------------> Raf
FBgn0000283 -------------> Cp190
FBgn0046812 -------------> AGO2
FBgn0011289 -------------> TfIIA-L
FBgn0087035 -------------> AGO2
FBgn0004915 -------------> TfIIB
FBgn0005617 -------------> msl-1
FBgn0014340 -------------> mof
FBgn0011656 -------------> Mef2
1010.240 seconds (1.371 milliseconds per query)


In [19]:
#from hyperon_das_atomdb import UNORDERED_LINK_TYPES, WILDCARD

#from hyperon_das.api import DistributedAtomSpace
#from hyperon_das.exceptions import QueryParametersException
#from hyperon_das.pattern_matcher.pattern_matcher import And, Link, Variable
#import hyperon_das
import csv

class DASLoadingVerifier:
    '''
        Helpers to inspect the "Schema nodes" loaded into the DAS.

        A "Schema node" represents a column in a Flybase precomputed table and it links all
        data in that column. The name of a Schema node is composed of the "underscored"
        table name, an underscore, and the "underscored" column name. An "underscored name"
        is a name in which the blank spaces were replaced by underscores.
    '''

    # Hard-coded Schema node names (without the "Schema:" prefix) that
    # print_schema_nodes_data() inspects when no explicit list is given.
    SCHEMA_NAMES = [
        "ncRNA_genes_soTermId", "ncRNA_genes_primaryId", "ncRNA_genes_gene_symbol", "ncRNA_genes_gene_geneId", "ncRNA_genes_symbol",
        "ncRNA_genes_gene_locusTag", "ncRNA_genes_taxonId", "ncRNA_genes_synonyms_symbol2", "ncRNA_genes_synonyms_symbol1",
        "ncRNA_genes_cross_references_symbol2", "ncRNA_genes_cross_references_symbol1", "ncRNA_genes_related_sequences_sequenceId",
        "ncRNA_genes_related_sequences_relationship", "ncRNA_genes_related_sequences_primaryId", "ncRNA_genes_gene_synonyms_symbol2",
        "ncRNA_genes_gene_synonyms_symbol1", "ncRNA_genes_publications_publication", "ncRNA_genes_publications_primaryId",
        "ncRNA_genes_genome_locations_primaryId", "ncRNA_genes_genome_locations_endPosition", "ncRNA_genes_genome_locations_assembly",
        "ncRNA_genes_genome_locations_gca_accession", "ncRNA_genes_genome_locations_INSDC_accession", "ncRNA_genes_genome_locations_startPosition",
        "ncRNA_genes_genome_locations_strand", "ncRNA_genes_genome_locations_chromosome", "fbgn_fbtr_fbpp_expanded_gene_symbol",
        "fbgn_fbtr_fbpp_expanded_polypeptide_symbol", "fbgn_fbtr_fbpp_expanded_transcript_ID", "fbgn_fbtr_fbpp_expanded_transcript_symbol",
        "fbgn_fbtr_fbpp_expanded_transcript_type", "fbgn_fbtr_fbpp_expanded_annotation_ID", "fbgn_fbtr_fbpp_expanded_organism",
        "fbgn_fbtr_fbpp_expanded_polypeptide_ID", "fbgn_fbtr_fbpp_expanded_gene_fullname", "fbgn_fbtr_fbpp_expanded_gene_ID",
        "dmel_human_orthologs_disease_Dmel_gene_symbol", "dmel_human_orthologs_disease_Human_gene_HGNC_ID",
        "dmel_human_orthologs_disease_Human_gene_OMIM_ID", "dmel_human_orthologs_disease_Dmel_gene_ID",
        "dmel_human_orthologs_disease_Human_gene_symbol", "dmel_human_orthologs_disease_DIOPT_score",
        "dmel_human_orthologs_disease_OMIM_Phenotype_IDs[name]", "Dmel_enzyme_data_gene_symbol", "Dmel_enzyme_data_gene_EC_number(s)",
        "Dmel_enzyme_data_gene_group_EC_number(s)", "Dmel_enzyme_data_gene_name", "Dmel_enzyme_data_gene_group_name",
        "Dmel_enzyme_data_gene_group_GO_id(s)", "Dmel_enzyme_data_gene_id", "Dmel_enzyme_data_gene_group_id",
        "gene_groups_HGNC_FB_group_name", "gene_groups_HGNC_FB_group_symbol", "gene_groups_HGNC_FB_group_id",
        "gene_groups_HGNC_HGNC_family_ID", "best_gene_summary_Summary", "best_gene_summary_Summary_Source",
        "best_gene_summary_Gene_Symbol", "best_gene_summary_FBgn_ID", "gene_map_table_cytogenetic_loc",
        "gene_map_table_organism_abbreviation", "gene_map_table_recombination_loc", "gene_map_table_primary_FBid",
        "gene_map_table_current_symbol", "gene_map_table_sequence_loc", "pathway_group_data_Parent_FB_group_symbol",
        "pathway_group_data_Parent_FB_group_id", "pathway_group_data_Group_member_FB_gene_id", "pathway_group_data_FB_group_name",
        "pathway_group_data_Group_member_FB_gene_symbol", "pathway_group_data_FB_group_symbol", "pathway_group_data_FB_group_id",
        "genotype_phenotype_data_qualifier_ids", "genotype_phenotype_data_genotype_symbols", "genotype_phenotype_data_qualifier_names",
        "genotype_phenotype_data_phenotype_id", "genotype_phenotype_data_genotype_FBids", "genotype_phenotype_data_phenotype_name",
        "genotype_phenotype_data_reference", "gene_group_data_Parent_FB_group_symbol", "gene_group_data_Parent_FB_group_id",
        "gene_group_data_Group_member_FB_gene_id", "gene_group_data_FB_group_name", "gene_group_data_Group_member_FB_gene_symbol",
        "gene_group_data_FB_group_symbol", "gene_group_data_FB_group_id", "dmel_unique_protein_isoforms_representative_protein",
        "dmel_unique_protein_isoforms_FBgn", "dmel_unique_protein_isoforms_identical_protein(s)", "dmel_unique_protein_isoforms_FB_gene_symbol",
        "dmel_gene_sequence_ontology_annotations_so_term_id", "dmel_gene_sequence_ontology_annotations_gene_symbol",
        "dmel_gene_sequence_ontology_annotations_gene_primary_id", "scRNA-Seq_gene_expression_Source_Tissue_Sex",
        "scRNA-Seq_gene_expression_Clustering_Analysis_Name", "scRNA-Seq_gene_expression_Cluster_Cell_Type_Name",
        "scRNA-Seq_gene_expression_Pub_miniref", "scRNA-Seq_gene_expression_Source_Tissue_Stage", "scRNA-Seq_gene_expression_Clustering_Analysis_ID",
        "scRNA-Seq_gene_expression_Gene_Symbol", "scRNA-Seq_gene_expression_Spread", "scRNA-Seq_gene_expression_Pub_ID",
        "scRNA-Seq_gene_expression_Gene_ID", "scRNA-Seq_gene_expression_Source_Tissue_Anatomy", "scRNA-Seq_gene_expression_Mean_Expression",
        "scRNA-Seq_gene_expression_Cluster_ID", "scRNA-Seq_gene_expression_Cluster_Name", "scRNA-Seq_gene_expression_Cluster_Cell_Type_ID",
        "gene_genetic_interactions_Interaction_type", "gene_genetic_interactions_Interacting_gene(s)_symbol",
        "gene_genetic_interactions_Interacting_gene(s)_FBgn", "gene_genetic_interactions_Starting_gene(s)_FBgn",
        "gene_genetic_interactions_Publication_FBrf", "gene_genetic_interactions_Starting_gene(s)_symbol",
        "gene_rpkm_report_Bin_value", "gene_rpkm_report_Parent_library_FBlc#", "gene_rpkm_report_Total_exon_base_count",
        "gene_rpkm_report_Parent_library_name", "gene_rpkm_report_RNASource_name", "gene_rpkm_report_FBgn#",
        "gene_rpkm_report_RNASource_FBlc#", "gene_rpkm_report_GeneSymbol", "gene_rpkm_report_Count_used",
        "gene_rpkm_report_Unique_exon_base_count", "gene_rpkm_report_Release_ID", "gene_rpkm_report_RPKM_value",
        "physical_interactions_mitab_Interaction_Identifier(s)", "physical_interactions_mitab_Interaction_Type(s)",
        "physical_interactions_mitab_Annotation(s)_Interactor_B", "physical_interactions_mitab_Publication_1st_Author(s)",
        "physical_interactions_mitab_Alias(es)_Interactor_B", "physical_interactions_mitab_Alt_ID(s)_Interactor_A",
        "physical_interactions_mitab_Alt_ID(s)_Interactor_B", "physical_interactions_mitab_Interaction_Xref(s)",
        "physical_interactions_mitab_Experimental_Role(s)_Interactor_B", "physical_interactions_mitab_ID(s)_Interactor_A",
        "physical_interactions_mitab_Type(s)_Interactor_A", "physical_interactions_mitab_Biological_Role(s)_Interactor_A",
        "physical_interactions_mitab_Publication_ID(s)", "physical_interactions_mitab_ID(s)_Interactor_B",
        "physical_interactions_mitab_Experimental_Role(s)_Interactor_A", "physical_interactions_mitab_Type(s)_Interactor_B",
        "physical_interactions_mitab_Xref(s)_Interactor_B", "physical_interactions_mitab_Host_Organism(s)",
        "physical_interactions_mitab_Biological_Role(s)_Interactor_B", "physical_interactions_mitab_Interaction_Detection_Method(s)",
        "physical_interactions_mitab_Annotation(s)_Interactor_A", "physical_interactions_mitab_Source_Database(s)",
        "physical_interactions_mitab_Taxid_Interactor_B", "physical_interactions_mitab_Taxid_Interactor_A",
        "physical_interactions_mitab_Xref(s)_Interactor_A", "physical_interactions_mitab_Interaction_Annotation(s)",
        "physical_interactions_mitab_Alias(es)_Interactor_A", "fb_synonym_symbol_synonym(s)", "fb_synonym_organism_abbreviation",
        "fb_synonym_fullname_synonym(s)", "fb_synonym_primary_FBid", "fb_synonym_current_symbol", "fb_synonym_current_fullname",
        "allele_genetic_interactions_allele_symbol", "allele_genetic_interactions_interaction", "allele_genetic_interactions_FBrf#",
        "allele_genetic_interactions_allele_FBal#", "fbrf_pmid_pmcid_doi_DOI", "fbrf_pmid_pmcid_doi_PMCID",
        "fbrf_pmid_pmcid_doi_pub_type", "fbrf_pmid_pmcid_doi_miniref", "fbrf_pmid_pmcid_doi_FBrf",
        "fbrf_pmid_pmcid_doi_pmid_added", "fbrf_pmid_pmcid_doi_PMID", "disease_model_annotations_Based_on_orthology_with_(HGNC_ID)",
        "disease_model_annotations_Evidence/interacting_alleles", "disease_model_annotations_Based_on_orthology_with_(symbol)",
        "disease_model_annotations_DO_qualifier", "disease_model_annotations_DO_ID", "disease_model_annotations_Gene_symbol",
        "disease_model_annotations_Allele_used_in_model_(FBal_ID)", "disease_model_annotations_Reference_(FBrf_ID)",
        "disease_model_annotations_HGNC_ID", "disease_model_annotations_FBgn_ID", "disease_model_annotations_Allele_used_in_model_(symbol)",
        "fbal_to_fbgn_AlleleID", "fbal_to_fbgn_GeneSymbol", "fbal_to_fbgn_AlleleSymbol", "fbal_to_fbgn_GeneID",
        "gene_association_DB_Object_Name", "gene_association_With_(or)_From", "gene_association_GO_ID",
        "gene_association_DB_Object_Type", "gene_association_DB_Object_Symbol", "gene_association_Evidence",
        "gene_association_DB_Object_Synonym", "gene_association_DB_Object_ID", "gene_association_DB:Reference",
        "gene_association_DB", "gene_association_Qualifier", "gene_association_Assigned_by",
        "featureprop_value",
        "featureprop_rank",
        "pubprop_value",
        "pubprop_rank",
        "allele_symbol",
        "allele_is_alleleof",
        "allele_is_construct",
        "allele_propagate_transgenic_uses",
        "allele_gene_is_regulatory_region",
        "allele_stocks_count",
        "allele_pub_count",
        "allele_known_lesion",
        "allele_has_image",
        "feature_name",
        "feature_uniquename",
        "feature_residues",
        "feature_seqlen",
        "grp_name",
        "grp_uniquename",
        "grp_synonym_grp_id",
        "library_name",
        "library_uniquename",
        "feature_synonym_feature_id",
        "feature_is_analysis",
        "feature_is_obsolete",
        "featureloc_fmin",
        "featureloc_is_fmin_partial",
        "featureloc_fmax",
        "featureloc_is_fmax_partial",
        "featureloc_strand",
        "featureloc_phase",
        "featureloc_residue_info",
        "featureloc_locgroup",
        "featureloc_rank",
        "gene_name",
        "gene_uniquename",
        "gene_residues",
        "gene_seqlen",
        "gene_md5checksum",
        "gene_is_analysis",
        "gene_is_obsolete",
        "cvterm_definition",
        "cvterm_is_obsolete",
        "cvterm_is_relationshiptype",
        "dbxref_accession",
        "dbxref_version",
        "dbxref_description",
        "dbxref_url",
        "organism_abbreviation",
        "organism_genus",
        "organism_species",
        "organism_common_name",
        "organism_comment",
        "pub_title",
        "pub_volumetitle",
        "pub_volume",
        "pub_series_name",
        "pub_issue",
        "pub_pyear",
        "pub_pages",
        "pub_miniref",
        "pub_is_obsolete",
        "pub_publisher",
        "pub_pubplace",
        "synonym_name",
        "synonym_synonym_sgml"
    ]

    def read_precomputed_table_columns(self, file_absolute_name):
        '''
        Generates a list like this:
        table_strings = [
            "organism_abbreviation",
            "synonym_name",
            "gene_group_data_FB_group_id",
            "gene_group_data_FB_group_symbol" ,
            "gene_group_data_FB_group_name"
        ]

        These are the names of "Schema nodes" in DAS lingo: the "underscored" table name,
        an underscore, and the "underscored" column name.

        The file is read as tab-separated rows: the first field of a row is the table
        name and the remaining fields are column names. Within a row, repeated column
        names are reported once, in first-seen order. Blank lines and empty (or
        whitespace-only) column fields are skipped.

        Parameter:  file_absolute_name
                    Look the "precomputed" directory for the "essential_pairs.txt" file. It should
                    be passed to this function.

        Returns:    list of Schema node names (str).
        '''
        schema_nodes_list = []
        # newline='' lets the csv module do its own line-ending handling, as its docs recommend.
        with open(file_absolute_name, 'r', newline='') as file:
            for row in csv.reader(file, delimiter='\t'):
                if not row:
                    continue  # csv.reader yields [] for a blank line
                table_name = row[0].strip().replace(' ', '_')
                # Normalize first, then remove duplicates; dict.fromkeys keeps first-seen
                # order, so the result is deterministic (a set would not be).
                column_names = dict.fromkeys(
                    column.strip().replace(' ', '_') for column in row[1:]
                )
                for column_name in column_names:
                    if column_name != '':
                        schema_nodes_list.append(table_name + "_" + column_name)

        return schema_nodes_list


    def print_schema_nodes_data(self, file_absolute_name, print_full_data=False, schema_names=None):
        '''
        For each Schema node name, query the Execution links of that Schema node and
        print how many answers were found.

        Parameters:
            file_absolute_name: currently NOT read; the names come from schema_names or
                                from SCHEMA_NAMES. Kept so existing calls keep working.
            print_full_data:    when True, also print one numbered line per answer with
                                the name of the node bound to v2.
            schema_names:       optional iterable of Schema node names (without the
                                "Schema:" prefix). Defaults to SCHEMA_NAMES.
        '''
        if schema_names is None:
            schema_names = self.SCHEMA_NAMES

        for column in schema_names:
            v1 = Variable("v1")
            v2 = Variable("v2")
            s = Node("Schema", "Schema:" + column)
            # Links the schema node s to v1 and v2: v1 = primary key, v2 = column value.
            q1 = Link("Execution", ordered=True, targets=[s, v1, v2])
            assignments = query(q1, True)

            print(f"Number of relationships in column {column}: {len(assignments)}")
            if print_full_data:
                for cont, assignment in enumerate(assignments):
                    un_handle = assignment.mapping["v2"]
                    unique_name = db.get_node_name(un_handle)
                    print(str(cont) + ": " + unique_name)
            print("FINISHED for column: " + column + "\n")


    def print_schema_nodes_names(self, file_absolute_name, print_full_data=False):
        '''
        Print the Schema node names read from file_absolute_name as one line of
        Python source: schema_nodes_names = ["name1","name2",...]

        An input that yields no names prints an empty, properly closed list.

        Parameters:
            file_absolute_name: file passed to read_precomputed_table_columns().
            print_full_data:    unused; kept so existing calls keep working.
        '''
        schema_names = self.read_precomputed_table_columns(file_absolute_name)
        quoted_names = ",".join(f'"{name}"' for name in schema_names)
        print("schema_nodes_names = [" + quoted_names + "]")

In [20]:

# Run the loading verification: print, for each Schema node, the number of relationships found.
verifyer = DASLoadingVerifier()
# NOTE: the first path is immediately overwritten by the second assignment, so only the
# second path takes effect. Both are machine-specific absolute paths.
file_absolute_name = "/home/saulo/snet/hyperon/das/das/flybase2metta/fb_data/2023_05/precomputed/essential_pairs.txt"
file_absolute_name = "/mnt/hdd_2/saulo/snet/hyperon/das/data/flybase/input/2023_05/precomputed/essential_pairs.txt"
# NOTE(review): print_schema_nodes_data() takes its column names from a hard-coded list and
# does not read this file, so the path has no effect on the call below.
# Alternative: also print every answer of every column (very verbose).
#verifyer.print_schema_nodes_data(file_absolute_name, print_full_data=True)
verifyer.print_schema_nodes_data(file_absolute_name, print_full_data=False)
# Alternative: print the Schema node names derived from the file as a Python list literal.
#verifyer.print_schema_nodes_names(file_absolute_name, print_full_data=False)

54 milliseconds
True
8359 answers
Number of relationships in column ncRNA_genes_soTermId: 8359
FINISHED for column: ncRNA_genes_soTermId

121 milliseconds
True
18144 answers
Number of relationships in column ncRNA_genes_primaryId: 18144
FINISHED for column: ncRNA_genes_primaryId

21 milliseconds
True
3547 answers
Number of relationships in column ncRNA_genes_gene_symbol: 3547
FINISHED for column: ncRNA_genes_gene_symbol

114 milliseconds
True
18986 answers
Number of relationships in column ncRNA_genes_gene_geneId: 18986
FINISHED for column: ncRNA_genes_gene_geneId

27 milliseconds
True
4536 answers
Number of relationships in column ncRNA_genes_symbol: 4536
FINISHED for column: ncRNA_genes_symbol

21 milliseconds
True
3547 answers
Number of relationships in column ncRNA_genes_gene_locusTag: 3547
FINISHED for column: ncRNA_genes_gene_locusTag

47 milliseconds
True
8097 answers
Number of relationships in column ncRNA_genes_taxonId: 8097
FINISHED for column: ncRNA_genes_taxonId

55 millise

In [10]:
# Get the FB id and group name(s) of the gene designated by gene_symbol.
# NOTE: this DOESN'T GET CORRECT RESULTS in "essential pairing" mode, because
# there is no link from FB_group_name to Group_member_FB_gene_id.
def get_gene_group_names(gene_symbol):
    """
    Return a (fb_id, group_names) tuple for the gene designated by gene_symbol.

    fb_id comes from get_feature_fb_id(); group_names is whatever get_mappings()
    returns for variable "v1" of an Execution link from the
    "Schema:gene_group_data_FB_group_name" schema node and the gene's Verbatim node.
    """
    fb_id = get_feature_fb_id(gene_symbol)
    group_name_query = Link(
        "Execution",
        ordered=True,
        targets=[
            Node("Schema", "Schema:gene_group_data_FB_group_name"),
            Node("Verbatim", fb_id),
            Variable("v1"),
        ],
    )
    return fb_id, get_mappings(group_name_query, "v1")
    
# Show the get_gene_group_names() result for a few sample gene symbols.
for _sample_symbol in ("Mef2", "Clk", "Myc", "Abd-B"):
    print(f"{_sample_symbol}: {get_gene_group_names(_sample_symbol)}")

Mef2: ('FBgn0011656', [])
Clk: ('FBgn0023076', [])
Myc: ('FBgn0262656', [])
Abd-B: ('FBgn0000015', [])


In [29]:
"""
This is an expanded version of the above, tailored to performing queries using only the "essential pairing" of columns.

In fact, the three functions share the same code structure, so they could be merged into a single function by adding a
"schema" parameter (and changing identifiers, of course).
"""
# Get the FB id and the group id(s) of the gene designated by gene_symbol.
def get_gene_group_ids(gene_symbol):
    """
    Return a (fb_id, group_ids) tuple for the gene designated by gene_symbol.

    fb_id comes from get_feature_fb_id(); group_ids is whatever get_mappings()
    returns for variable "v1" of an Execution link from the
    "Schema:gene_group_data_FB_group_id" schema node and the gene's Verbatim node.
    """
    fb_id = get_feature_fb_id(gene_symbol)
    group_id_query = Link(
        "Execution",
        ordered=True,
        targets=[
            Node("Schema", "Schema:gene_group_data_FB_group_id"),
            Node("Verbatim", fb_id),
            Variable("v1"),
        ],
    )
    return fb_id, get_mappings(group_id_query, "v1")


def get_gene_group_symbols(gene_symbol):
    """
    Return a (fb_id, group_symbols) tuple for the gene designated by gene_symbol.

    The group ids are obtained from get_gene_group_ids(); for each of them, the
    "v1" mappings of an Execution link from the
    "Schema:gene_group_data_FB_group_symbol" schema node and the group's Verbatim
    node are collected, in group-id order, into one flat list.
    """
    gene_fb_id, group_ids = get_gene_group_ids(gene_symbol)

    collected_symbols = []
    for group_id in group_ids:
        symbol_query = Link(
            "Execution",
            ordered=True,
            targets=[
                Node("Schema", "Schema:gene_group_data_FB_group_symbol"),
                Node("Verbatim", group_id),
                Variable("v1"),
            ],
        )
        collected_symbols.extend(get_mappings(symbol_query, "v1"))
    return gene_fb_id, collected_symbols


# Get the FB id and group name(s) of the gene designated by gene_symbol,
# going through the gene's group ids.
def get_gene_group_names(gene_symbol):
    """
    Return a (fb_id, group_names) tuple for the gene designated by gene_symbol.

    The group ids are obtained from get_gene_group_ids(); for each of them, the
    "v1" mappings of an Execution link from the
    "Schema:gene_group_data_FB_group_name" schema node and the group's Verbatim
    node are collected, in group-id order, into one flat list.
    """
    gene_fb_id, group_ids = get_gene_group_ids(gene_symbol)

    collected_names = []
    for group_id in group_ids:
        name_query = Link(
            "Execution",
            ordered=True,
            targets=[
                Node("Schema", "Schema:gene_group_data_FB_group_name"),
                Node("Verbatim", group_id),
                Variable("v1"),
            ],
        )
        collected_names.extend(get_mappings(name_query, "v1"))
    return gene_fb_id, collected_names


# Gene symbols to report on. Short list for quick checks: ["Mef2", "Clk", "Myc", "Abd-B"]
symbols_list = [
    "Su(var)205", "Top3beta", "Mef2", "Clk", "Dref", "TfIIB", "Myc", "AGO2", "Nipped-B", "Cp190",
    "TfIIA-L", "Trl", "ash1", "Raf", "Abd-B", "Orc2", "Rbf", "mof", "msl-1", "Hmr",
]
# Parentheses in a symbol are replaced by square brackets before it is looked up.
_round_to_square = str.maketrans("()", "[]")
for gene_symbol in symbols_list:
    gene_symbol = gene_symbol.translate(_round_to_square)
    # Print group ids, then group symbols, then group names for this gene.
    for _lookup in (get_gene_group_ids, get_gene_group_symbols, get_gene_group_names):
        print(f"{gene_symbol}: {_lookup(gene_symbol)}")


Su[var]205: ('FBgn0003607', ['FBgg0000213'])
Su[var]205: ('FBgn0003607', ['HP1'])
Su[var]205: ('FBgn0003607', ['HETEROCHROMATIN PROTEIN 1 FAMILY'])
Top3beta: ('FBgn0026015', ['FBgg0001554'])
Top3beta: ('FBgn0026015', ['TOPI'])
Top3beta: ('FBgn0026015', ['DNA TOPOISOMERASES TYPE I'])
Mef2: ('FBgn0011656', ['FBgg0000753'])
Mef2: ('FBgn0011656', ['MADSTF'])
Mef2: ('FBgn0011656', ['MADS-BOX TRANSCRIPTION FACTORS'])
Clk: ('FBgn0023076', ['FBgg0000727'])
Clk: ('FBgn0023076', ['BHLH'])
Clk: ('FBgn0023076', ['BASIC HELIX-LOOP-HELIX TRANSCRIPTION FACTORS'])
Dref: ('FBgn0015664', ['FBgg0000732'])
Dref: ('FBgn0015664', ['ZF-C2H2'])
Dref: ('FBgn0015664', ['C2H2 ZINC FINGER TRANSCRIPTION FACTORS'])
TfIIB: ('FBgn0004915', ['FBgg0000346', 'FBgg0001204'])
TfIIB: ('FBgn0004915', ['TFIIB', 'ACTT-U'])
TfIIB: ('FBgn0004915', ['TRANSCRIPTION FACTOR II B', 'UNCLASSIFIED ACETYLTRANSFERASES'])
Myc: ('FBgn0262656', ['FBgg0000727'])
Myc: ('FBgn0262656', ['BHLH'])
Myc: ('FBgn0262656', ['BASIC HELIX-LOOP-HELIX TR