In [1]:
from das.distributed_atom_space import DistributedAtomSpace, QueryOutputFormat
from das.database.db_interface import UNORDERED_LINK_TYPES
from das.pattern_matcher.pattern_matcher import PatternMatchingAnswer, OrderedAssignment, UnorderedAssignment, CompositeAssignment, Node, Link, Variable, Not, And, Or, TypedVariable, LinkTemplate
from das.database.db_interface import WILDCARD
from das.expression_hasher import ExpressionHasher
import warnings
import numpy as np
from itertools import combinations
warnings.filterwarnings('ignore')
TARGET_NODES = None
das = DistributedAtomSpace()
db = das.db
das.count_atoms()

Log initialized. Log file: /tmp/das.log


(1702797, 27871440)

In [2]:
def get_gene_node(name):
    verbatim_node = das.get_node("Verbatim", name)
    schema_node = das.get_node("Schema", "Schema:sql_gene_name")
    print(f"verbatim_node = {verbatim_node}")
    print(f"schema_node = {schema_node}")
    v1 = Variable("v1")
    links = das.get_links("Execution", None, [schema_node, WILDCARD, verbatim_node])
    print(f"links = {links}")
    link = das.get_atom(links[0], output_format=QueryOutputFormat.ATOM_INFO)
    print(f"link = {link}")
    gene_node_handle = link["targets"][1]
    print(f"gene_node_handle = {gene_node_handle}")
    gene_node = das.get_atom(gene_node_handle, output_format=QueryOutputFormat.ATOM_INFO)
    print(f"gene_node = {gene_node}")
    return Node("gene", gene_node["name"])

print(das.get_node("gene", "3106709"))

80ef77c79ab33f7a7e5d3070a09ded02


In [7]:
GENE_LIST = [
    "mus101"
]

In [8]:
USE_SUBSTRING = False

if USE_SUBSTRING:
    TARGET_TYPE = "Concept"
    TARGET_SUBSTRING = "gl"
else:
    TARGET_NODES = [
        get_gene_node(gene) for gene in GENE_LIST
    ]

NGRAM = 3
SUPPORT = 0
HALO_LENGTH = 2
DEPTH_WEIGTH = [1, 1]
ISURPRISINGNESS_REPORT_THRESHOLD = 0
EPOCHS = 1000
NORMALIZED_ISURPRISINGNESS = False

verbatim_node = 9a65a498651b799372edaf3ad3f639c0
schema_node = 7494788453289a15a95f81a916c9cc21
links = ['49c763399ab9dcae90a10377fb73e65f']
link = {'handle': '49c763399ab9dcae90a10377fb73e65f', 'type': 'Execution', 'template': ['Execution', 'Schema', 'gene', 'Verbatim'], 'targets': ['7494788453289a15a95f81a916c9cc21', '80ef77c79ab33f7a7e5d3070a09ded02', '9a65a498651b799372edaf3ad3f639c0']}
gene_node_handle = 80ef77c79ab33f7a7e5d3070a09ded02
gene_node = {}


KeyError: 'name'

In [9]:
assert len(DEPTH_WEIGTH) == HALO_LENGTH
halo_levels = [i for i in range(HALO_LENGTH)]
if TARGET_NODES is None:
    atomspace_nodes = db.get_matched_node_name(TARGET_TYPE, TARGET_SUBSTRING)
    print(atomspace_nodes)
    TARGET_NODES = [Node(TARGET_TYPE, db.get_node_name(h)) for h in atomspace_nodes]
print(f"TARGET_NODES = {TARGET_NODES}")

NameError: name 'DEPTH_WEIGTH' is not defined

In [None]:
def print_ordered_assignment(assignment):
    if assignment is not None:
        for key, value in assignment.mapping.items():
            print(f"{key}: {db.get_node_name(value)}")

def print_unordered_assignment(assignment):
    if assignment is not None:
        symbols = []
        for key in assignment.symbols:
            for i in range(assignment.symbols[key]):
                symbols.append(key)
        values = []
        for key in assignment.values:
            for i in range(assignment.values[key]):
                values.append(key)
        mapping_keys = []
        mapping_values = []
        for symbol, value in zip(symbols, values):
            mapping_keys.append(symbol)
            mapping_values.append(db.get_node_name(value))
        print(f"{mapping_keys} = {mapping_values}")
        
def build_pattern_from_template(template):
    targets = []
    count_variables = 1
    for target in template[1:]:
        if target == WILDCARD:
            targets.append(Variable(f"V{count_variables}"))
            count_variables += 1
        else:
            #node_document = das.get_atom(target, output_format=QueryOutputFormat.ATOM_INFO)
            try:
                node_type = das.get_node_type(target)
                node_name = das.get_node_name(target)
                targets.append(Node(node_type, node_name))
            except:
                return None
    return Link(template[0], ordered=(template[0] not in UNORDERED_LINK_TYPES), targets=targets)

def _random_selection(v):
    return v[np.random.randint(len(v))]

def random_selection(v, n=1):
    if n == 1:
        return _random_selection(v)
    assert n <= (len(v) / 2)
    a = v.copy()
    selected = []
    for i in range(n):
        s = _random_selection(a)
        a.remove(s)
        selected.append(s)
    return selected

def build_roulette(w):
    answer = []
    s = sum(w)
    acc = 0
    for v in w:
        acc += v / s
        answer.append(acc)
    answer[-1] = 1
    return answer

def roulette_selection(v, w):
    assert len(v) == len(w)
    random = np.random.random()
    for i in range(len(v)):
        if random <= w[i]:
            return v[i]
    
def compute_count(logical_expression):
    query_answer = PatternMatchingAnswer()
    matched = logical_expression.matched(db, query_answer)
    return len(query_answer.assignments) if matched else 0
        
def prob(count):
    return count / universe_size

def compute_isurprisingness(count, terms, term_handles, counts, normalized = False):
    n = len(term_handles)
    if n == 2:
        subset_probs = [prob(counts[0]) * prob(counts[1])]
    elif n == 3:
        subset_probs = [
            prob(counts[0]) * prob(counts[1]) * prob(counts[2]),
            prob(compute_count(And([terms[0], terms[1]]))) * prob(counts[2]), 
            prob(compute_count(And([terms[0], terms[2]]))) * prob(counts[1]),
            prob(compute_count(And([terms[1], terms[2]]))) * prob(counts[0])
        ]
    elif n == 4:
        subset_probs = [
            prob(counts[0]) * prob(counts[1]) * prob(counts[2]) * prob(counts[3]),
            prob(compute_count(And([terms[0], terms[1]]))) * prob(compute_count(And([terms[2], terms[3]]))),
            prob(compute_count(And([terms[0], terms[2]]))) * prob(compute_count(And([terms[1], terms[3]]))),
            prob(compute_count(And([terms[0], terms[3]]))) * prob(compute_count(And([terms[1], terms[2]]))),
            prob(compute_count(And([terms[0], terms[1], terms[2]]))) * prob(counts[3]),
            prob(compute_count(And([terms[0], terms[1], terms[3]]))) * prob(counts[2]),
            prob(compute_count(And([terms[0], terms[2], terms[3]]))) * prob(counts[1]),
            prob(compute_count(And([terms[1], terms[2], terms[3]]))) * prob(counts[0])
        ]
    else:
        raise NotImplementedError()
    p = prob(count)
    isurprisingness = max([p - max(subset_probs), min(subset_probs) - p])
    if normalized:
        return isurprisingness / p
    else:
        return isurprisingness
    
def build_patterns(links):
    pattern = {}
    pattern_count = {}
    link_count = 0
    for link in links:
        link_count += 1
        if link_count % 100000 == 0 or link_count == 1 or link_count == len(links):
            print(f"link {link_count}/{len(links)}")
        #link_document = das.get_atom(link, output_format=QueryOutputFormat.ATOM_INFO)
        #targets = link_document['targets']
        #link_type = link_document['type']
        targets = das.get_link_targets(link)
        link_type = das.get_link_type(link)
        arity = len(targets)
        if arity == 2:
            templates = [
                [link_type, WILDCARD, targets[1]],
                [link_type, targets[0], WILDCARD],
                #[link_type, WILDCARD, WILDCARD],
            ]
        elif arity == 3:
            templates = [
                [link_type, WILDCARD, targets[1], targets[2]],
                [link_type, targets[0], WILDCARD, targets[2]],
                [link_type, targets[0], targets[1], WILDCARD],
                [link_type, WILDCARD, WILDCARD, targets[2]],
                [link_type, WILDCARD, targets[1], WILDCARD],
                [link_type, targets[0], WILDCARD, WILDCARD],
                #[link_type, WILDCARD, WILDCARD, WILDCARD],
            ]
        else:
            raise NotImplementedError()
        for template in templates:
            p = build_pattern_from_template(template)
            if p is not None:
                template_handle = ExpressionHasher.composite_hash(template)
                pattern[template_handle] = p
                pattern_count[template_handle] = len(das.get_links(template[0], None, template[1:]))
    return tuple([pattern, pattern_count])
        
def build_composite_pattern(terms):
    assert len(terms) > 1
    for i in range(len(terms)):
        if i == 0:
            first_term = terms[i]
        else:
            second_term = terms[i]
            composite_pattern = And([first_term, second_term])
            first_term = composite_pattern
    return composite_pattern
    
def print_query(pattern):
    print(pattern)
    query_answer = PatternMatchingAnswer()
    pattern.matched(db, query_answer)
    for assignment in query_answer.assignments:
        if type(assignment) is OrderedAssignment:
            print_ordered_assignment(assignment)
        elif type(assignment) is UnorderedAssignment:
            print_unordered_assignment(assignment)
        elif type(assignment) is CompositeAssignment:
            print_ordered_assignment(assignment.ordered_mapping)
            for unordered_assignment in assignment.unordered_mappings:
                print_unordered_assignment(unordered_assignment)
        print("")
        
halo_level_roulette = build_roulette(DEPTH_WEIGTH)

In [None]:
node_handle_list = set([ExpressionHasher.terminal_hash(n.atom_type, n.name) for n in TARGET_NODES])
print(f"node_handle_list = {node_handle_list}")
links = [set() for i in range(HALO_LENGTH)]
for level in range(HALO_LENGTH):
    new_level_node_handles = set()
    node_handle_count = 0
    for node_handle in node_handle_list:
        node_handle_count += 1
        #print(f"===========================================")
        print(f"Halo level {level+1}/{HALO_LENGTH} node_handle {node_handle_count}/{len(node_handle_list)}")
        template_list = [
            [node_handle, WILDCARD], 
            [WILDCARD, node_handle], 
            [node_handle, WILDCARD, WILDCARD], 
            [WILDCARD, node_handle, WILDCARD], 
            [WILDCARD, WILDCARD, node_handle]
        ]
        for template in template_list:
            #print(f"template = {template}")
            link_list = set(das.get_links(None, None, template))
            #print(f"len(link_list) = {len(link_list)}")
            for link in link_list:
                #link_document = das.get_atom(link, output_format=QueryOutputFormat.ATOM_INFO)
                for h in das.get_link_targets(link):
                    new_level_node_handles.add(h)
            links[level].update(link_list)
    node_handle_list.update(new_level_node_handles)
for level in range(HALO_LENGTH):
    if level == 0:
        all_links = set([link for link in links[level]])
    else:
        links[level] = links[level].difference(all_links)
        all_links.update(links[level])
universe_size = len(all_links)
print(f"===========================================")
print(f"Done - universe_size = {universe_size}")
print(f"===========================================")

In [None]:
#print(node_handle_list)
#print(links)

In [None]:
total = 0
for level in range(HALO_LENGTH):
    total += len(links[level])
    print(len(links[level]))
print("----------")
print(total)
#links

In [None]:
pattern = [None for i in range(HALO_LENGTH)]
pattern_count = [None for i in range(HALO_LENGTH)]
pattern_handles = [None for i in range(HALO_LENGTH)]
all_patterns = {}
all_patterns_count = {}
for level in range(HALO_LENGTH):
    print(f"###########################################")
    print(f"Building patterns for level {level}")
    pattern[level], pattern_count[level] = build_patterns(links[level])
    pattern_handles[level] = [key for key in pattern[level].keys()]
    for key, value in pattern[level].items():
        all_patterns[key] = value
    for key, value in pattern_count[level].items():
        all_patterns_count[key] = value
print(f"===========================================")
print(f"Done - len(all_patterns) = {len(all_patterns)}")
print(f"===========================================")

In [None]:
total = 0
for level in range(HALO_LENGTH):
    total += len(pattern_handles[level])
    print(len(pattern_handles[level]))
print("----------")
print(total)
#pattern_handles

In [None]:
higher_isurprisingness = 0
best_pattern = None
for i in range(EPOCHS):
    if i % 1000 == 0 or i == EPOCHS - 1:
        print(f"Epoch {i + 1}/{EPOCHS}")
    selected_handle = random_selection(pattern_handles[0])
    term_handles = [tuple([selected_handle, 0])]
    terms = [pattern[0][selected_handle]]
    counts = [pattern_count[0][selected_handle]]
    for i in range(NGRAM - 1):
        while True:
            selected_level = roulette_selection(halo_levels, halo_level_roulette)
            selected_handle = random_selection(pattern_handles[selected_level])
            if tuple([selected_handle, selected_level]) not in term_handles:
                break
        term_handles.append(tuple([selected_handle, selected_level]))
        terms.append(pattern[selected_level][selected_handle])
        counts.append(pattern_count[selected_level][selected_handle])
    composite_pattern = build_composite_pattern(terms)
    count = compute_count(composite_pattern)
    if count > 0:
        print(f"Count: {count}")
    if count >= SUPPORT:
        isurprisingness = compute_isurprisingness(count, terms, term_handles, counts, normalized=NORMALIZED_ISURPRISINGNESS) 
        if isurprisingness > higher_isurprisingness:
            print(f"{count} {isurprisingness}: {terms} {term_handles} {counts}")
            higher_isurprisingness = isurprisingness
            best_pattern = composite_pattern
print_query(best_pattern)

In [None]:
higher_isurprisingness = 0
best_pattern = None
all_patterns_handles = all_patterns.keys()

count_bh = 0
for basic_handle in pattern_handles[0]:
    count_bh += 1
    print(f"Cycle {count_bh}/{len(pattern_handles[0])}")
    for combination_handles in combinations(all_patterns, NGRAM - 1):
        if basic_handle in combination_handles:
            continue
        term_handles = [basic_handle, *combination_handles]
        terms = [all_patterns[handle] for handle in term_handles]
        counts = [all_patterns_count[handle] for handle in term_handles]
        composite_pattern = build_composite_pattern(terms)
        count = compute_count(composite_pattern)
        if count >= SUPPORT:
            isurprisingness = compute_isurprisingness(count, terms, term_handles, counts, normalized=NORMALIZED_ISURPRISINGNESS) 
            if isurprisingness > higher_isurprisingness:
                print(f"{count} {isurprisingness}: {terms} {counts}")
                higher_isurprisingness = isurprisingness
                best_pattern = composite_pattern    
print_query(best_pattern)