From 76aaa0362a9576713a11e7a9ade1a7d94ace1e3f Mon Sep 17 00:00:00 2001 From: Haoyue Dai Date: Thu, 23 Dec 2021 12:03:33 -0500 Subject: [PATCH 1/3] Debug: 1. cache refresh at every init; 2. compute cardinality only once --- .gitignore | 1 + causallearn/graph/GraphClass.py | 5 +- causallearn/search/ConstraintBased/FCI.py | 47 ++++++++++--------- causallearn/utils/Fas.py | 26 +++++----- .../utils/PCUtils/SkeletonDiscovery.py | 10 ++-- causallearn/utils/cit.py | 6 +-- 6 files changed, 48 insertions(+), 47 deletions(-) diff --git a/.gitignore b/.gitignore index b1ad19ac..ea388fe7 100644 --- a/.gitignore +++ b/.gitignore @@ -140,4 +140,5 @@ dmypy.json # Developer's local tests *localtest.py +data-localtest/ diff --git a/causallearn/graph/GraphClass.py b/causallearn/graph/GraphClass.py index 360028d3..cc2c314f 100644 --- a/causallearn/graph/GraphClass.py +++ b/causallearn/graph/GraphClass.py @@ -44,8 +44,6 @@ def __init__(self, no_of_var): self.cardinalities = None # only works when self.data is discrete, i.e. self.test is chisq or gsq self.is_discrete = False self.citest_cache = dict() - self.data_hash_key = None - self.ci_test_hash_key = None def set_ind_test(self, indep_test, mvpc=False): """Set the conditional independence test that will be used""" @@ -53,7 +51,6 @@ def set_ind_test(self, indep_test, mvpc=False): if mvpc: self.mvpc = True self.test = indep_test - self.ci_test_hash_key = hash(indep_test) def ci_test(self, i, j, S): """Define the conditional independence test""" @@ -62,7 +59,7 @@ def ci_test(self, i, j, S): return self.test(self.data, self.nx_skel, self.prt_m, i, j, S, self.data.shape[0]) i, j = (i, j) if (i < j) else (j, i) - ijS_key = (i, j, frozenset(S), self.data_hash_key, self.ci_test_hash_key) + ijS_key = (i, j, frozenset(S)) if ijS_key in self.citest_cache: return self.citest_cache[ijS_key] # if discrete, assert self.test is chisq or gsq, pass into cardinalities diff --git a/causallearn/search/ConstraintBased/FCI.py b/causallearn/search/ConstraintBased/FCI.py index 831ea0db..0e2569cc 100644 --- a/causallearn/search/ConstraintBased/FCI.py +++ b/causallearn/search/ConstraintBased/FCI.py @@ -4,7 +4,7 @@ from causallearn.graph.GraphNode import GraphNode from causallearn.utils.PCUtils.BackgroundKnowledge import BackgroundKnowledge from causallearn.utils.cit import * -from causallearn.utils.Fas import fas, citest_cache +from causallearn.utils import Fas from causallearn.graph.Endpoint import Endpoint from causallearn.utils.ChoiceGenerator import ChoiceGenerator import numpy as np @@ -20,8 +20,6 @@ def __init__(self, data, graph, independence_test, alpha, knowledge, depth, maxP self.depth = depth self.maxPathLength = maxPathLength self.verbose = verbose - self.data_hash_key = hash(self.data.tobytes()) - self.ci_test_hash_key = hash(self.independence_test) def traverseSemiDirected(self, node, edge): @@ -194,12 +192,13 @@ def get_cond_set(self, node_1, node_2, max_path_length): X, Y = self.graph.node_map[node_1], self.graph.node_map[node_2] X, Y = (X, Y) if (X < Y) else (Y, X) - XYS_key = (X, Y, frozenset(condSet), self.data_hash_key, self.ci_test_hash_key) - if XYS_key in citest_cache: - p_value = citest_cache[XYS_key] + XYS_key = (X, Y, frozenset(condSet)) + if XYS_key in Fas.citest_cache: + p_value = Fas.citest_cache[XYS_key] else: - p_value = self.independence_test(self.data, X, Y, tuple(condSet)) - citest_cache[XYS_key] = p_value + p_value = self.independence_test(self.data, X, Y, tuple(condSet), Fas.cardinalities) if Fas.is_discrete \ + else self.independence_test(self.data, X, Y, tuple(condSet)) + Fas.citest_cache[XYS_key] = p_value independent = p_value > self.alpha if independent and noEdgeRequired: @@ -442,14 +441,13 @@ def doDdpOrientation(node_d, node_a, node_b, node_c, previous, graph, data, inde X, Y = graph.node_map[node_d], graph.node_map[node_c] X, Y = (X, Y) if (X < Y) else (Y, X) condSet = tuple([graph.node_map[nn] for nn in path]) - data_hash_key = hash(data.tobytes()) - ci_test_hash_key = hash(independence_test_method) - XYS_key = (X, Y, frozenset(condSet), data_hash_key, ci_test_hash_key) - if XYS_key in citest_cache: - p_value = citest_cache[XYS_key] + XYS_key = (X, Y, frozenset(condSet)) + if XYS_key in Fas.citest_cache: + p_value = Fas.citest_cache[XYS_key] else: - p_value = independence_test_method(data, X, Y, condSet) - citest_cache[XYS_key] = p_value + p_value = independence_test_method(data, X, Y, condSet, Fas.cardinalities) if Fas.is_discrete \ + else independence_test_method(data, X, Y, condSet) + Fas.citest_cache[XYS_key] = p_value ind = p_value > alpha path2 = list(path) @@ -458,12 +456,13 @@ def doDdpOrientation(node_d, node_a, node_b, node_c, previous, graph, data, inde X, Y = graph.node_map[node_d], graph.node_map[node_c] X, Y = (X, Y) if (X < Y) else (Y, X) condSet = tuple([graph.node_map[nn2] for nn2 in path2]) - XYS_key = (X, Y, frozenset(condSet), data_hash_key, ci_test_hash_key) - if XYS_key in citest_cache: - p_value2 = citest_cache[XYS_key] + XYS_key = (X, Y, frozenset(condSet)) + if XYS_key in Fas.citest_cache: + p_value2 = Fas.citest_cache[XYS_key] else: - p_value2 = independence_test_method(data, X, Y, condSet) - citest_cache[XYS_key] = p_value2 + p_value2 = independence_test_method(data, X, Y, condSet, Fas.cardinalities) if Fas.is_discrete \ + else independence_test_method(data, X, Y, condSet) + Fas.citest_cache[XYS_key] = p_value2 ind2 = p_value2 > alpha if not ind and not ind2: @@ -613,11 +612,17 @@ def fci(dataset, independence_test_method = fisherz, alpha=0.05, depth=-1, max_p if dataset.shape[0] < dataset.shape[1]: warnings.warn("The number of features is much larger than the sample size!") + Fas.citest_cache = dict() # DEBUG@2021/12/23, must refresh cache every time at initialization + Fas.cardinalities = None + Fas.is_discrete = False + def _unique(column): return np.unique(column, return_inverse=True)[1] if independence_test_method == chisq or independence_test_method == gsq: dataset = np.apply_along_axis(_unique, 0, dataset).astype(np.int64) + Fas.is_discrete = True + Fas.cardinalities = np.max(dataset, axis=0) + 1 ## ------- check parameters ------------ @@ -636,7 +641,7 @@ def _unique(column): nodes.append(node) # FAS (“Fast Adjacency Search”) is the adjacency search of the PC algorithm, used as a first step for the FCI algorithm. - graph, sep_sets = fas(dataset, nodes, independence_test_method=independence_test_method, alpha=alpha, knowledge=background_knowledge, depth=depth, verbose=verbose) + graph, sep_sets = Fas.fas(dataset, nodes, independence_test_method=independence_test_method, alpha=alpha, knowledge=background_knowledge, depth=depth, verbose=verbose) # reorient all edges with CIRCLE Endpoint ori_edges = graph.get_graph_edges() diff --git a/causallearn/utils/Fas.py b/causallearn/utils/Fas.py index 4f10bf14..14dac48b 100644 --- a/causallearn/utils/Fas.py +++ b/causallearn/utils/Fas.py @@ -7,6 +7,8 @@ from tqdm.auto import tqdm citest_cache = dict() +cardinalities = None # only works for discrete data +is_discrete = False def possible_parents(node_x, adjx, knowledge=None): @@ -45,8 +47,6 @@ def forbiddenEdge(node_x, node_y, knowledge): def searchAtDepth0(data, nodes, adjacencies, sep_sets, independence_test_method=fisherz, alpha=0.05, verbose=False, knowledge=None, pbar=None): empty = [] - data_hash_key = hash(data.tobytes()) - ci_test_hash_key = hash(independence_test_method) show_progress = not pbar is None if show_progress: pbar.reset() for i in range(len(nodes)): @@ -56,11 +56,12 @@ def searchAtDepth0(data, nodes, adjacencies, sep_sets, independence_test_method= print(nodes[i + 1].get_name()) for j in range(i+1, len(nodes)): - ijS_key = (i, j, frozenset(), data_hash_key, ci_test_hash_key) + ijS_key = (i, j, frozenset()) if ijS_key in citest_cache: p_value = citest_cache[ijS_key] else: - p_value = independence_test_method(data, i, j, tuple(empty)) + p_value = independence_test_method(data, i, j, tuple(empty), cardinalities) if is_discrete \ + else independence_test_method(data, i, j, tuple(empty)) citest_cache[ijS_key] = p_value independent = p_value > alpha no_edge_required = True if knowledge is None else \ @@ -80,9 +81,6 @@ def searchAtDepth0(data, nodes, adjacencies, sep_sets, independence_test_method= def searchAtDepth(data, depth, nodes, adjacencies, sep_sets, independence_test_method=fisherz, alpha=0.05, verbose=False, knowledge=None, pbar=None): - data_hash_key = hash(data.tobytes()) - ci_test_hash_key = hash(independence_test_method) - def edge(adjx, i, adjacencies_completed_edge): for j in range(len(adjx)): node_y = adjx[j] @@ -100,11 +98,12 @@ def edge(adjx, i, adjacencies_completed_edge): Y = nodes.index(adjx[j]) X, Y = (i, Y) if (i < Y) else (Y, i) - XYS_key = (X, Y, frozenset(cond_set), data_hash_key, ci_test_hash_key) + XYS_key = (X, Y, frozenset(cond_set)) if XYS_key in citest_cache: p_value = citest_cache[XYS_key] else: - p_value = independence_test_method(data, X, Y, tuple(cond_set)) + p_value = independence_test_method(data, X, Y, tuple(cond_set), cardinalities) if is_discrete \ + else independence_test_method(data, X, Y, tuple(cond_set)) citest_cache[XYS_key] = p_value independent = p_value > alpha @@ -166,10 +165,6 @@ def edge(adjx, i, adjacencies_completed_edge): def searchAtDepth_not_stable(data, depth, nodes, adjacencies, sep_sets, independence_test_method=fisherz, alpha=0.05, verbose=False, knowledge=None, pbar=None): - - data_hash_key = hash(data.tobytes()) - ci_test_hash_key = hash(independence_test_method) - def edge(adjx, i, adjacencies_completed_edge): for j in range(len(adjx)): node_y = adjx[j] @@ -187,11 +182,12 @@ def edge(adjx, i, adjacencies_completed_edge): Y = nodes.index(adjx[j]) X, Y = (i, Y) if (i < Y) else (Y, i) - XYS_key = (X, Y, frozenset(cond_set), data_hash_key, ci_test_hash_key) + XYS_key = (X, Y, frozenset(cond_set)) if XYS_key in citest_cache: p_value = citest_cache[XYS_key] else: - p_value = independence_test_method(data, X, Y, tuple(cond_set)) + p_value = independence_test_method(data, X, Y, tuple(cond_set), cardinalities) if is_discrete \ + else independence_test_method(data, X, Y, tuple(cond_set)) citest_cache[XYS_key] = p_value independent = p_value > alpha diff --git a/causallearn/utils/PCUtils/SkeletonDiscovery.py b/causallearn/utils/PCUtils/SkeletonDiscovery.py index 2ecafc42..7d959c0c 100644 --- a/causallearn/utils/PCUtils/SkeletonDiscovery.py +++ b/causallearn/utils/PCUtils/SkeletonDiscovery.py @@ -1,5 +1,5 @@ from itertools import combinations -from causallearn.utils.Fas import fas +from causallearn.utils import Fas import numpy as np from causallearn.graph.GraphClass import CausalGraph from causallearn.utils.PCUtils.Helper import append_value @@ -51,7 +51,6 @@ def _unique(column): cg.cardinalities = np.max(cg.data, axis=0) + 1 else: cg.data = data - cg.data_hash_key = hash(data.tobytes()) depth = -1 pbar = tqdm(total=no_of_var) if show_progress else None @@ -143,6 +142,9 @@ def skeleton_discovery_using_fas(data, alpha, indep_test, stable=True, backgroun assert type(data) == np.ndarray assert 0 < alpha < 1 + Fas.citest_cache = dict() # DEBUG@2021/12/23, must refresh cache every time at initialization + Fas.cardinalities = None + Fas.is_discrete = False no_of_var = data.shape[1] cg = CausalGraph(no_of_var) @@ -160,11 +162,13 @@ def _unique(column): cg.is_discrete = True cg.data = np.apply_along_axis(_unique, 0, data).astype(np.int64) cg.cardinalities = np.max(cg.data, axis=0) + 1 + Fas.cardinalities = cg.cardinalities # DEBUG@2021/12/23, no repeat calculating cardinalities at every chisq/gsq. + Fas.is_discrete = True else: cg.data = data - graph, sep_sets = fas(cg.data, cg.G.nodes, independence_test_method=indep_test, alpha=alpha, + graph, sep_sets = Fas.fas(cg.data, cg.G.nodes, independence_test_method=indep_test, alpha=alpha, knowledge=background_knowledge, depth=-1, verbose=verbose, stable=stable, show_progress=show_progress) for (x, y) in sep_sets.keys(): diff --git a/causallearn/utils/cit.py b/causallearn/utils/cit.py index 995d7891..b7a52fa0 100644 --- a/causallearn/utils/cit.py +++ b/causallearn/utils/cit.py @@ -176,15 +176,13 @@ def fisherz(data, X, Y, condition_set, correlation_matrix=None): return p -def chisq(data, X, Y, conditioning_set, cardinalities=None): +def chisq(data, X, Y, conditioning_set, cardinalities): # though cardinalities can be computed from data, here we pass it as argument, # to prevent from repeated computation on each variable's vardinality - if cardinalities is None: cardinalities = np.max(data, axis=0) + 1 indexs = list(conditioning_set) + [X, Y] return chisq_or_gsq_test(data[:, indexs].T, cardinalities[indexs]) -def gsq(data, X, Y, conditioning_set, cardinalities=None): - if cardinalities is None: cardinalities = np.max(data, axis=0) + 1 +def gsq(data, X, Y, conditioning_set, cardinalities): indexs = list(conditioning_set) + [X, Y] return chisq_or_gsq_test(data[:, indexs].T, cardinalities[indexs], G_sq=True) From a255562b5e0f1cc0d5212f6aecf6be2ea8634066 Mon Sep 17 00:00:00 2001 From: Haoyue Dai Date: Thu, 23 Dec 2021 15:17:06 -0500 Subject: [PATCH 2/3] Package the search cache + CI test into a function in FAS --- causallearn/search/ConstraintBased/FCI.py | 30 ++++------------ causallearn/utils/Fas.py | 42 +++++++++-------------- 2 files changed, 23 insertions(+), 49 deletions(-) diff --git a/causallearn/search/ConstraintBased/FCI.py b/causallearn/search/ConstraintBased/FCI.py index 0e2569cc..d2839777 100644 --- a/causallearn/search/ConstraintBased/FCI.py +++ b/causallearn/search/ConstraintBased/FCI.py @@ -191,14 +191,8 @@ def get_cond_set(self, node_1, node_2, max_path_length): choice = cg.next() X, Y = self.graph.node_map[node_1], self.graph.node_map[node_2] - X, Y = (X, Y) if (X < Y) else (Y, X) - XYS_key = (X, Y, frozenset(condSet)) - if XYS_key in Fas.citest_cache: - p_value = Fas.citest_cache[XYS_key] - else: - p_value = self.independence_test(self.data, X, Y, tuple(condSet), Fas.cardinalities) if Fas.is_discrete \ - else self.independence_test(self.data, X, Y, tuple(condSet)) - Fas.citest_cache[XYS_key] = p_value + p_value = Fas.ci_test(self.independence_test, self.data, + X, Y, tuple(condSet)) independent = p_value > self.alpha if independent and noEdgeRequired: @@ -439,30 +433,18 @@ def doDdpOrientation(node_d, node_a, node_b, node_c, previous, graph, data, inde path = getPath(node_d, previous) X, Y = graph.node_map[node_d], graph.node_map[node_c] - X, Y = (X, Y) if (X < Y) else (Y, X) condSet = tuple([graph.node_map[nn] for nn in path]) - XYS_key = (X, Y, frozenset(condSet)) - if XYS_key in Fas.citest_cache: - p_value = Fas.citest_cache[XYS_key] - else: - p_value = independence_test_method(data, X, Y, condSet, Fas.cardinalities) if Fas.is_discrete \ - else independence_test_method(data, X, Y, condSet) - Fas.citest_cache[XYS_key] = p_value + p_value = Fas.ci_test(independence_test_method, data, + X, Y, tuple(condSet)) ind = p_value > alpha path2 = list(path) path2.remove(node_b) X, Y = graph.node_map[node_d], graph.node_map[node_c] - X, Y = (X, Y) if (X < Y) else (Y, X) condSet = tuple([graph.node_map[nn2] for nn2 in path2]) - XYS_key = (X, Y, frozenset(condSet)) - if XYS_key in Fas.citest_cache: - p_value2 = Fas.citest_cache[XYS_key] - else: - p_value2 = independence_test_method(data, X, Y, condSet, Fas.cardinalities) if Fas.is_discrete \ - else independence_test_method(data, X, Y, condSet) - Fas.citest_cache[XYS_key] = p_value2 + p_value2 = Fas.ci_test(independence_test_method, data, + X, Y, condSet) ind2 = p_value2 > alpha if not ind and not ind2: diff --git a/causallearn/utils/Fas.py b/causallearn/utils/Fas.py index 14dac48b..133e7e77 100644 --- a/causallearn/utils/Fas.py +++ b/causallearn/utils/Fas.py @@ -11,6 +11,18 @@ is_discrete = False +def ci_test(independence_test, data, i, j, S): + i, j = (i, j) if (i < j) else (j, i) + ijS_key = (i, j, frozenset(S)) + if ijS_key in citest_cache: + return citest_cache[ijS_key] + # if discrete, assert self.test is chisq or gsq, pass into cardinalities + pValue = independence_test(data, i, j, S, cardinalities) if is_discrete \ + else independence_test(data, i, j, S) + citest_cache[ijS_key] = pValue + return pValue + + def possible_parents(node_x, adjx, knowledge=None): possibleParents = [] @@ -56,13 +68,7 @@ def searchAtDepth0(data, nodes, adjacencies, sep_sets, independence_test_method= print(nodes[i + 1].get_name()) for j in range(i+1, len(nodes)): - ijS_key = (i, j, frozenset()) - if ijS_key in citest_cache: - p_value = citest_cache[ijS_key] - else: - p_value = independence_test_method(data, i, j, tuple(empty), cardinalities) if is_discrete \ - else independence_test_method(data, i, j, tuple(empty)) - citest_cache[ijS_key] = p_value + p_value = ci_test(independence_test_method, data, i, j, tuple(empty)) independent = p_value > alpha no_edge_required = True if knowledge is None else \ ((not knowledge.is_required(nodes[i], nodes[j])) or knowledge.is_required(nodes[j], nodes[i])) @@ -96,15 +102,8 @@ def edge(adjx, i, adjacencies_completed_edge): cond_set = [nodes.index(ppx[index]) for index in choice] choice = cg.next() - Y = nodes.index(adjx[j]) - X, Y = (i, Y) if (i < Y) else (Y, i) - XYS_key = (X, Y, frozenset(cond_set)) - if XYS_key in citest_cache: - p_value = citest_cache[XYS_key] - else: - p_value = independence_test_method(data, X, Y, tuple(cond_set), cardinalities) if is_discrete \ - else independence_test_method(data, X, Y, tuple(cond_set)) - citest_cache[XYS_key] = p_value + p_value = ci_test(independence_test_method, data, + i, nodes.index(adjx[j]), tuple(cond_set)) independent = p_value > alpha @@ -180,15 +179,8 @@ def edge(adjx, i, adjacencies_completed_edge): cond_set = [nodes.index(ppx[index]) for index in choice] choice = cg.next() - Y = nodes.index(adjx[j]) - X, Y = (i, Y) if (i < Y) else (Y, i) - XYS_key = (X, Y, frozenset(cond_set)) - if XYS_key in citest_cache: - p_value = citest_cache[XYS_key] - else: - p_value = independence_test_method(data, X, Y, tuple(cond_set), cardinalities) if is_discrete \ - else independence_test_method(data, X, Y, tuple(cond_set)) - citest_cache[XYS_key] = p_value + p_value = ci_test(independence_test_method, data, + i, nodes.index(adjx[j]), tuple(cond_set)) independent = p_value > alpha From ab597c05fa726eb341291c679207ee8b19dd18ee Mon Sep 17 00:00:00 2001 From: Haoyue Dai Date: Thu, 23 Dec 2021 15:31:07 -0500 Subject: [PATCH 3/3] Added Fas.citest_cache to cg_1.citest_cache, for UCSepset.uc_sepset --- causallearn/search/ConstraintBased/PC.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/causallearn/search/ConstraintBased/PC.py b/causallearn/search/ConstraintBased/PC.py index 1e3d1c72..2cef5889 100644 --- a/causallearn/search/ConstraintBased/PC.py +++ b/causallearn/search/ConstraintBased/PC.py @@ -4,7 +4,7 @@ import networkx as nx import numpy as np - +from causallearn.utils import Fas from causallearn.graph.GraphClass import CausalGraph from causallearn.utils.PCUtils import SkeletonDiscovery, UCSepset, Meek, Helper from causallearn.utils.PCUtils.BackgroundKnowledgeOrientUtils import orient_by_background_knowledge @@ -71,6 +71,7 @@ def pc_alg(data, alpha, indep_test, stable, uc_rule, uc_priority, background_kno cg_1 = SkeletonDiscovery.skeleton_discovery_using_fas(data, alpha, indep_test, stable, background_knowledge=background_knowledge, verbose=verbose, show_progress=show_progress) + cg_1.citest_cache = Fas.citest_cache # for citests in further UCSepset.uc_sepset if background_knowledge is not None: orient_by_background_knowledge(cg_1, background_knowledge)