diff --git a/doc/references.bib b/doc/references.bib index 8b434ad1..1dba2cde 100644 --- a/doc/references.bib +++ b/doc/references.bib @@ -145,3 +145,18 @@ @book{Spirtes1993 publisher = {The MIT Press} } +@article{xie2020generalized, + title = {Generalized independent noise condition for estimating latent variable causal graphs}, + author = {Xie, Feng and Cai, Ruichu and Huang, Biwei and Glymour, Clark and Hao, Zhifeng and Zhang, Kun}, + journal = {Advances in Neural Information Processing Systems}, + volume = {33}, + pages = {14891--14902}, + year = {2020} +} + +@article{dai2022independence, + title = {Independence Testing-Based Approach to Causal Discovery under Measurement Error and Linear Non-Gaussian Models}, + author = {Dai, Haoyue and Spirtes, Peter and Zhang, Kun}, + journal = {arXiv preprint arXiv:2210.11021}, + year = {2022} +} diff --git a/dodiscover/replearning/gin.py b/dodiscover/replearning/gin.py index ff91bda2..56300922 100644 --- a/dodiscover/replearning/gin.py +++ b/dodiscover/replearning/gin.py @@ -3,18 +3,46 @@ """ from pandas import DataFrame - -from causallearn.search.HiddenCausal.GIN.GIN import GIN as GIN_ -from pywhy_graphs import CPDAG +from pywhy_graphs.array.export import clearn_arr_to_graph class GIN: """Wrapper for GIN in the causal-learn package. + The GIN algorithm is a causal discovery algorithm that learns latent + variable structure. We can view it as a causal representation learning + algorithm. + + Given an observed set of variables, the GIN algorithm tries to learn a set + of latent parents that d-separate subsets of the observed variables. GIN + will also learn undirected structure between the latent parents. + + In GIN, the latent variables are always assumed to be parents of the + observed. Further, it will not learn direct causal edges between + the observed variables. 
In that sense, we can view it as a causal + representation learning algorithm that learns latent high-level variables + and structure between them from low-level observed variables. + + The GIN algorithm assumes a linear non-Gaussian latent variable model + of the observed variables given the latent variables. One should not + expect it to work if the true relationship is Gaussian. + + GIN stands for "generalized independent noise" (GIN) condition. Roughly, + the GIN condition is used to divide observed variables into subsets that + are d-separated given the latent variables. + + See :footcite:`xie2020generalized` and :footcite:`dai2022independence` + for full details on the algorithm. See https://causal-learn.readthedocs.io + for the causal-learn documentation. + Parameters ---------- indep_test_method : str - The method to use for testing independence, by default "kci" + The method to use for testing independence. The default argument is + "kci" for kernel conditional independence testing. Another option is + "hsic" for the Hilbert Schmidt Independence Criterion. This is a + wrapper for causal-learn's GIN implementation and the causal-learn devs + may or may not add other options in the future. alpha : float The significance level for independence tests, by default 0.05 @@ -22,72 +50,27 @@ class GIN: ---------- graph_ : CPDAG The estimated causal graph. - causal_learn_graph : CausalGraph - The causal graph object from causal-learn. - causal_ordering : list of str - The causal ordering of the variables. + causal_learn_graph_ : CausalGraph + The causal graph object from causal-learn. Internally, we convert this + to a network-like graph object that supports CPDAGs. This is stored in + the ``graph`` fitted attribute. + + References + ---------- + ..
footbibliography:: """ - def __init__(self, indep_test_method: str = "kci", alpha: float = 0.05): + + def __init__(self, ci_estimator_method: str = "kci", alpha: float = 0.05): """Initialize GIN object with specified parameters.""" - self.graph_ = None + self.graph = None # GIN default parameters. - self.indep_test_method = indep_test_method + self.ci_estimator_method = ci_estimator_method self.alpha = alpha # The follow objects are specific to causal-learn, perhaps they should # go in a base class too. - self.causal_learn_graph = None - self.causal_ordering = None - - def _causal_learn_to_pdag(self, cl_graph): - """Convert a causal-learn graph to a CPDAG object. - - Parameters - ---------- - cl_graph : CausalGraph - The causal-learn graph to be converted. - - Returns - ------- - pdag : CPDAG - The equivalent CPDAG object. - """ - def _extract_edgelists(adj_mat, names): - """Extracts directed and undirected edges from an adjacency matrix. - - Parameters: - - adj_mat: numpy array - The adjacency matrix of the graph. - - names: list of str - The names of the nodes in the graph. - - Returns: - - directed_edges: list of tuples - The directed edges of the graph. - - undirected_edges: list of sets - The undirected edges of the graph. - """ - directed_edges = [] - undirected_edges = [] - for i, row in enumerate(adj_mat): - for j, item in enumerate(row): - if item != 0.: - if item == -1. and adj_mat[j][i] == -1.: - undirected_edges.append(set((names[j], names[i]))) - if item == 1.: - directed_edges.append((names[j], names[i])) - undirected_edges = list(set(tuple(edge) for edge in undirected_edges)) - return directed_edges, undirected_edges - - names = [n.name for n in cl_graph.nodes] - adj_mat = cl_graph.graph - directed_edges, undirected_edges = _extract_edgelists( - adj_mat, - names - ) - pdag = CPDAG(directed_edges, undirected_edges) - return pdag + self.causal_learn_graph_ = None def fit(self, data: DataFrame, context: DataFrame): """Fit the GIN model to data. 
@@ -105,11 +88,10 @@ def fit(self, data: DataFrame, context: DataFrame): self : GIN The fitted GIN object. """ - causal_learn_graph, ordering = GIN_( - data.to_numpy(), - self.indep_test_method, - self.alpha - ) - self.causal_learn_graph = causal_learn_graph - self.causal_ordering = ordering - self.graph_ = self._causal_learn_to_pdag(causal_learn_graph) + from causallearn.search.HiddenCausal.GIN.GIN import GIN as GIN_ + + causal_learn_graph, _ = GIN_(data.to_numpy(), self.ci_estimator_method, self.alpha) + self.causal_learn_graph_ = causal_learn_graph + names = [n.name for n in causal_learn_graph.nodes] + adj_mat = causal_learn_graph.graph + self.graph = clearn_arr_to_graph(adj_mat, names, "cpdag") diff --git a/tests/unit_tests/replearning/test_gin.py b/tests/unit_tests/replearning/test_gin.py index 1250e02b..0ed9153d 100644 --- a/tests/unit_tests/replearning/test_gin.py +++ b/tests/unit_tests/replearning/test_gin.py @@ -44,7 +44,7 @@ def test_estimate_gin_testdata(): context = make_context().variables(data=data).build() gin = GIN() gin.fit(data, context) - pdag = gin.graph_ + pdag = gin.graph assert nx.is_isomorphic(pdag.sub_undirected_graph(), g_answer.sub_undirected_graph()) assert nx.is_isomorphic(pdag.sub_directed_graph(), g_answer.sub_directed_graph())