py-why · kunwuz · Jul 14, 2022 · Jul 13, 2022 · Jul 13, 2022 · Jul 13, 2022
diff --git a/tests/TestAstar.py b/tests/TestAstar.py
@@ -1,42 +1,84 @@
-import sys
-
+import itertools
+import unittest
+import hashlib
+import numpy as np
+from causallearn.graph.Dag import Dag
+from causallearn.graph.GraphNode import GraphNode
 from causallearn.search.ScoreBased.ExactSearch import bic_exact_search
+from causallearn.utils.DAG2CPDAG import dag2cpdag
 
-sys.path.append("")
-import unittest
-from pickle import load
 
-import numpy as np
+
+######################################### Test Notes ###########################################
+# All the benchmark results of loaded files (e.g. "./TestData/benchmark_returned_results/")    #
+# are obtained from the code of causal-learn as of commit                                      #
+# https://github.com/cmu-phil/causal-learn/commit/8badb41 (07-12-2022).                        #
+#                                                                                              #
+# We are not sure if the results are completely "correct" (reflect ground truth graph) or not. #
+# So if you find your tests failed, it means that your modified code is logically inconsistent #
+# with the code as of 8badb41, but not necessarily means that your code is "wrong".            #
+# If you are sure that your modification is "correct" (e.g. fixed some bugs in 8badb41),       #
+# please report it to us. We will then modify these benchmark results accordingly. Thanks :)   #
+######################################### Test Notes ###########################################
+
+
+BENCHMARK_TXTFILE_TO_MD5 = {
+    "tests/TestData/test_exact_search_simulated_linear_gaussian_data.txt": "1ec70464e4fc68c312adfb7143bd240b",
+    "tests/TestData/test_exact_search_simulated_linear_gaussian_CPDAG.txt": "52a6d3c5db269d5e212edcbb8283aca9",
+}
+# verify files integrity first
+for file_path, expected_MD5 in BENCHMARK_TXTFILE_TO_MD5.items():
+    with open(file_path, 'rb') as fin:
+        assert hashlib.md5(fin.read()).hexdigest() == expected_MD5,\
+            f'{file_path} is corrupted. Please download it again from https://github.com/cmu-phil/causal-learn/blob/8badb41/tests/TestData'
 
 
 class TestAstar(unittest.TestCase):
+    # Load data and run Astar with default parameters.
+    def test_astar_simulate_linear_gaussian_with_local_score_BIC(self):
+        # The data and ground truth loaded in this test case is generated by the function
+        # simulate_linear_gaussian_data_for_exact_search commented below
+        print('Now start test_astar_simulate_linear_gaussian_with_local_score_BIC ...')
+        truth_CPDAG_matrix = np.loadtxt("tests/TestData/test_exact_search_simulated_linear_gaussian_CPDAG.txt")
+        data = np.loadtxt("tests/TestData/test_exact_search_simulated_linear_gaussian_data.txt")
+        assert truth_CPDAG_matrix.shape[0] == truth_CPDAG_matrix.shape[1], "Should be a square numpy matrix"
+        num_of_nodes = len(truth_CPDAG_matrix)
+        assert data.shape[1] == num_of_nodes, "The second dimension of data should be same as number of nodes"
+        data = data - data.mean(axis=0, keepdims=True)    # Center the data
+        # Iterate over different configurations of path extension and k-cycle heuristic
+        # to make sure they are working fine
+        for use_path_extension, use_k_cycle_heuristic in itertools.product([False, True], repeat=2):
+            DAG_matrix, _ = bic_exact_search(data, search_method='astar', use_path_extension=use_path_extension,
+                                             use_k_cycle_heuristic=use_k_cycle_heuristic, k=3)
+            # Convert DAG adjacency matrix to Dag object
+            nodes = [GraphNode(str(i)) for i in range(num_of_nodes)]
+            DAG = Dag(nodes)
+            for i, j in zip(*np.where(DAG_matrix == 1)):
+                DAG.add_directed_edge(nodes[i], nodes[j])
+            CPDAG = dag2cpdag(DAG)    # Convert DAG to CPDAG
+            self.assertTrue(np.all(CPDAG.graph == truth_CPDAG_matrix))
+        print('test_astar_simulate_linear_gaussian_with_local_score_BIC passed!\n')
+
 
-    # example1
-    # for data with single-variate dimensions, astar.
-    def test_single_astar(self):
-        with open("example_data1.pk", 'rb') as example_data1:
-            # example_data1 = load(open("example_data1.pk", 'rb'))
-            example_data1 = load(example_data1)
-            X = example_data1['X']
-            X = X - np.tile(np.mean(X, axis=0), (X.shape[0], 1))
-            X = np.dot(X, np.diag(1 / np.std(X, axis=0)))
-            X = X[:50, :]
-            dag_est, search_stats = bic_exact_search(X, search_method='astar')
-            print(dag_est)
-            print(search_stats)
-
-    # example2
-    # for data with multi-variate dimensions, astar.
-    def test_multi_astar(self):
-        with open("example_data2.pk", 'rb') as example_data:
-            # example_data = load(open("example_data2.pk", 'rb'))
-            example_data = load(example_data)
-            Data_save = example_data['Data_save']
-            trial = 0
-            X = Data_save[trial]
-            X = X - np.tile(np.mean(X, axis=0), (X.shape[0], 1))
-            X = np.dot(X, np.diag(1 / np.std(X, axis=0)))
-            X = X[:50, :]
-            dag_est, search_stats = bic_exact_search(X, search_method='astar')
-            print(dag_est)
-            print(search_stats)
+# def simulate_linear_gaussian_data_for_exact_search():
+#     import pandas as pd
+#     import random
+#     random.seed(1)    # Reproducibility
+#     np.random.seed(1)    # Reproducibility
+#     num_of_samples = 100000
+#     # Generate linear Gaussian data
+#     X0 = np.random.normal(scale=1.0, size=num_of_samples)
+#     X1 = 0.5 * X0 + np.random.normal(scale=2.0, size=num_of_samples)
+#     X3 = np.random.normal(scale=0.5, size=num_of_samples)
+#     X2 = 0.4 * X1 + 0.7 * X3 + np.random.normal(scale=1.5, size=num_of_samples)
+#     data_df = pd.DataFrame(data={'X0': X0, 'X1': X1, 'X2': X2, 'X3': X3})
+#     # Ground truth DAG: X0 -> X1 -> X2 <- X3
+#     # Ground truth CPDAG: X0 -- X1 -> X2 <- X3
+#     truth_CPDAG_matrix = np.array([[ 0, -1,  0, 0],
+#                                    [-1,  0, -1, 0],
+#                                    [ 0,  1,  0, 1],
+#                                    [ 0,  0, -1, 0]])
+#     truth_CPDAG_df = pd.DataFrame(data=truth_CPDAG_matrix)
+#     # Save data and ground truth
+#     truth_CPDAG_df.to_csv('./TestData/test_dp_simulated_linear_gaussian_CPDAG.txt', sep=' ', index=False, header=False)
+#     data_df.to_csv('./TestData/test_dp_simulated_linear_gaussian_data.txt', sep=' ', index=False, header=False)
diff --git a/tests/TestDP.py b/tests/TestDP.py
@@ -1,41 +1,81 @@
-import sys
-
+import unittest
+import hashlib
+import numpy as np
+from causallearn.graph.Dag import Dag
+from causallearn.graph.GraphNode import GraphNode
 from causallearn.search.ScoreBased.ExactSearch import bic_exact_search
+from causallearn.utils.DAG2CPDAG import dag2cpdag
 
-sys.path.append("")
-import unittest
-from pickle import load
 
-import numpy as np
+
+######################################### Test Notes ###########################################
+# All the benchmark results of loaded files (e.g. "./TestData/benchmark_returned_results/")    #
+# are obtained from the code of causal-learn as of commit                                      #
+# https://github.com/cmu-phil/causal-learn/commit/8badb41 (07-12-2022).                        #
+#                                                                                              #
+# We are not sure if the results are completely "correct" (reflect ground truth graph) or not. #
+# So if you find your tests failed, it means that your modified code is logically inconsistent #
+# with the code as of 8badb41, but not necessarily means that your code is "wrong".            #
+# If you are sure that your modification is "correct" (e.g. fixed some bugs in 8badb41),       #
+# please report it to us. We will then modify these benchmark results accordingly. Thanks :)   #
+######################################### Test Notes ###########################################
+
+
+BENCHMARK_TXTFILE_TO_MD5 = {
+    "tests/TestData/test_exact_search_simulated_linear_gaussian_data.txt": "1ec70464e4fc68c312adfb7143bd240b",
+    "tests/TestData/test_exact_search_simulated_linear_gaussian_CPDAG.txt": "52a6d3c5db269d5e212edcbb8283aca9",
+}
+# verify files integrity first
+for file_path, expected_MD5 in BENCHMARK_TXTFILE_TO_MD5.items():
+    with open(file_path, 'rb') as fin:
+        assert hashlib.md5(fin.read()).hexdigest() == expected_MD5,\
+            f'{file_path} is corrupted. Please download it again from https://github.com/cmu-phil/causal-learn/blob/8badb41/tests/TestData'
 
 
 class TestDP(unittest.TestCase):
-    # example3
-    # for data with single-variate dimensions, dp.
-    def test_single_dp(self):
-        with open("example_data1.pk", 'rb') as example_data1:
-            # example_data1 = load(open("example_data1.pk", 'rb'))
-            example_data1 = load(example_data1)
-            X = example_data1['X']
-            X = X - np.tile(np.mean(X, axis=0), (X.shape[0], 1))
-            X = np.dot(X, np.diag(1 / np.std(X, axis=0)))
-            X = X[:50, :]
-            dag_est, search_stats = bic_exact_search(X, search_method='dp')
-            print(dag_est)
-            print(search_stats)
-
-    # example4
-    # for data with multi-variate dimensions, dp.
-    def test_multi_dp(self):
-        with open("example_data2.pk", 'rb') as example_data:
-            # example_data = load(open("example_data2.pk", 'rb'))
-            example_data = load(example_data)
-            Data_save = example_data['Data_save']
-            trial = 0
-            X = Data_save[trial]
-            X = X - np.tile(np.mean(X, axis=0), (X.shape[0], 1))
-            X = np.dot(X, np.diag(1 / np.std(X, axis=0)))
-            X = X[:50, :]
-            dag_est, search_stats = bic_exact_search(X, search_method='dp')
-            print(dag_est)
-            print(search_stats)
+    # Load data and run DP with default parameters.
+    def test_dp_simulate_linear_gaussian_with_local_score_BIC(self):
+        # The data and ground truth loaded in this test case is generated by the function
+        # simulate_linear_gaussian_data_for_exact_search commented below
+        print('Now start test_dp_simulate_linear_gaussian_with_local_score_BIC ...')
+        truth_CPDAG_matrix = np.loadtxt("tests/TestData/test_exact_search_simulated_linear_gaussian_CPDAG.txt")
+        data = np.loadtxt("tests/TestData/test_exact_search_simulated_linear_gaussian_data.txt")
+        assert truth_CPDAG_matrix.shape[0] == truth_CPDAG_matrix.shape[1], "Should be a square numpy matrix"
+        num_of_nodes = len(truth_CPDAG_matrix)
+        assert data.shape[1] == num_of_nodes, "The second dimension of data should be same as number of nodes"
+        data = data - data.mean(axis=0, keepdims=True)    # Center the data
+        # Iterate over different configurations of path extension to make sure they are working fine
+        for use_path_extension in [False, True]:
+            DAG_matrix, _ = bic_exact_search(data, search_method='dp', use_path_extension=use_path_extension)
+            # Convert DAG adjacency matrix to Dag object
+            nodes = [GraphNode(str(i)) for i in range(num_of_nodes)]
+            DAG = Dag(nodes)
+            for i, j in zip(*np.where(DAG_matrix == 1)):
+                DAG.add_directed_edge(nodes[i], nodes[j])
+            CPDAG = dag2cpdag(DAG)    # Convert DAG to CPDAG
+            self.assertTrue(np.all(CPDAG.graph == truth_CPDAG_matrix))
+        print('test_dp_simulate_linear_gaussian_with_local_score_BIC passed!\n')
+
+
+# def simulate_linear_gaussian_data_for_exact_search():
+#     import pandas as pd
+#     import random
+#     random.seed(1)    # Reproducibility
+#     np.random.seed(1)    # Reproducibility
+#     num_of_samples = 100000
+#     # Generate linear Gaussian data
+#     X0 = np.random.normal(scale=1.0, size=num_of_samples)
+#     X1 = 0.5 * X0 + np.random.normal(scale=2.0, size=num_of_samples)
+#     X3 = np.random.normal(scale=0.5, size=num_of_samples)
+#     X2 = 0.4 * X1 + 0.7 * X3 + np.random.normal(scale=1.5, size=num_of_samples)
+#     data_df = pd.DataFrame(data={'X0': X0, 'X1': X1, 'X2': X2, 'X3': X3})
+#     # Ground truth DAG: X0 -> X1 -> X2 <- X3
+#     # Ground truth CPDAG: X0 -- X1 -> X2 <- X3
+#     truth_CPDAG_matrix = np.array([[ 0, -1,  0, 0],
+#                                    [-1,  0, -1, 0],
+#                                    [ 0,  1,  0, 1],
+#                                    [ 0,  0, -1, 0]])
+#     truth_CPDAG_df = pd.DataFrame(data=truth_CPDAG_matrix)
+#     # Save data and ground truth
+#     truth_CPDAG_df.to_csv('./TestData/test_dp_simulated_linear_gaussian_CPDAG.txt', sep=' ', index=False, header=False)
+#     data_df.to_csv('./TestData/test_dp_simulated_linear_gaussian_data.txt', sep=' ', index=False, header=False)
diff --git a/tests/TestData/test_exact_search_simulated_linear_gaussian_CPDAG.txt b/tests/TestData/test_exact_search_simulated_linear_gaussian_CPDAG.txt
@@ -0,0 +1,4 @@
+0 -1 0 0
+-1 0 -1 0
+0 1 0 1
+0 0 -1 0