diff --git a/.gitignore b/.gitignore index 6ab30854b..72a35bbeb 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,10 @@ __pycache__/ # C extensions *.so +# files possibly produced when drawing graphs +*.gv +*.png + junit-results.xml # Distribution / packaging
diff --git a/README.md b/README.md index 48f4ea44e..3d96ddabc 100644 --- a/README.md +++ b/README.md @@ -49,5 +49,8 @@ To install the package from github, clone the repository and then `cd` into the # for graph functionality poetry install --extras graph_func + # to load datasets used in tutorials + poetry install --extras data + # if you would like an editable install of dodiscover for dev purposes pip install -e .
diff --git a/doc/conf.py b/doc/conf.py index 18abf7a63..64ee47c83 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -263,6 +263,10 @@ "image_scrapers": scrapers, } +# never execute jupyter notebooks during the docs build (even if cells have no stored output), and do not fail the build on cell errors +nbsphinx_execute = "never" +nbsphinx_allow_errors = True + # Custom sidebar templates, maps document names to template names. html_sidebars = { "index": ["search-field.html"],
diff --git a/doc/index.rst b/doc/index.rst index f13a22dcf..f068a093f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -15,13 +15,13 @@ Contents -------- .. toctree:: - :maxdepth: 1 + :maxdepth: 2 :caption: Getting started: installation api use - tutorials + tutorials/index whats_new .. toctree::
diff --git a/doc/tutorials.rst b/doc/tutorials.rst deleted file mode 100644 index 97ece8265..000000000 --- a/doc/tutorials.rst +++ /dev/null @@ -1,12 +0,0 @@ -:orphan: - -******** -Tutorial -******** - -.. _dodiscover_tutorials: - -Using dodiscover -===================== -This tutorial shows how to use dodiscover -
diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst new file mode 100644 index 000000000..1416eb4cd --- /dev/null +++ b/doc/tutorials/index.rst @@ -0,0 +1,18 @@ +********* +Tutorials +********* + +.. _models_tutorials: + +Basic causal discovery models without latent confounders +======================================================== +The first tutorial presents algorithms for causal discovery without latent confounding, starting with the Peter-Clark (PC) algorithm. +These models provide a basis for learning causal structure from data when we make the **assumption** that there are +no latent confounders. + +.. toctree:: + :maxdepth: 1 + :titlesonly: + + markovian/pc +
diff --git a/doc/tutorials/markovian/example-pc-algo.ipynb b/doc/tutorials/markovian/example-pc-algo.ipynb new file mode 100644 index 000000000..f49e41989 --- /dev/null +++ b/doc/tutorials/markovian/example-pc-algo.ipynb @@ -0,0 +1,475 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3e91e320", + "metadata": {}, + "source": [ + "# PC algorithm for causal discovery from observational data without latent confounders\n", + "\n", + "In this tutorial, we will demonstrate how to use the PC algorithm to learn a causal graph structure.\n", + "\n", + "The PC algorithm works on observational data when there are no unobserved latent confounders."
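+ "\n",
+ "As a quick preview, the end-to-end workflow in this tutorial looks roughly like the sketch below. It is a minimal outline that reuses the same `dodiscover` calls demonstrated step by step later on, so nothing in it is new API:\n",
+ "\n",
+ "```python\n",
+ "import bnlearn as bn\n",
+ "\n",
+ "from dodiscover import PC, make_context\n",
+ "from dodiscover.ci import GSquareCITest\n",
+ "\n",
+ "# load the discrete Asia dataset used throughout this tutorial\n",
+ "data = bn.import_example(data='asia')\n",
+ "\n",
+ "# wrap the observed variables (and any a priori assumptions) in a Context\n",
+ "context = make_context().variables(data=data).build()\n",
+ "\n",
+ "# pick a conditional independence test suited to the data, then run PC\n",
+ "pc = PC(ci_estimator=GSquareCITest(data_type=\"discrete\"))\n",
+ "pc.fit(data, context)\n",
+ "\n",
+ "# the learned structure is a CPDAG\n",
+ "learned_cpdag = pc.graph_\n",
+ "```\n"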
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "95e12b39", + "metadata": {}, + "outputs": [], + "source": [ + "import bnlearn as bn\n", + "import networkx as nx\n", + "import numpy as np\n", + "\n", + "from pywhy_graphs import CPDAG\n", + "from pywhy_graphs.viz import draw\n", + "from dodiscover import PC, make_context\n", + "from dodiscover.ci import GSquareCITest, Oracle" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e77e7416", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[bnlearn] >Extracting files..\n" + ] + }, + { + "data": { + "text/html": [ + "
<div>\n", + "<table border=\"1\" class=\"dataframe\">\n", + "  <thead>\n", + "    <tr style=\"text-align: right;\"><th></th><th>A</th><th>T</th><th>S</th><th>L</th><th>B</th><th>E</th><th>X</th><th>D</th></tr>\n", + "  </thead>\n", + "  <tbody>\n",
+ "    <tr><th>0</th><td>1</td><td>1</td><td>0</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>\n",
+ "    <tr><th>1</th><td>1</td><td>1</td><td>1</td><td>1</td><td>0</td><td>1</td><td>1</td><td>0</td></tr>\n",
+ "    <tr><th>2</th><td>1</td><td>1</td><td>1</td><td>1</td><td>0</td><td>1</td><td>1</td><td>0</td></tr>\n",
+ "    <tr><th>3</th><td>1</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td></tr>\n",
+ "    <tr><th>4</th><td>1</td><td>1</td><td>1</td><td>1</td><td>0</td><td>1</td><td>1</td><td>0</td></tr>\n",
+ "  </tbody>\n", + "</table>\n", + "</div>
" + ], + "text/plain": [ + " A T S L B E X D\n", + "0 1 1 0 1 1 1 1 1\n", + "1 1 1 1 1 0 1 1 0\n", + "2 1 1 1 1 0 1 1 0\n", + "3 1 0 0 1 0 0 0 0\n", + "4 1 1 1 1 0 1 1 0" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load example dataset\n", + "data = bn.import_example(data='asia')\n", + "data.rename(columns = {\n", + " 'tub': 'T',\n", + " 'lung': 'L',\n", + " 'bronc': 'B',\n", + " 'asia': 'A',\n", + " 'smoke': 'S',\n", + " 'either': 'E',\n", + " 'xray': 'X',\n", + " 'dysp': 'D'},\n", + " inplace=True\n", + ")\n", + "data.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "84323abd", + "metadata": {}, + "source": [
+ "We'll use the PC algorithm to infer the ASIA network. The ASIA network is a case study of an expert system for diagnosing lung disease, from Lauritzen and Spiegelhalter (1988). Given respiratory symptoms and other evidence, the goal is to distinguish between tuberculosis, lung cancer, and bronchitis in a given patient. The ground truth causal DAG is as follows:\n", + "\n", + "![asia](figures/asia.png)\n", + "\n", + "The variables in the DAG have the following interpretation:\n", + "\n",
+ "* T: Whether or not the patient has **tuberculosis**.\n", + "* L: Whether or not the patient has **lung cancer**.\n", + "* B: Whether or not the patient has **bronchitis**.\n", + "* A: Whether or not the patient has recently visited **Asia**.\n", + "* S: Whether or not the patient is a **smoker**.\n", + "* E: An indicator of whether the patient has either lung cancer or tuberculosis (or both).\n", + "* X: Whether or not a chest X-ray shows evidence of tuberculosis or lung cancer.\n", + "* D: Whether or not the patient has **dyspnoea** (difficulty breathing).\n", + "\n",
+ "Note that we have three kinds of variables: diseases (B, L, T, and E, which indicates one or more diseases), symptoms (X and D), and behaviors (S and A). The goal of the model is to use symptoms and behaviors to diagnose (i.e. infer) diseases. Further, note that diseases are causes of symptoms, and are effects of behaviors." + ] + }, + { + "cell_type": "markdown", + "id": "e5620376", + "metadata": {}, + "source": [ + "Our goal is to discover this graph structure from observational data. To this end, we'll use the dodiscover implementation of the PC algorithm. First, we'll construct the ground truth graph." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "33fa1389", + "metadata": {}, + "outputs": [], + "source": [ + "ground_truth_edges = [\n", + " (\"A\", \"T\"),\n", + " (\"T\", \"E\"),\n", + " (\"L\", \"E\"),\n", + " (\"S\", \"L\"),\n", + " (\"S\", \"B\"),\n", + " (\"B\", \"D\"),\n", + " (\"E\", \"D\"),\n", + " (\"E\", \"X\")\n", + " ]\n", + "\n", + "ground_truth = nx.DiGraph(ground_truth_edges)" + ] + }, + { + "cell_type": "markdown", + "id": "b769b7f1", + "metadata": {}, + "source": [
+ "This is the ground truth, but the ground truth is not learnable from observational data alone. We can only learn a CPDAG (completed partially directed acyclic graph), which contains a mixture of directed and undirected edges. The undirected edges are edges whose causal direction we cannot learn from observations alone.\n", + "\n",
+ "In technical terms, the CPDAG represents an equivalence class of DAGs. In other words, the CPDAG represents all the DAGs that we could create by orienting the undirected edges (except for those DAGs that would introduce new colliders), because each of those DAGs is equally consistent with the data.
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "30a29f1d", + "metadata": {}, + "outputs": [], + "source": [ + "cpdag_directed = [\n", + " (\"T\", \"E\"),\n", + " (\"L\", \"E\"),\n", + " (\"B\", \"D\"),\n", + " (\"E\", \"D\"),\n", + " (\"E\", \"X\")\n", + " ]\n", + "\n", + "cpdag_undirected = [\n", + " (\"A\", \"T\"),\n", + " (\"S\", \"L\"),\n", + " (\"S\", \"B\"),\n", + " ]\n", + "\n", + "ground_truth_cpdag = CPDAG(cpdag_directed, cpdag_undirected)" + ] + }, + { + "cell_type": "markdown", + "id": "5a9d2aae", + "metadata": {}, + "source": [ + "## Instantiate some conditional independence tests\n", + "\n",
+ "The PC algorithm is a constraint-based causal discovery algorithm, which means it uses statistical constraints induced by causal relationships to learn those causal relationships. The most commonly used constraint is conditional independence. Constraint-based algorithms run a series of statistical tests for conditional independence (CI) and construct a graph consistent with the resulting constraints.\n", + "\n",
+ "So we need a way to test for CI constraints. There are various options for tests. If we are applying the algorithm to real data, we would want to use the CI test that best suits the data.\n", + "\n",
+ "If we are interested in evaluating how the discovery algorithm works in an ideal setting, we can use an oracle: a test imbued with the ground-truth graph, which can answer every d-separation query exactly. In a simulation setting, this helps determine the best-case graph the PC algorithm can learn.\n", + "\n",
+ "**Warnings about statistical tests for independence**:\n",
+ "* Note that because of finite sample sizes, any CI test will sometimes reach erroneous conclusions. These errors result in incorrect edges and orientations in the learned graph. Further, constraint-based algorithms run multiple tests in sequence, which causes those errors to accumulate. One might use adjustments for multiple comparisons to compensate.\n",
+ "* The standard statistical test has a null and an alternative hypothesis. The alternative hypothesis should be the hypothesis that the thing one is looking for is present, while the null hypothesis is that it is not. In causal discovery, the thing we are looking for is causal structure, and conditional independence is evidence of causal structure. So arguably, the alternative hypothesis should be the hypothesis of conditional independence. However, most CI test implementations take the hypothesis of independence as the null hypothesis.\n",
+ "* P-values shrink as the sample size grows, so the more data is provided, the more likely the test is to favor dependence. For large datasets, use smaller significance thresholds.\n", + "\n",
+ "For these reasons, the user should view the results of a CI test more as a heuristic than as a rigorous statistical conclusion." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "902d126f", + "metadata": {}, + "outputs": [], + "source": [ + "oracle = Oracle(ground_truth)" + ] + }, + { + "cell_type": "markdown", + "id": "591c00cf", + "metadata": {}, + "source": [ + "## Define the context\n", + "\n",
+ "A common anti-pattern in causal discovery is encouraging the user to see the discovery algorithm as a philosopher's stone that converts data into causal relationships. In other words, discovery libraries tend to encourage users to surrender to the algorithm the task of providing the domain-specific assumptions that enable identifiability.
PyWhy's key strength is how it guides users to specify domain assumptions up front (in the form of a DAG) before the data is added, and then addresses identifiability given those assumptions and the data.\n", + "\n",
+ "To this end, we introduce the `context.Context` class, which users provide as the primary input to a discovery algorithm alongside the data. The Context class houses the data, a priori assumptions, and any other relevant information that may be used in downstream structure learning algorithms.\n", + "\n",
+ "In this example, we'll simply build a context from the observed variables in the data." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "89a4f0af", + "metadata": {}, + "outputs": [], + "source": [ + "context = make_context().variables(data=data).build()" + ] + }, + { + "cell_type": "markdown", + "id": "efcf01c7", + "metadata": {}, + "source": [ + "## Run discovery algorithm\n", + "\n", + "Now we are ready to run the PC algorithm. First, we will show the output of the oracle, which is the best-case result the PC algorithm can achieve given an infinite amount of data." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e5cba859", + "metadata": {}, + "outputs": [], + "source": [ + "pc = PC(ci_estimator=oracle)\n", + "pc.fit(data, context)" + ] + }, + { + "cell_type": "markdown", + "id": "ff7ebc10", + "metadata": {}, + "source": [ + "The learned graph is a completed partially directed acyclic graph (CPDAG), which represents a \"Markov equivalence class\" and encodes the conditional independence relations learned from the data." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "918c6f5f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": "[SVG rendering of the CPDAG learned with the oracle]", + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph = pc.graph_\n", + "draw(graph)" + ] + }, + { + "cell_type": "markdown", + "id": "162d98be", + "metadata": {}, + "source": [ + "Compare this against the ground truth CPDAG." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "67d47367", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": "[SVG rendering of the ground truth CPDAG]", + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "draw(ground_truth_cpdag)" + ] + }, + { + "cell_type": "markdown", + "id": "db1ff244", + "metadata": {}, + "source": [ + "Now we will show the output given a real CI test, which performs hypothesis tests for conditional independence on the data. Due to finite data and the presence of noise, there is always a possibility that the CI test makes a mistake."
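+ "\n",
+ "Because such mistakes are expected at finite sample sizes, one practical knob is the significance threshold used for the tests (see the warnings above). The snippet below is only a sketch of that idea: it assumes the `PC` constructor exposes an `alpha` significance-level argument, so check the dodiscover API reference for the exact parameter name before relying on it.\n",
+ "\n",
+ "```python\n",
+ "# a stricter significance level for a larger sample; `alpha` is assumed here,\n",
+ "# consult the PC docstring for the exact parameter name\n",
+ "strict_pc = PC(ci_estimator=GSquareCITest(data_type=\"discrete\"), alpha=0.01)\n",
+ "strict_pc.fit(data, context)\n",
+ "```\n"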
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5b80aeba", + "metadata": {}, + "outputs": [], + "source": [ + "ci_estimator = GSquareCITest(data_type=\"discrete\")\n", + "pc = PC(ci_estimator=ci_estimator)\n", + "pc.fit(data, context)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c948064b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": "[SVG rendering of the graph learned with the G-square CI test]", + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph = pc.graph_\n", + "draw(graph)" + ] + }, + { + "cell_type": "markdown", + "id": "ff0b314b", + "metadata": {}, + "source": [
+ "The resulting graph captures some of the true structure, but not all of it. The problem here is a violation of the [faithfulness assumption](https://plato.stanford.edu/entries/causal-models/#MiniFaitCond); in the Asia data, it is very hard to detect the edge between E and X.\n", + "\n",
+ "Beyond faithfulness violations, causal discovery algorithms in general struggle when there is no user-provided causal knowledge to constrain the problem. A core philosophy of dodiscover is that causal domain knowledge should be provided to constrain the problem, and providing it should be easy. For example, for this data, we know that smoking (S) causes both lung cancer (L) and bronchitis (B), and not the other way around.\n", + "\n", + "## References\n", + "\n",
+ "Lauritzen S, Spiegelhalter D (1988). \"Local Computation with Probabilities on Graphical Structures and their Application to Expert Systems (with discussion)\". Journal of the Royal Statistical Society: Series B, 50(2):157–224.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "vscode": { + "interpreter": { + "hash": "83bc06ec5be13f4230f46a3f77f7cefbb44c2fa59a857bbc121b0c3cdb0063f8" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}
diff --git a/doc/use.rst b/doc/use.rst index 6aea93d8a..78a74b93c 100644 --- a/doc/use.rst +++ b/doc/use.rst @@ -3,7 +3,7 @@ Using dodiscover ===================== -To be able to effectively use dodiscover, look at some of the examples here +To be able to effectively use dodiscover, look at some of the basic examples here to learn everything you need!
diff --git a/doc/whats_new/v0.1.rst b/doc/whats_new/v0.1.rst index 03343a54c..2bb8cd56d 100644 --- a/doc/whats_new/v0.1.rst +++ b/doc/whats_new/v0.1.rst @@ -37,6 +37,7 @@ Changelog - |Feature| Implement FCI algorithm, :class:`dodiscover.constraint.FCI` for learning causal structure from observational data with latent confounders under the ``dodiscover.constraint`` submodule, by `Adam Li`_ (:pr:`52`) - |Feature| Implement Structural Hamming Distance metric to compare directed graphs, :func:`dodiscover.metrics.structure_hamming_dist`, by `Adam Li`_ (:pr:`55`) - |Fix| Update dependency on networkx, which removes a PR branch dependency with pywhy-graphs having the MixedEdgeGraph class that was causing a dependency conflict, by `Adam Li`_ (:pr:`74`) +- |Chore| Add tutorial for PC algorithm with Asia data, by `Robert Osazuwa Ness`_ (:pr:`67`) Code and Documentation Contributors ----------------------------------- diff --git a/pyproject.toml b/pyproject.toml index b8c889e65..20d97b43c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ importlib-resources = { version = "*", python = "<3.9" } networkx = "^2.8.8" pywhy-graphs = { git = "https://github.com/py-why/pywhy-graphs.git", branch = 'main', optional = true } pygraphviz = { version = "*", optional = true } +bnlearn = { git = "https://github.com/erdogant/bnlearn.git", branch = 'master', optional = true } [tool.poetry.group.style] optional = true @@ -96,6 +97,7 @@ tqdm = { version = "^4.64.0" } # needed in dowhy's package [tool.poetry.extras] graph_func = ['pywhy-graphs'] viz = ['pygraphviz'] +data = ['bnlearn'] [tool.portray] output_dir = ['site']