Merge pull request #395 from pykale/add_main_multisite_neuroimg_adapt

Create main.py, cross_validation for multisite_neuroimg_adapt example
pykale · Oct 4, 2023 · cc561ef · cc561ef
2 parents 0139b14 + d47c5d1
commit cc561ef
Show file tree

Hide file tree

Showing 7 changed files with 254 additions and 82 deletions.
diff --git a/docs/source/kale.evaluate.rst b/docs/source/kale.evaluate.rst
@@ -6,6 +6,14 @@ Evaluate
 Submodules
 ----------
 
+kale.evaluate.cross\_validation module
+--------------------------------------
+
+.. automodule:: kale.evaluate.cross_validation
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 kale.evaluate.metrics module
 ----------------------------
 

diff --git a/examples/multisite_neuroimg_adapt/README.md b/examples/multisite_neuroimg_adapt/README.md
@@ -33,14 +33,10 @@ autism detection.
 
 ### References
 
-[1] Cameron Craddock, Yassine Benhajali, Carlton Chu, Francois Chouinard, Alan Evans, András Jakab, Budhachandra Singh
-Khundrakpam, John David Lewis, Qingyang Li, Michael Milham, Chaogan Yan, Pierre Bellec (2013). The Neuro Bureau
-Preprocessing Initiative: open sharing of preprocessed neuroimaging data and derivatives. In *Neuroinformatics 2013*,
-Stockholm, Sweden.
+[1] Craddock C., Benhajali Y., Chu C., Chouinard F., Evans A., Jakab A., Khundrakpam BS., Lewis JD., Li Q., Milham M., Yan C. and Bellec P. (2013). [The Neuro Bureau Preprocessing Initiative: Open Sharing of Preprocessed Neuroimaging Data and Derivatives](https://doi.org/10.3389/conf.fninf.2013.09.00041). Frontiers in Neuroinformatics, 7.
 
-[2] Abraham, A., Pedregosa, F., Eickenberg, M., Gervais, P., Mueller, A., Kossaifi, J., ... & Varoquaux, G. (2014).
-Machine learning for neuroimaging with scikit-learn. *Frontiers in neuroinformatics*, 14.
+[2] Abraham A., Pedregosa F., Eickenberg M., Gervais P., Mueller A., Kossaifi J., Gramfort A., Thirion B. and Varoquaux G. (2014). [Machine Learning for Neuroimaging with scikit-learn](https://doi.org/10.3389/fninf.2014.00014). Frontiers in Neuroinformatics, 8.
 
-[3] Zhou, S., Li, W., Cox, C.R., & Lu, H. (2020). [Side Information Dependence as a Regularizer for Analyzing Human Brain Conditions across Cognitive Experiments](https://ojs.aaai.org//index.php/AAAI/article/view/6179). in *AAAI 2020*, New York, USA.
+[3] Zhou S., Li W., Cox C. and Lu H. (2020). [Side Information Dependence as a Regularizer for Analyzing Human Brain Conditions across Cognitive Experiments](https://doi.org/10.1609/aaai.v34i04.6179). Proceedings of the AAAI Conference on Artificial Intelligence, 34(04), 6957-6964.
 
-[4] Zhou, S. (2022). [Interpretable Domain-Aware Learning for Neuroimage Classification](https://etheses.whiterose.ac.uk/31044/1/PhD_thesis_ShuoZhou_170272834.pdf) (Doctoral dissertation, University of Sheffield).
+[4] Zhou S. (2022). [Interpretable Domain-Aware Learning for Neuroimage Classification](https://etheses.whiterose.ac.uk/31044/1/PhD_thesis_ShuoZhou_170272834.pdf) (Doctoral Dissertation, University of Sheffield).
diff --git a/examples/multisite_neuroimg_adapt/config.py b/examples/multisite_neuroimg_adapt/config.py
@@ -20,6 +20,11 @@
 # options: {rois_aal, rois_cc200, rois_cc400, rois_dosenbach160, rois_ez, rois_ho, rois_tt}
 _C.DATASET.SITE_IDS = None  # list of site ids to use, if None, use all sites
 _C.DATASET.TARGET = "NYU"  # target site ids, e.g. "UM_1", "UCLA_1", "USM"
+# ---------------------------------------------------------------------------- #
+# Solver
+# ---------------------------------------------------------------------------- #
+_C.SOLVER = CfgNode()
+_C.SOLVER.SEED = 2023
 # ---------------------------------------------------------------------------- #
 # Machine learning pipeline
 # ---------------------------------------------------------------------------- #

diff --git a/examples/multisite_neuroimg_adapt/main.py b/examples/multisite_neuroimg_adapt/main.py
@@ -0,0 +1,97 @@
+"""
+Autism Detection: Domain Adaptation for Multi-Site Neuroimaging Data Analysis
+
+References:
+[1] Craddock C., Benhajali Y., Chu C., Chouinard F., Evans A., Jakab A., Khundrakpam BS., Lewis JD., Li Q., Milham M., Yan C. and Bellec P. (2013). The Neuro Bureau Preprocessing Initiative: Open Sharing of Preprocessed Neuroimaging Data and Derivatives. Frontiers in Neuroinformatics, 7. https://doi.org/10.3389/conf.fninf.2013.09.00041
+
+[2] Abraham A., Pedregosa F., Eickenberg M., Gervais P., Mueller A., Kossaifi J., Gramfort A., Thirion B. and Varoquaux G. (2014). Machine Learning for Neuroimaging with scikit-learn. Frontiers in Neuroinformatics, 8. https://doi.org/10.3389/fninf.2014.00014
+
+[3] Zhou S., Li W., Cox C. and Lu H. (2020). Side Information Dependence as a Regularizer for Analyzing Human Brain Conditions across Cognitive Experiments. Proceedings of the AAAI Conference on Artificial Intelligence, 34(04), 6957-6964. https://doi.org/10.1609/aaai.v34i04.6179
+
+[4] Zhou S. (2022). Interpretable Domain-Aware Learning for Neuroimage Classification (Doctoral Dissertation, University of Sheffield). https://etheses.whiterose.ac.uk/31044/1/PhD_thesis_ShuoZhou_170272834.pdf
+"""
+import argparse
+import os
+
+import numpy as np
+import pandas as pd
+from config import get_cfg_defaults
+from nilearn.connectome import ConnectivityMeasure
+from nilearn.datasets import fetch_abide_pcp
+from sklearn.linear_model import RidgeClassifier
+
+import kale.utils.seed as seed
+from kale.evaluate import cross_validation
+from kale.pipeline.multi_domain_adapter import CoIRLS
+
+
+def arg_parse():
+    parser = argparse.ArgumentParser(
+        description="Autism Detection: Domain Adaptation for Multi-Site Neuroimaging Data Analysis"
+    )
+    parser.add_argument("--cfg", required=True, help="path to config file", type=str)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = arg_parse()
+
+    # ---- Set up configs ----
+    cfg = get_cfg_defaults()
+    cfg.merge_from_file(args.cfg)
+    cfg.freeze()
+    seed.set_seed(cfg.SOLVER.SEED)
+
+    # ---- Fetch ABIDE fMRI timeseries ----
+    fetch_abide_pcp(
+        data_dir=cfg.DATASET.ROOT,
+        pipeline=cfg.DATASET.PIPELINE,
+        band_pass_filtering=True,
+        global_signal_regression=False,
+        derivatives=cfg.DATASET.ATLAS,
+        quality_checked=False,
+        SITE_ID=cfg.DATASET.SITE_IDS,
+        verbose=1,
+    )
+
+    # ---- Read Phenotypic data ----
+    pheno_file = os.path.join(cfg.DATASET.ROOT, "ABIDE_pcp/Phenotypic_V1_0b_preprocessed1.csv")
+    pheno_info = pd.read_csv(pheno_file, index_col=0)
+
+    # ---- Read timeseries from files ----
+    data_dir = os.path.join(cfg.DATASET.ROOT, "ABIDE_pcp/%s/filt_noglobal" % cfg.DATASET.PIPELINE)
+    use_idx = []
+    time_series = []
+    for i in pheno_info.index:
+        data_file_name = "%s_%s.1D" % (pheno_info.loc[i, "FILE_ID"], cfg.DATASET.ATLAS)
+        data_path = os.path.join(data_dir, data_file_name)
+        if os.path.exists(data_path):
+            time_series.append(np.loadtxt(data_path, skiprows=0))
+            use_idx.append(i)
+
+    # ---- Use "DX_GROUP" (autism vs control) as labels, and "SITE_ID" as covariates ----
+    pheno = pheno_info.loc[use_idx, ["SITE_ID", "DX_GROUP"]].reset_index(drop=True)
+
+    # ---- Extracting Brain Networks Features ----
+    correlation_measure = ConnectivityMeasure(kind="correlation", vectorize=True)
+    brain_networks = correlation_measure.fit_transform(time_series)
+
+    # ---- Machine Learning for Multi-site Data ----
+    print("Baseline")
+    estimator = RidgeClassifier()
+    results = cross_validation.leave_one_group_out(
+        brain_networks, pheno["DX_GROUP"].values, pheno["SITE_ID"].values, estimator
+    )
+    print(pd.DataFrame.from_dict(results))
+
+    print("Domain Adaptation")
+    estimator = CoIRLS(kernel=cfg.MODEL.KERNEL, lambda_=cfg.MODEL.LAMBDA_, alpha=cfg.MODEL.ALPHA)
+    results = cross_validation.leave_one_group_out(
+        brain_networks, pheno["DX_GROUP"].values, pheno["SITE_ID"].values, estimator, use_domain_adaptation=True
+    )
+    print(pd.DataFrame.from_dict(results))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/multisite_neuroimg_adapt/tutorial.ipynb b/examples/multisite_neuroimg_adapt/tutorial.ipynb
@@ -20,18 +20,18 @@
         "    - [Data loading](#Data-Preparation)\n",
         "    - [Construct brain networks](#Extracting-Brain-Networks-Features)\n",
         "- Machine learning pipeline:\n",
-        "    - [Baseline: Ridge classifier](#Baseline)\n",
+        "    - [Baseline: Ridge classifier](#Baseline-Model)\n",
         "    - [Domain adaptation](#Domain-Adaptation)\n",
         "\n",
         "**Reference:**\n",
         "\n",
-        "[1] Cameron Craddock, Yassine Benhajali, Carlton Chu, Francois Chouinard, Alan Evans, Andr\u00e1s Jakab, Budhachandra Singh Khundrakpam, John David Lewis, Qingyang Li, Michael Milham, Chaogan Yan, Pierre Bellec (2013). The Neuro Bureau Preprocessing Initiative: open sharing of preprocessed neuroimaging data and derivatives. In *Neuroinformatics 2013*, Stockholm, Sweden.\n",
+        "[1] Craddock C., Benhajali Y., Chu C., Chouinard F., Evans A., Jakab A., Khundrakpam BS., Lewis JD., Li Q., Milham M., Yan C. and Bellec P. (2013). [The Neuro Bureau Preprocessing Initiative: Open Sharing of Preprocessed Neuroimaging Data and Derivatives](https://doi.org/10.3389/conf.fninf.2013.09.00041). Frontiers in Neuroinformatics, 7.\n",
         "\n",
-        "[2] Abraham, A., Pedregosa, F., Eickenberg, M., Gervais, P., Mueller, A., Kossaifi, J., ... & Varoquaux, G. (2014). Machine learning for neuroimaging with scikit-learn. *Frontiers in neuroinformatics*, 14.\n",
+        "[2] Abraham A., Pedregosa F., Eickenberg M., Gervais P., Mueller A., Kossaifi J., Gramfort A., Thirion B. and Varoquaux G. (2014). [Machine Learning for Neuroimaging with scikit-learn](https://doi.org/10.3389/fninf.2014.00014). Frontiers in Neuroinformatics, 8.\n",
         "\n",
-        "[3] Zhou, S., Li, W., Cox, C.R., & Lu, H. (2020). [Side Information Dependence as a Regularizer for Analyzing Human Brain Conditions across Cognitive Experiments](https://ojs.aaai.org//index.php/AAAI/article/view/6179). in *AAAI 2020*, New York, USA. \n",
+        "[3] Zhou S., Li W., Cox C. and Lu H. (2020). [Side Information Dependence as a Regularizer for Analyzing Human Brain Conditions across Cognitive Experiments](https://doi.org/10.1609/aaai.v34i04.6179). Proceedings of the AAAI Conference on Artificial Intelligence, 34(04), 6957-6964.\n",
         "\n",
-        "[4] Zhou, S. (2022). [Interpretable Domain-Aware Learning for Neuroimage Classification](https://etheses.whiterose.ac.uk/31044/1/PhD_thesis_ShuoZhou_170272834.pdf) (Doctoral dissertation, University of Sheffield)."
+        "[4] Zhou S. (2022). [Interpretable Domain-Aware Learning for Neuroimage Classification](https://etheses.whiterose.ac.uk/31044/1/PhD_thesis_ShuoZhou_170272834.pdf) (Doctoral Dissertation, University of Sheffield)."
       ],
       "cell_type": "markdown"
     },
@@ -50,13 +50,21 @@
         "    !pip uninstall --yes imgaug && pip uninstall --yes albumentations && pip install git+https://github.com/aleju/imgaug.git\n",
         "    !git clone https://github.com/pykale/pykale.git\n",
         "    %cd pykale\n",
-        "    !pip install .[image,example] \n",
+        "    !pip install .[image,example]\n",
         "    %cd examples/multisite_neuroimg_adapt\n",
         "else:\n",
         "    print('Not running on CoLab')"
       ],
       "cell_type": "code",
-      "outputs": [],
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Not running on CoLab\n"
+          ]
+        }
+      ],
       "execution_count": null
     },
     {
@@ -70,18 +78,16 @@
       "metadata": {},
       "source": [
         "import os\n",
-        "\n",
-        "%matplotlib inline\n",
-        "import matplotlib.pyplot as plt\n",
         "import numpy as np\n",
-        "from nilearn.datasets import fetch_abide_pcp\n",
         "import pandas as pd\n",
         "from config import get_cfg_defaults\n",
+        "from nilearn.connectome import ConnectivityMeasure\n",
+        "from nilearn.datasets import fetch_abide_pcp\n",
+        "from sklearn.linear_model import RidgeClassifier\n",
         "\n",
-        "import sys\n",
-        "\n",
-        "from kale.utils.download import download_file_by_url\n",
-        "from kale.interpret import visualize"
+        "import kale.utils.seed as seed\n",
+        "from kale.evaluate import cross_validation\n",
+        "from kale.pipeline.multi_domain_adapter import CoIRLS"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -90,11 +96,12 @@
     {
       "metadata": {},
       "source": [
-        "cfg_path = \"configs/tutorial.yaml\" # Path to `.yaml` config file\n",
-        "\n",
+        "# Path to `.yaml` config file\n",
+        "cfg_path = \"configs/tutorial.yaml\" \n",
         "cfg = get_cfg_defaults()\n",
         "cfg.merge_from_file(cfg_path)\n",
         "cfg.freeze()\n",
+        "seed.set_seed(cfg.SOLVER.SEED)\n",
         "print(cfg)"
       ],
       "cell_type": "code",
@@ -117,10 +124,10 @@
         "pipeline = cfg.DATASET.PIPELINE  # fmri pre-processing pipeline\n",
         "atlas = cfg.DATASET.ATLAS\n",
         "site_ids = cfg.DATASET.SITE_IDS\n",
-        "abide = fetch_abide_pcp(data_dir=root_dir, pipeline=pipeline, \n",
-        "                        band_pass_filtering=True, global_signal_regression=False, \n",
+        "abide = fetch_abide_pcp(data_dir=root_dir, pipeline=pipeline,\n",
+        "                        band_pass_filtering=True, global_signal_regression=False,\n",
         "                        derivatives=atlas, quality_checked=False,\n",
-        "                        SITE_ID=site_ids, \n",
+        "                        SITE_ID=site_ids,\n",
         "                        verbose=0)"
       ],
       "cell_type": "code",
@@ -210,8 +217,6 @@
     {
       "metadata": {},
       "source": [
-        "from nilearn.connectome import ConnectivityMeasure\n",
-        "\n",
         "correlation_measure = ConnectivityMeasure(kind='correlation', vectorize=True)\n",
         "brain_networks = correlation_measure.fit_transform(time_series)"
       ],
@@ -228,50 +233,6 @@
       ],
       "cell_type": "markdown"
     },
-    {
-      "metadata": {},
-      "source": [
-        "from sklearn.metrics import accuracy_score\n",
-        "from sklearn.preprocessing import OneHotEncoder\n",
-        "import torch\n",
-        "\n",
-        "def cross_validation(x, y, covariates, estimator, domain_adaptation=False):\n",
-        "    results = {\"Target\": [], \"Num_samples\": [], \"Accuracy\": []}\n",
-        "    unique_covariates = np.unique(covariates)\n",
-        "    n_covariates = len(unique_covariates)\n",
-        "    enc = OneHotEncoder(handle_unknown=\"ignore\")\n",
-        "    covariate_mat = enc.fit_transform(covariates.reshape(-1, 1)).toarray()\n",
-        "    \n",
-        "    for tgt in unique_covariates:\n",
-        "        idx_tgt = np.where(covariates == tgt)\n",
-        "        idx_src = np.where(covariates != tgt)\n",
-        "        x_tgt = brain_networks[idx_tgt]\n",
-        "        x_src = brain_networks[idx_src]\n",
-        "        y_tgt = y[idx_tgt]\n",
-        "        y_src = y[idx_src]        \n",
-        "        \n",
-        "        if domain_adaptation:\n",
-        "            estimator.fit(np.concatenate((x_src, x_tgt)), y_src, \n",
-        "                          np.concatenate((covariate_mat[idx_src], covariate_mat[idx_tgt])))\n",
-        "        else:            \n",
-        "            estimator.fit(x_src, y_src)\n",
-        "        y_pred = estimator.predict(x_tgt)\n",
-        "        results[\"Accuracy\"].append(accuracy_score(y_tgt, y_pred))\n",
-        "        results[\"Target\"].append(tgt)\n",
-        "        results[\"Num_samples\"].append(x_tgt.shape[0])\n",
-        "    \n",
-        "    mean_acc = sum([results[\"Num_samples\"][i] * results[\"Accuracy\"][i] for i in range(n_covariates)])\n",
-        "    mean_acc /= x.shape[0]\n",
-        "    results[\"Target\"].append(\"Average\")\n",
-        "    results[\"Num_samples\"].append(x.shape[0])\n",
-        "    results[\"Accuracy\"].append(mean_acc)\n",
-        "    \n",
-        "    return pd.DataFrame(results)"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
-    },
     {
       "metadata": {},
       "source": [
@@ -282,10 +243,10 @@
     {
       "metadata": {},
       "source": [
-        "from sklearn.linear_model import RidgeClassifier\n",
-        "\n",
         "estimator = RidgeClassifier()\n",
-        "res_df = cross_validation(brain_networks, pheno[\"DX_GROUP\"].values, pheno[\"SITE_ID\"].values, estimator)"
+        "results = cross_validation.leave_one_group_out(\n",
+        "    brain_networks, pheno[\"DX_GROUP\"].values, pheno[\"SITE_ID\"].values, estimator\n",
+        ")"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -294,7 +255,7 @@
     {
       "metadata": {},
       "source": [
-        "res_df"
+        "pd.DataFrame.from_dict(results)"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -310,10 +271,10 @@
     {
       "metadata": {},
       "source": [
-        "from kale.pipeline.multi_domain_adapter import CoIRLS\n",
         "estimator = CoIRLS(kernel=cfg.MODEL.KERNEL, lambda_=cfg.MODEL.LAMBDA_, alpha=cfg.MODEL.ALPHA)\n",
-        "res_df = cross_validation(brain_networks, pheno[\"DX_GROUP\"].values, pheno[\"SITE_ID\"].values, \n",
-        "                          estimator, domain_adaptation=True)"
+        "results = cross_validation.leave_one_group_out(\n",
+        "  brain_networks, pheno[\"DX_GROUP\"].values, pheno[\"SITE_ID\"].values, estimator, use_domain_adaptation=True\n",
+        ")"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -322,7 +283,7 @@
     {
       "metadata": {},
       "source": [
-        "res_df"
+        "pd.DataFrame.from_dict(results)"
       ],
       "cell_type": "code",
       "outputs": [],