# PyKale Tutorial: Domain Adaptation (Generalization) for Autism Detection with Multi-site Brain Imaging Data
| [Open in Colab](https://colab.research.google.com/github/sz144/pykale/blob/brain-example/examples/autism_detection/tutorial.ipynb) (click `Runtime` → `Run all (Ctrl+F9)`) |  [Launch Binder](https://mybinder.org/v2/gh/pykale/pykale/HEAD?filepath=examples%2Fautism_detection%2Ftutorial.ipynb) (click `Run` → `Run All Cells`) |

## Overview

- Pre-processing:
    - Data loading
    - Construct brain networks
- Machine learning pipeline:
    - Baseline: Ridge classifier
    - Covariate-dependence regularized least squares (CoDeRLS)

**Reference:**

Zhou, S., Li, W., Cox, C.R., & Lu, H. (2020). Side Information Dependence as a Regularizer for Analyzing Human Brain Conditions across Cognitive Experiments. *AAAI*. [[Link](https://ojs.aaai.org//index.php/AAAI/article/view/6179)]

## Setup

In [None]:
# On Colab, install PyKale from the tutorial branch and clone the repository
# so that the example's local files are available.
if 'google.colab' in str(get_ipython()):
    print('Running on CoLab')
    !pip install git+https://github.com/sz144/pykale.git@brain-example
    !git clone -b brain-example https://github.com/sz144/pykale.git
    # Work from the example folder so that `config` (imported below) and the
    # relative path `configs/tutorial.yaml` resolve.
    %cd pykale/examples/autism_detection
else:
    print('Not running on CoLab')

This imports required modules.

In [1]:
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from nilearn.datasets import fetch_abide_pcp
import pandas as pd
from config import get_cfg_defaults

import sys
sys.path.insert(0, os.path.abspath('../..'))

from kale.utils.download import download_file_by_url
from kale.loaddata.image_access import read_dicom_images
from kale.interpret import visualize

  warn("Fetchers from the nilearn.datasets module will be "


In [22]:
cfg_path = "configs/tutorial.yaml" # Path to the `.yaml` config file (relative to the working directory)

# Start from the defaults provided by the local `config` module, then overlay
# the values from the YAML file.
cfg = get_cfg_defaults()
cfg.merge_from_file(cfg_path)
# NOTE(review): presumably a yacs-style config, where freeze() makes it
# read-only for the rest of the notebook — confirm against `config.py`.
cfg.freeze()
print(cfg)

DATASET:
  ATLAS: rois_cc200
  PIPELINE: cpac
  ROOT: ../data
  SITE_IDS: ['NYU', 'UM_1', 'UCLA_1', 'USM']
  TARGET: NYU
MODEL:
  ALPHA: 1.0
  KERNEL: linear
  LAMBDA_: 1000.0
  LOSS: logits
  LR: 1e-05
OUTPUT:
  ROOT: ./outputs


## Data Preparation


### Fetch ABIDE fMRI timeseries

In [3]:
# Pull the dataset settings out of the config; these names are reused by later cells.
root_dir = cfg.DATASET.ROOT  # directory passed to the fetcher as `data_dir`
pipeline = cfg.DATASET.PIPELINE  # fmri pre-processing pipeline
atlas = cfg.DATASET.ATLAS  # derivative to fetch (ROI timeseries for the chosen atlas)
site_ids = cfg.DATASET.SITE_IDS  # NOTE: currently unused, because the SITE_ID filter below is disabled
abide = fetch_abide_pcp(data_dir=root_dir, pipeline=pipeline, 
                        band_pass_filtering=True, global_signal_regression=False, 
                        derivatives=atlas, quality_checked=False,
                        # Site filter intentionally disabled, so cfg.DATASET.SITE_IDS is not applied here:
                        # SITE_ID=site_ids,
                        verbose=0)

  output = genfromtxt(fname, **kwargs)


### Read Phenotypic data

In [4]:
# Phenotypic CSV, located in the `ABIDE_pcp` folder under the dataset root.
pheno_file = os.path.join(cfg.DATASET.ROOT, "ABIDE_pcp/Phenotypic_V1_0b_preprocessed1.csv")
# The first CSV column is used as the row index; later cells look subjects up by it.
pheno_info = pd.read_csv(pheno_file, index_col=0)

View Phenotypic data

In [5]:
# Preview the first rows of the phenotypic table.
pheno_info.head()

Unnamed: 0,Unnamed: 0.1,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,...,qc_notes_rater_1,qc_anat_rater_2,qc_anat_notes_rater_2,qc_func_rater_2,qc_func_notes_rater_2,qc_anat_rater_3,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP
0,1,50002,1,50002,PITT,no_filename,1,1,16.77,1,...,,OK,,fail,ic-parietal-cerebellum,OK,,fail,ERROR #24,1
1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,1,...,,OK,,OK,,OK,,OK,,1
2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,1,...,,OK,,OK,,OK,,OK,,1
3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,2,...,,OK,,maybe,ic-parietal-cerebellum,OK,,OK,,0
4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,1,...,,OK,,maybe,ic-parietal slight,OK,,OK,,1


### Read timeseries from files

In [6]:
# Folder holding one `<FILE_ID>_<atlas>.1D` timeseries file per subject.
data_dir = os.path.join(root_dir, "ABIDE_pcp/%s/filt_noglobal" % pipeline)

# `use_idx` records the phenotypic-table index of every subject whose file was
# found, in the same order as the arrays collected in `time_series`.
use_idx, time_series = [], []
for subject_idx, file_id in pheno_info["FILE_ID"].items():
    ts_path = os.path.join(data_dir, "%s_%s.1D" % (file_id, atlas))
    # Subjects without a timeseries file on disk are skipped.
    if not os.path.exists(ts_path):
        continue
    time_series.append(np.loadtxt(ts_path, skiprows=0))
    use_idx.append(subject_idx)

Use `DX_GROUP` (autism vs. control) as the class labels and `SITE_ID` (the acquisition site) as the covariate.

In [7]:
# Keep only the subjects whose timeseries were loaded (`use_idx`), and reset the
# index so that row k of `pheno` lines up positionally with `time_series[k]`.
pheno = pheno_info.loc[use_idx, ["SITE_ID", "DX_GROUP"]].reset_index(drop=True)

### Extracting Brain Network Features

In [8]:
from nilearn.connectome import ConnectivityMeasure

# Build one correlation-based connectivity feature vector per subject from the
# ROI timeseries; `vectorize=True` requests flattened vectors rather than
# square matrices, so `brain_networks` can be used directly as a feature matrix.
correlation_measure = ConnectivityMeasure(kind='correlation', vectorize=True)
brain_networks = correlation_measure.fit_transform(time_series)

## Machine Learning for Multi-site Data

### Cross validation Pipeline

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import torch
from torch.nn.functional import one_hot

def cross_validation(x, y, covariates, estimator, domain_adaptation=False, domain_generalization=False):
    """Leave-one-group-out cross validation over the values of ``covariates``.

    Each unique covariate value (e.g. an acquisition site) is held out in turn
    as the target domain. The estimator is fitted on the remaining (source)
    samples and its predictions on the held-out samples are scored.

    Args:
        x: Feature matrix with one row per sample; must support NumPy-style
            indexing and expose ``shape``.
        y: Labels, one per sample.
        covariates: Group identifier for each sample (array-like or pandas Series).
        estimator: Object with ``fit`` and ``predict``. In the two domain modes
            ``fit`` is additionally given a one-hot covariate matrix as third
            argument.
        domain_adaptation: If True, ``fit`` receives source and target features
            (source rows first) together with the source labels only, plus the
            covariates of all those rows.
        domain_generalization: If True, ``fit`` receives the source samples and
            their covariates only. Takes precedence over ``domain_adaptation``
            when both flags are set.

    Returns:
        pandas.DataFrame with columns ``Target``, ``Num_samples`` and
        ``Accuracy``: one row per held-out group, followed by an ``"Average"``
        row holding the sample-weighted mean accuracy.
    """
    # Plain arrays guarantee positional indexing even when the caller passes
    # pandas objects carrying their own index.
    y = np.asarray(y)
    covariates = np.asarray(covariates)

    results = {"Target": [], "Num_samples": [], "Accuracy": []}
    unique_covariates = np.unique(covariates)
    # One-hot encoding of group membership, one row per sample.
    covariate_mat = one_hot(torch.as_tensor(LabelEncoder().fit_transform(covariates)))

    for tgt in unique_covariates:
        idx_tgt = np.where(covariates == tgt)
        idx_src = np.where(covariates != tgt)
        # Slice the function's own input `x`, not a notebook-level global.
        x_tgt, x_src = x[idx_tgt], x[idx_src]
        y_tgt, y_src = y[idx_tgt], y[idx_src]

        if domain_generalization:
            estimator.fit(x_src, y_src, covariate_mat[idx_src])
        elif domain_adaptation:
            # Target features are passed without labels: only `y_src` is given,
            # and the source rows come first in the concatenated inputs.
            estimator.fit(np.concatenate((x_src, x_tgt)), y_src,
                          np.concatenate((covariate_mat[idx_src], covariate_mat[idx_tgt])))
        else:
            estimator.fit(x_src, y_src)

        y_pred = estimator.predict(x_tgt)
        results["Accuracy"].append(accuracy_score(y_tgt, y_pred))
        results["Target"].append(tgt)
        results["Num_samples"].append(x_tgt.shape[0])

    # Weight each group's accuracy by its size to get the overall accuracy.
    n_total = x.shape[0]
    mean_acc = sum(n * acc for n, acc in zip(results["Num_samples"], results["Accuracy"]))
    mean_acc /= n_total
    results["Target"].append("Average")
    results["Num_samples"].append(n_total)
    results["Accuracy"].append(mean_acc)

    return pd.DataFrame(results)

### Baseline

In [10]:
from sklearn.linear_model import RidgeClassifier

# Baseline: a plain ridge classifier trained on the source sites only, with no
# covariate information passed to `fit`.
estimator = RidgeClassifier()
res_df = cross_validation(
    x=brain_networks,
    y=pheno["DX_GROUP"].values,
    covariates=pheno["SITE_ID"],
    estimator=estimator,
)

In [11]:
# Display the baseline results table returned by `cross_validation`.
res_df

Unnamed: 0,Target,Num_samples,Accuracy
0,CALTECH,37,0.567568
1,CMU,27,0.740741
2,KKI,48,0.6875
3,LEUVEN_1,29,0.689655
4,LEUVEN_2,34,0.676471
5,MAX_MUN,52,0.576923
6,NYU,175,0.714286
7,OHSU,26,0.692308
8,OLIN,34,0.617647
9,PITT,56,0.678571


### Domain Adaptation

In [23]:
from kale.pipeline.multi_domain_adapter import _CoDeRLS
# Domain adaptation: the held-out site's features (without labels) and the
# one-hot site covariates are given to the estimator during fitting.
estimator = _CoDeRLS(
    kernel=cfg.MODEL.KERNEL,
    lambda_=cfg.MODEL.LAMBDA_,
    alpha=cfg.MODEL.ALPHA,
)
res_df = cross_validation(
    x=brain_networks,
    y=pheno["DX_GROUP"].values,
    covariates=pheno["SITE_ID"],
    estimator=estimator,
    domain_adaptation=True,
)

In [24]:
# Display the domain-adaptation results table returned by `cross_validation`.
res_df

Unnamed: 0,Target,Num_samples,Accuracy
0,CALTECH,37,0.621622
1,CMU,27,0.703704
2,KKI,48,0.708333
3,LEUVEN_1,29,0.689655
4,LEUVEN_2,34,0.735294
5,MAX_MUN,52,0.596154
6,NYU,175,0.725714
7,OHSU,26,0.730769
8,OLIN,34,0.676471
9,PITT,56,0.660714


### Domain Generalization

In [25]:
from kale.pipeline.multi_domain_adapter import _CoDeRLS
# Domain generalization: only the source sites' samples and their one-hot site
# covariates are used for fitting; the held-out site is never seen in training.
estimator = _CoDeRLS(
    kernel=cfg.MODEL.KERNEL,
    lambda_=cfg.MODEL.LAMBDA_,
    alpha=cfg.MODEL.ALPHA,
)
res_df = cross_validation(
    x=brain_networks,
    y=pheno["DX_GROUP"].values,
    covariates=pheno["SITE_ID"],
    estimator=estimator,
    domain_generalization=True,
)

In [26]:
# Display the domain-generalization results table returned by `cross_validation`.
res_df

Unnamed: 0,Target,Num_samples,Accuracy
0,CALTECH,37,0.621622
1,CMU,27,0.703704
2,KKI,48,0.645833
3,LEUVEN_1,29,0.62069
4,LEUVEN_2,34,0.764706
5,MAX_MUN,52,0.596154
6,NYU,175,0.725714
7,OHSU,26,0.615385
8,OLIN,34,0.676471
9,PITT,56,0.696429
