# Run independent components analysis (ICA)

We clean the data using ICA and then apply some further preprocessing steps to the ICA cleaned data.

**Performed steps**

- load concatenated raw data
- preprocessing the data for ICA
    - highpass filtering the data at 1Hz
    - downsampling the data to 100Hz (applying appropriate lowpass filter to prevent aliasing)
- running ICA on non-broken EEG channels only
- Using the EOG and ECG channels to automatically mark bad components in the ICA
- manually inspect all components, using automatically marked ones as guidance
    - select additional bad components
    - deselect false positive bad components from automatic marking
- Save the ICA object
- Apply the ICA to the concatenated raw data 
    - load the data fresh!
    - this is NOT the data preprocessed for ICA
- preprocess ICA cleaned data with the following steps:
    - bandpass filtering
    - interpolation of bad channels
    - re-referencing to average
- Save the ICA cleaned, preprocessed data

**Script can be run in INTERACTIVE or NON-INTERACTIVE mode!**

- --> set the `INTERACTIVE` variable to True or False
- --> you need to run with `INTERACTIVE=True` to screen the data and decide which components to reject. After running interactively once, these components to be rejected are saved and the script can be run non-interactively if you wish so.

In [None]:
%matplotlib qt

In [None]:
import itertools
import multiprocessing
import os
import os.path as op
import sys

import mne
import numpy as np

from utils import BIDS_ROOT

In [None]:
# IO: file name templates, to be filled in per subject via `.format(subj)`
# All files of a subject live in that subject's folder below "derivatives"
_subj_deriv_dir = op.join(BIDS_ROOT, "derivatives", "sub-{0:02}")

# Input: the concatenated raw data
fname_rawconcat_template = op.join(
    _subj_deriv_dir, "sub-{0:02}_concat_eeg-raw.fif.gz"
)

# Outputs: the ICA-cleaned raw data ...
fname_rawclean_template = op.join(_subj_deriv_dir, "sub-{0:02}_clean_eeg-raw.fif.gz")

# ... the ICA object before and after screening ...
fname_ica_unscreened_template = op.join(
    _subj_deriv_dir, "sub-{0:02}_concat_eeg-unscreened-ica.fif.gz"
)
fname_ica_screened_template = op.join(
    _subj_deriv_dir, "sub-{0:02}_concat_eeg-screened-ica.fif.gz"
)

# ... and the text file listing the ICA components to exclude
fname_excluded_comps_template = op.join(
    _subj_deriv_dir, "sub-{0:02}_concat_eeg-excluded-ica-comps.txt"
)

# Collect all templates in one dict, so they can be passed around together
name_templates = {
    "rawconcat": fname_rawconcat_template,
    "rawclean": fname_rawclean_template,
    "ica_unscreened": fname_ica_unscreened_template,
    "ica_screened": fname_ica_screened_template,
    "excluded_comps": fname_excluded_comps_template,
}

In [None]:
# Set here whether to run the script interactively or not
# - False: the non-interactive cell below reads the saved components to
#   exclude for every subject, writes the cleaned data, and then exits
# - True: the non-interactive cell is skipped and a single subject is
#   screened by hand in the interactive cells
INTERACTIVE = False

In [None]:
# Subject IDs to process: 1 through 40 (the stop value of `range` is exclusive)
subjects = range(1, 41)

# Number of subjects to process in parallel: all CPUs but 8, and at least 1
# NOTE: The ICA of a single subject is not parallelized. However, several
# sub-calls of the ICA may do multithreaded MKL computations. That can be
# controlled by exporting `MKL_NUM_THREADS=1` as an environment variable.
NJOBS = max(multiprocessing.cpu_count() - 8, 1)

# If True, existing output files are replaced instead of being kept
overwrite = False

# If True, "autoreject" estimates an amplitude threshold that is used to
# reject bad data segments prior to ICA fitting
use_autoreject = False

# Filter cutoffs for the data after ICA cleaning
LOW_CUTOFF, HIGH_CUTOFF = 0.1, 40.0

In [None]:
def preproc_data_for_ica(subj, name_templates):
    """Load the concatenated raw data of a subject and prepare it for ICA.

    The settings are fixed: the data are highpass filtered at 1 Hz and
    then resampled to a sampling frequency of 100 Hz.

    Parameters
    ----------
    subj : int
        The subject id to work on.
    name_templates : dict
        A dictionary of string templates. Needs the following keys:
        "rawconcat"

    Returns
    -------
    raw : mne.io.fiff.raw.Raw
        The raw data preprocessed for ICA.

    """
    # Read the subject's concatenated recording into memory
    raw_path = name_templates["rawconcat"].format(subj)
    raw = mne.io.read_raw_fif(raw_path, preload=True).filter(l_freq=1, h_freq=None)

    # Reduce the sampling rate of the highpassed data
    raw.resample(sfreq=100)
    return raw

In [None]:
def run_ica(raw, subj, name_templates, use_autoreject):
    """Fit an extended infomax ICA on raw data and mark artifact components.

    Optionally (see `use_autoreject`), the raw data are divided into fixed
    length epochs, epochs overlapping with "BAD" annotations are dropped, and
    an amplitude threshold is estimated that is passed to the ICA fit to
    reject further bad data. The ICA is fit on all channels that are neither
    marked as bad nor have "EOG" or "ECG" in their name. Lastly, the "VEOG",
    "HEOG", and "ECG" channels of the raw data are used to automatically mark
    presumably bad components.

    Parameters
    ----------
    raw : mne.io.fiff.raw.Raw
        The raw data preprocessed for ICA.
    subj : int
        The subject id to work on. Currently unused, kept so that existing
        callers keep working.
    name_templates : dict
        A dictionary of string templates. Currently unused (no key is read),
        kept so that existing callers keep working.
    use_autoreject : bool
        Whether or not to use "autoreject" to further clean the data
        prior to ICA fitting.

    Returns
    -------
    ica : mne.preprocessing.ica.ICA
        The fitted ica object. Its `exclude` attribute holds the sorted,
        unique indices of the automatically marked components.

    """
    # Length of data chunks for artifact rejection in seconds
    tstep = 1.0

    # Automatically estimate an amplitude threshold beyond which epochs
    # should be classified as bad: "autoreject"
    reject = None
    if use_autoreject:
        # make sure you ran pip install autoreject
        import autoreject

        # Make even length epochs out of the continuous data
        events = mne.make_fixed_length_events(raw, duration=tstep)
        epochs = mne.Epochs(
            raw, events, tmin=0.0, tmax=tstep, reject_by_annotation=True, baseline=None
        )

        # Drop epochs overlapping with "BAD" annotation
        epochs.drop_bad()

        # Automatically find a rejection threshold
        reject = autoreject.get_rejection_threshold(epochs, ch_types=["eeg"])

    # Initialize an ICA object, using extended infomax
    ica = mne.preprocessing.ICA(
        random_state=42, method="infomax", fit_params=dict(extended=True)
    )

    # Indices of the channels to fit on: everything that is not marked as bad
    # and is neither an EOG nor an ECG channel (identified by channel name)
    bad_names = set(raw.info["bads"])
    ica_chn_idxs = [
        idx
        for idx, ch_name in enumerate(raw.ch_names)
        if ch_name not in bad_names and "EOG" not in ch_name and "ECG" not in ch_name
    ]

    # Fit our raw (high passed, downsampled) data to our ica object
    # we pass the estimated rejection threshold to further exclude
    # bad data from the estimation
    ica.fit(
        raw, picks=ica_chn_idxs, reject=reject, tstep=tstep, reject_by_annotation=True
    )

    # Automatically find artifact components using the EOG and ECG data
    # NOTE: for find_bads_ecg, the default threshold of 0.25 is improved upon
    # in MNE 0.21 by adapting it automatically to the sampling rate. See:
    # https://github.com/mne-tools/mne-python/blob/ca3595f0b073ed4c75470eae634d046236007838/mne/preprocessing/ica.py#L1110-L1136
    # Manually running that code yields a threshold of `0.5` for sf=100Hz
    veog_idx, veog_scores = ica.find_bads_eog(raw, "VEOG")
    heog_idx, heog_scores = ica.find_bads_eog(raw, "HEOG")
    ecg_idx, ecg_scores = ica.find_bads_ecg(raw, "ECG", method="ctps", threshold=0.5)

    # Exclude the automatically identified components: merge the three
    # detections, drop duplicates, and store plain Python ints in ascending
    # order so that the result is the same on every run
    ica.exclude = sorted({int(idx) for idx in (*veog_idx, *heog_idx, *ecg_idx)})

    return ica

# Run ICA for all subjects

Save the results as "unscreened".

In [None]:
def run_parallel(subj, name_templates, overwrite, use_autoreject):
    """Compute the unscreened ICA of one subject and save it to disk.

    If the unscreened ICA file of the subject already exists, it is deleted
    first when `overwrite` is True; otherwise nothing is computed and the
    function returns right away.
    """
    ica_path = name_templates["ica_unscreened"].format(subj)

    # Decide what to do about a result from an earlier run
    found_on_disk = op.exists(ica_path)
    if found_on_disk and not overwrite:
        # keep the existing file, nothing to do
        return
    if found_on_disk:
        os.remove(ica_path)

    # Preprocess the raw data, then fit the ICA on it
    raw_for_ica = preproc_data_for_ica(subj, name_templates)
    fitted_ica = run_ica(raw_for_ica, subj, name_templates, use_autoreject)

    # Write the result to the "unscreened" location
    fitted_ica.save(ica_path)

In [None]:
# Run the pipeline in parallel over subjects
# Each subject gets its own argument tuple; only the subject id varies
pool_inputs = [
    (subj_id, name_templates, overwrite, use_autoreject) for subj_id in subjects
]

# Distribute the subjects over NJOBS worker processes
with multiprocessing.Pool(processes=NJOBS) as pool:
    pool.starmap(run_parallel, pool_inputs)

# Screen ICA results, apply to non-processed raw data

- Save the updated results as "screened"
- Also save the to-be-rejected ICA components in a TXT file (each component
  index on a separate line)
  

## Non-interactive way

In [None]:
# NON-INTERACTIVE way to run the script
# For every subject: read the saved components to exclude, store them in the
# ICA object ("screened" ICA), apply that ICA to the unprocessed raw data,
# then filter, interpolate, re-reference, and save the cleaned data.
if not INTERACTIVE:

    for subj in subjects:

        # Read components to exclude
        fname_comps_to_exclude = name_templates["excluded_comps"].format(subj)
        if not op.exists(fname_comps_to_exclude):
            raise RuntimeError(
                "The components to exlude file does not exist yet. "
                "You first have to run the script interactively and save the files."
            )

        # The file holds one component index per line. Blank lines (e.g., a
        # trailing newline added by an editor) are skipped instead of
        # crashing the integer conversion; an empty file means that no
        # component is excluded.
        with open(fname_comps_to_exclude, "r") as fin:
            comps_to_exclude = [int(line) for line in fin if line.strip()]

        # Process the unscreened ICA, apply components to exclude -> screened ICA
        fname_ica_unscreened = name_templates["ica_unscreened"].format(subj)
        ica = mne.preprocessing.read_ica(fname_ica_unscreened)

        ica.exclude = comps_to_exclude

        fname_ica_screened = name_templates["ica_screened"].format(subj)
        ica.save(fname_ica_screened)

        # Apply screened ICA to unprocessed raw data -> clean raw
        # NOTE: we call ica.apply on the full, unprocessed raw data with
        # all channels (64 EEG, 2 EOG, 1 ECG). One could think that this
        # does not work, because to fit the ICA, we only passed the *clean*
        # EEG channels (so 64 or less EEG). However, it *does* work, because
        # MNE-Python's ica object stores the channels used for fitting, and
        # when we call apply, the ica mixing is only applied to those channels
        # that were also used for fitting. All other channels will remain as
        # they are: which is good for EOG and ECG ... and the bad EEG channels
        # will later be interpolated using the clean EEG channels.
        unproc_raw = mne.io.read_raw_fif(
            name_templates["rawconcat"].format(subj), preload=True
        )
        ica.apply(unproc_raw)

        # Finish with filtering + interpolation + re-reference
        # Highpass filtering
        unproc_raw = unproc_raw.filter(l_freq=LOW_CUTOFF, h_freq=None)

        # Lowpass filtering
        unproc_raw = unproc_raw.filter(l_freq=None, h_freq=HIGH_CUTOFF)

        # Interpolation
        unproc_raw = unproc_raw.interpolate_bads()

        # Re-referencing
        unproc_raw = unproc_raw.set_eeg_reference(
            ref_channels="average", projection=False, ch_type="eeg"
        )

        # Save as cleaned data, honoring the `overwrite` setting: with
        # overwrite=False an already existing file still raises an error,
        # with overwrite=True a rerun replaces the previous result
        unproc_raw.save(name_templates["rawclean"].format(subj), overwrite=overwrite)

    # This will terminate the script, preventing it from continuing with the
    # interactive part.
    print("\n\nDone! ... quitting early to prevent interactive part.")
    sys.exit()

## Interactive way

In [None]:
## Subject to screen: step through the IDs 1 to 40, one at a time
subj = 1

# Give a hint if a components-to-exclude file already exists for this subject
fname = name_templates["excluded_comps"].format(subj)
already_screened = op.exists(fname)
if already_screened:
    print(
        "This data has already been screened. You may want to increase "
        "the `subj` variable."
    )

In [None]:
# Path of the screened ICA file for the current subject; the same variable is
# used further down for saving and re-reading the screened ICA
fname_ica_screened = name_templates["ica_screened"].format(subj)

# Only a hint is printed here; nothing prevents continuing with this subject
if op.exists(fname_ica_screened):
    print("Screened ICA file already exists. Increment `subj` variable?")

In [None]:
# Load what is needed for screening:
# the unscreened ICA that was fit and saved earlier ...
fname_ica_unscreened = name_templates["ica_unscreened"].format(subj)
ica = mne.preprocessing.read_ica(fname_ica_unscreened)

# ... and the raw data, preprocessed the same way as for fitting the ICA
raw = preproc_data_for_ica(subj, name_templates)

In [None]:
# Which ICA components were automatically marked to be excluded?
# (the value stored in the unscreened ICA file, shown as cell output)
ica.exclude

In [None]:
# Screen the data, marking components to be included or rejected
# I.e., double checking the automatic marking
# NOTE(review): presumably selections made in the plot window are written
# back to `ica.exclude`, which the next cell inspects - confirm for the MNE
# version in use
ica.plot_sources(raw)

In [None]:
# Which ICA components are now to be excluded?
# This is the list that the following cells write to disk
ica.exclude

In [None]:
# Write the indices of the components to exclude to a text file,
# one index per line
fname = name_templates["excluded_comps"].format(subj)

# Refuse to replace an existing file unless overwriting was requested
if not overwrite and op.exists(fname):
    raise IOError("fname '{}' exists! Please double check.".format(fname))

with open(fname, "w") as fout:
    fout.write("\n".join(str(comp_idx) for comp_idx in ica.exclude))

In [None]:
# Save data as screened
# The ICA object, including the current `ica.exclude` selection, goes to the
# "screened" path computed earlier for this subject
ica.save(fname_ica_screened)

In [None]:
# Apply ICA to unprocessed raw data
# Load the concatenated raw data fresh: this is NOT the data that was
# preprocessed for ICA
unproc_raw = mne.io.read_raw_fif(
    name_templates["rawconcat"].format(subj), preload=True
)

# Re-read the screened ICA from disk and apply it to the raw data
ica = mne.preprocessing.read_ica(fname_ica_screened)
ica.apply(unproc_raw)

### Inspect cleaned vs. non-cleaned data

In [None]:
# Reference for the comparison: the raw data as stored on disk, without ICA
unclean_raw = mne.io.read_raw_fif(
    name_templates["rawconcat"].format(subj), preload=True
)

# Work on a copy of the ICA-cleaned data, so that `unproc_raw` stays untouched
clean_raw = unproc_raw.copy()

In [None]:
# Remove all annotations from the two objects used for plotting only;
# `unproc_raw` keeps its annotations
for raw_to_plot in (clean_raw, unclean_raw):
    n_annots = len(raw_to_plot.annotations.description)
    raw_to_plot.annotations.delete(range(n_annots))

In [None]:
# Plot the ICA-cleaned data, requesting as many channels per view as the
# data has (presumably 20 s per window, with bad channels drawn in red)
clean_raw.plot(n_channels=len(clean_raw.ch_names), bad_color=(1, 0, 0), duration=20.0)

In [None]:
# Plot the data without ICA cleaning, using the same plot settings as for
# the cleaned data, so that both windows can be compared
unclean_raw.plot(
    n_channels=len(unclean_raw.ch_names), bad_color=(1, 0, 0), duration=20.0
)

### Finish with filtering + interpolation + re-reference

In [None]:
# Highpass filtering
# Only the lower edge is set (LOW_CUTOFF); h_freq=None leaves the upper edge open
unproc_raw = unproc_raw.filter(l_freq=LOW_CUTOFF, h_freq=None)

In [None]:
# Lowpass filtering
# Only the upper edge is set (HIGH_CUTOFF); l_freq=None leaves the lower edge open
unproc_raw = unproc_raw.filter(l_freq=None, h_freq=HIGH_CUTOFF)

In [None]:
# Interpolation
# Done after applying the ICA: the bad EEG channels were not part of the ICA
# fit and are reconstructed here (presumably from the clean EEG channels)
unproc_raw = unproc_raw.interpolate_bads()

In [None]:
# Re-referencing
# Average reference over the channels of type "eeg"
# NOTE(review): projection=False presumably applies the reference to the data
# directly instead of adding a projector - confirm in the MNE documentation
unproc_raw = unproc_raw.set_eeg_reference(ref_channels="average", projection=False, ch_type="eeg")

In [None]:
# Save as cleaned data, honoring the `overwrite` setting in the same way as
# the components-to-exclude file above: with overwrite=False an already
# existing file still raises an error, with overwrite=True it is replaced
unproc_raw.save(name_templates["rawclean"].format(subj), overwrite=overwrite)