Merge pull request #188 from scottclowe/rf_neuropil

RF: Refactor neuropil.separate
rochefort-lab · Jun 26, 2021 · 3a70ce0 · 3a70ce0
2 parents 7a4dcf5 + 78385ee
commit 3a70ce0
Show file tree

Hide file tree

Showing 2 changed files with 200 additions and 165 deletions.
diff --git a/fissa/neuropil.py b/fissa/neuropil.py
@@ -10,22 +10,31 @@
 
 import numpy as np
 import numpy.random as rand
-from sklearn.decomposition import FastICA, NMF, PCA
+import sklearn.decomposition
 
 
 def separate(
-        S, sep_method='nmf', n=None, maxiter=10000, tol=1e-4,
-        random_state=892, maxtries=10, W0=None, H0=None, alpha=0.1):
-    """For the signals in S, finds the independent signals underlying it,
-    using ica or nmf.
+    S,
+    sep_method="nmf",
+    n=None,
+    maxiter=10000,
+    tol=1e-4,
+    random_state=892,
+    maxtries=10,
+    W0=None,
+    H0=None,
+    alpha=0.1,
+):
+    """
+    Find independent signals, sorted by matching score against the first input signal.
 
     Parameters
     ----------
-    S : :term:`array_like`
+    S : :term:`array_like` shaped (signals, observations)
         2-d array containing mixed input signals.
         Each column of `S` should be a different signal, and each row an
-        observation of the signals. For ``S[i,j]``, `j` = each signal,
-        ``i`` = signal content.
+        observation of the signals. For ``S[i, j]``, ``j`` is a signal, and
+        ``i`` is an observation.
         The first column, ``j = 0``, is considered the primary signal and the
         one for which we will try to extract a decontaminated equivalent.
 
@@ -36,13 +45,14 @@ def separate(
         - ``"nmf"``: Non-negative Matrix Factorization
 
     n : int, optional
-        How many components to estimate. If ``None`` (default), use PCA to
-        estimate how many components would explain at least 99% of the
-        variance and adopt this value for ``n``.
+        How many components to estimate. If ``None`` (default), for the NMF
+        method, ``n`` is the number of input signals; for the ICA method,
+        we use PCA to estimate how many components would explain at least 99%
+        of the variance and adopt this value for ``n``.
     maxiter : int, optional
-        Number of maximally allowed iterations. Default is ``500``.
+        Number of maximally allowed iterations. Default is ``10000``.
     tol : float, optional
-        Error tolerance for termination. Default is ``1e-5``.
+        Error tolerance for termination. Default is ``1e-4``.
     random_state : int or None, optional
         Initial state for the random number generator. Set to ``None`` to use
         the numpy.random default. Default seed is ``892``.
@@ -62,28 +72,34 @@ def separate(
 
     Returns
     -------
-    S_sep : numpy.ndarray
+    S_sep : :class:`numpy.ndarray` shaped (signals, observations)
         The raw separated traces.
-    S_matched : numpy.ndarray
+    S_matched : :class:`numpy.ndarray` shaped (signals, observations)
         The separated traces matched to the primary signal, in order
-        of matching quality (see Methods below).
-    A_sep : numpy.ndarray
+        of matching quality (see Notes below).
+    A_sep : :class:`numpy.ndarray` shaped (signals, signals)
         Mixing matrix.
     convergence : dict
-        Metadata for the convergence result, with keys:
-
-        - ``"random_state"``: seed for ICA initiation
-        - ``"iterations"``: number of iterations needed for convergence
-        - ``"max_iterations"``: maximum number of iterations allowed
-        - ``"converged"``: whether the algorithm converged or not (`bool`)
+        Metadata for the convergence result, with the following keys and
+        values:
+
+        ``convergence["random_state"]``
+            Seed for estimator initiation.
+        ``convergence["iterations"]``
+            Number of iterations needed for convergence.
+        ``convergence["max_iterations"]``
+            Maximum number of iterations allowed.
+        ``convergence["converged"]``
+            Whether the algorithm converged or not (:class:`bool`).
 
     Notes
     -----
-    Concept by Scott Lowe and Sander Keemink.
-    Normalize the columns in estimated mixing matrix A so that
-    ``sum(column) = 1``.
-    This results in a relative score of how strongly each separated signal
-    is represented in each ROI signal.
+    To identify which independent signal matches the primary signal best,
+    we first normalize the columns in the output mixing matrix `A` such that
+    ``sum(A[:, separated]) = 1``. This results in a relative score of how
+    strongly each raw signal contributes to each separated signal. From this,
+    we find the separated signal to which the ROI trace makes the largest
+    (relative) contribution.
 
     See Also
     --------
@@ -102,112 +118,112 @@ def separate(
 
     # estimate number of signals to find, if not given
     if n is None:
-        if sep_method == 'ica':
+        if sep_method.lower() == "ica":
             # Perform PCA
-            pca = PCA(whiten=False)
+            pca = sklearn.decomposition.PCA(whiten=False)
             pca.fit(S.T)
 
             # find number of components with at least x percent explained var
             n = sum(pca.explained_variance_ratio_ > 0.01)
         else:
             n = S.shape[0]
 
-    if sep_method == 'ica':
-        # Use sklearn's implementation of ICA.
+    for i_try in range(maxtries):
+
+        if sep_method.lower() in {"ica", "fastica"}:
+            # Use sklearn's implementation of ICA.
 
-        for ith_try in range(maxtries):
             # Make an instance of the FastICA class. We can do whitening of
             # the data now.
-            ica = FastICA(n_components=n, whiten=True, max_iter=maxiter,
-                          tol=tol, random_state=random_state)
+            estimator = sklearn.decomposition.FastICA(
+                n_components=n,
+                whiten=True,
+                max_iter=maxiter,
+                tol=tol,
+                random_state=random_state,
+            )
 
             # Perform ICA and find separated signals
-            S_sep = ica.fit_transform(S.T)
-
-            # check if max number of iterations was reached
-            if ica.n_iter_ < maxiter:
-                print((
-                    'ICA converged after {} iterations.'
-                ).format(ica.n_iter_))
-                break
-            print((
-                'Attempt {} failed to converge at {} iterations.'
-            ).format(ith_try + 1, ica.n_iter_))
-            if ith_try + 1 < maxtries:
-                print('Trying a new random state.')
-                # Change to a new random_state
-                if random_state is not None:
-                    random_state = (random_state + 1) % 2**32
-
-        if ica.n_iter_ == maxiter:
-            print((
-                'Warning: maximum number of allowed tries reached at {} '
-                'iterations for {} tries of different random seed states.'
-            ).format(ica.n_iter_, ith_try + 1))
-
-        A_sep = ica.mixing_
-
-    elif sep_method == 'nmf':
-        for ith_try in range(maxtries):
-            # nSignals = nRegions +1
-            # ICA = FastICA(n_components=nSignals)
-            # ica = ICA.fit_transform(mixed.T)  # Reconstruct signals
-            # A_ica = ICA.mixing_  # Get estimated mixing matrix
-            #
-            #
+            S_sep = estimator.fit_transform(S.T)
+
+        elif sep_method.lower() in {"nmf", "nnmf"}:
 
             # Make an instance of the sklearn NMF class
-            if W0 is None and H0 is None:
-                nmf = NMF(
-                    init='nndsvdar', n_components=n,
-                    alpha=alpha, l1_ratio=0.5,
-                    tol=tol, max_iter=maxiter, random_state=random_state)
-
-                # Perform NMF and find separated signals
-                S_sep = nmf.fit_transform(S.T)
-
-            else:
-                nmf = NMF(
-                    init='custom', n_components=n,
-                    alpha=alpha, l1_ratio=0.5,
-                    tol=tol, max_iter=maxiter, random_state=random_state)
-
-                # Perform NMF and find separated signals
-                S_sep = nmf.fit_transform(S.T, W=W0, H=H0)
-
-            # check if max number of iterations was reached
-            if nmf.n_iter_ < maxiter - 1:
-                print((
-                    'NMF converged after {} iterations.'
-                ).format(nmf.n_iter_ + 1))
-                break
-            print((
-                'Attempt {} failed to converge at {} iterations.'
-            ).format(ith_try, nmf.n_iter_ + 1))
-            if ith_try + 1 < maxtries:
-                print('Trying a new random state.')
-                # Change to a new random_state
-                if random_state is not None:
-                    random_state = (random_state + 1) % 2**32
-
-        if nmf.n_iter_ == maxiter - 1:
-            print((
-                'Warning: maximum number of allowed tries reached at {} '
-                'iterations for {} tries of different random seed states.'
-            ).format(nmf.n_iter_ + 1, ith_try + 1))
-
-        A_sep = nmf.components_.T
+            estimator = sklearn.decomposition.NMF(
+                init="nndsvdar" if W0 is None and H0 is None else "custom",
+                n_components=n,
+                alpha=alpha,
+                l1_ratio=0.5,
+                tol=tol,
+                max_iter=maxiter,
+                random_state=random_state,
+            )
+
+            # Perform NMF and find separated signals
+            S_sep = estimator.fit_transform(S.T, W=W0, H=H0)
+
+        elif hasattr(sklearn.decomposition, sep_method):
+
+            print(
+                "Using ad hoc signal decomposition method"
+                " sklearn.decomposition.{}. Only NMF and ICA are officially"
+                " supported.".format(sep_method)
+            )
+
+            # Load up arbitrary decomposition algorithm from sklearn
+            estimator = getattr(sklearn.decomposition, sep_method)(
+                n_components=n,
+                tol=tol,
+                max_iter=maxiter,
+                random_state=random_state,
+            )
+            S_sep = estimator.fit_transform(S.T)
 
+        else:
+            raise ValueError('Unknown separation method "{}".'.format(sep_method))
+
+        # check if max number of iterations was reached
+        if estimator.n_iter_ < maxiter:
+            print(
+                "{} converged after {} iterations.".format(
+                    repr(estimator).split("(")[0], estimator.n_iter_
+                )
+            )
+            break
+
+        print(
+            "Attempt {} failed to converge at {} iterations.".format(
+                i_try + 1, estimator.n_iter_
+            )
+        )
+        if i_try + 1 < maxtries:
+            print("Trying a new random state.")
+            # Change to a new random_state
+            if random_state is not None:
+                random_state = (random_state + 1) % 2 ** 32
+
+    if estimator.n_iter_ == maxiter:
+        print(
+            "Warning: maximum number of allowed tries reached at {} iterations"
+            " for {} tries of different random seed states.".format(
+                estimator.n_iter_, i_try + 1
+            )
+        )
+
+    if hasattr(estimator, "mixing_"):
+        A_sep = estimator.mixing_
     else:
-        raise ValueError('Unknown separation method "{}".'.format(sep_method))
-
-    # make empty matched structure
-    S_matched = np.zeros(np.shape(S_sep))
+        A_sep = estimator.components_.T
 
     # Normalize the columns in A so that sum(column)=1 (can be done in one line
     # too).
     # This results in a relative score of how strongly each separated signal
     # is represented in each ROI signal.
+    #
+    # Our mixing matrix is shaped (input/raw, output/separated). For each
+    # separated (output) signal, we find how much weighting each input (raw)
+    # signal contributes to that separated signal, relative to the other input
+    # signals.
     A = abs(np.copy(A_sep))
     for j in range(n):
         if np.sum(A[:, j]) != 0:
@@ -216,25 +232,23 @@ def separate(
     # get the scores for the somatic signal
     scores = A[0, :]
 
-    # get the order of scores
+    # Rank the separated signals in descending ordering of their score.
+    # The separated signal to which the somatic signal makes up the largest
+    # contribution is sorted first.
     order = np.argsort(scores)[::-1]
 
-    # order the signals according to their scores
+    # Order the signals according to their scores, and scale the magnitude
+    # back to the original magnitude.
+    S_matched = np.zeros_like(S_sep)
     for j in range(n):
-        s_ = A_sep[0, order[j]] * S_sep[:, order[j]]
-        S_matched[:, j] = s_
+        S_matched[:, j] = A_sep[0, order[j]] * S_sep[:, order[j]]
 
     # save the algorithm convergence info
     convergence = {}
-    convergence['max_iterations'] = maxiter
-    if sep_method == 'ica':
-        convergence['random_state'] = random_state
-        convergence['iterations'] = ica.n_iter_
-        convergence['converged'] = not ica.n_iter_ == maxiter
-    elif sep_method == 'nmf':
-        convergence['random_state'] = random_state
-        convergence['iterations'] = nmf.n_iter_
-        convergence['converged'] = not nmf.n_iter_ == maxiter
+    convergence["max_iterations"] = maxiter
+    convergence["random_state"] = random_state
+    convergence["iterations"] = estimator.n_iter_
+    convergence["converged"] = estimator.n_iter_ != maxiter
 
     # scale back to raw magnitudes
     S_matched *= median