In [None]:
%load_ext autoreload
%autoreload 2
# Imports necessary libraries
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

import numpy as np
from natsort import natsorted
import matplotlib.pyplot as plt

import suite2p

In [None]:
# Notebook-wide figure styling. Each rcParams entry below is applied to every figure created after this cell runs.
import matplotlib as mpl

# Hides all four axis spines.
mpl.rcParams["axes.spines.left"] = False
mpl.rcParams["axes.spines.bottom"] = False
mpl.rcParams["axes.spines.top"] = False
mpl.rcParams["axes.spines.right"] = False

# Draws legends without a surrounding frame.
mpl.rcParams["legend.frameon"] = False

# Minimizes the horizontal and vertical spacing between subplots.
mpl.rcParams["figure.subplot.wspace"] = 0.01
mpl.rcParams["figure.subplot.hspace"] = 0.01

# Sets the default figure size (width, height), in inches.
mpl.rcParams["figure.figsize"] = (18, 13)

# Disables the major tick marks on the left y-axis.
mpl.rcParams["ytick.major.left"] = False

# Retrieves the 'jet' colormap and configures it to draw masked / invalid ('bad') values in black.
jet = mpl.colormaps["jet"]
jet.set_bad(color="k")

# Running the multi-day suite2p pipeline on example data

This notebook will guide you through the various stages and outputs of the suite2p multi-day pipeline by running it on
your own dataset. This is an advanced tutorial that assumes familiarity with the
single-day pipeline and requires at least two recording sessions processed with the
single-day pipeline to work as expected. This demonstration is written using the data featured in the
[OSM manuscript](https://www.nature.com/articles/s41586-024-08548-w).

**Note!** Unlike the single-day suite2p pipeline, the multi-day pipeline is not found in the 'original' suite2p
implementation. Instead, this pipeline has been developed by the authors of the OSM manuscript and
the reference implementation is available [here](https://github.com/sprustonlab/multiday-suite2p-public).

## Setting the runtime parameters

Similar to single-day runtime parameters, all parameters for the multi-day pipeline are stored in a
MultiDayS2PConfiguration dataclass instance. To generate a class instance with default configuration parameters, use the
 `generate_default_multiday_ops` function. To see the descriptions of available multi-day configuration parameters, see
 our repository and API documentation [here](https://github.com/Sun-Lab-NBB/suite2p).

In [None]:
# Generates the multi-day configuration populated with default parameters. 'as_dict=False' is used here so that the
# individual parameters can be overridden via attribute access (ops.io..., ops.main...) in the following cells.
ops = suite2p.generate_default_multiday_ops(as_dict=False)

# Prints the default configuration for inspection.
print(ops)

## Overriding specific configuration parameters

The interface for the multi-day parameters functions the same way as the interface for the single-day parameters, but
uses different section names. Most multi-day parameters can be left at their default values, and shared between
runtimes. The most important parameter to change for each runtime is the list of `session_directories` paths to process.

**Note!** Each path in the input session folder list has to satisfy two conditions. First, it must point to a directory
containing exactly one `combined` plane folder. The 'combined' folder can be stored in the directory pointed by the
path or any of its subdirectories. Second, all input paths must have a unique path component, typically the session
ID, that is not present in any other input path. In other words, all sessions must be stored in uniquely named folders.

***Important!*** When running this demonstration on your dataset, adjust the parameters below to match your dataset.

In [None]:
# NOTE: This cell rebinds 'ops' from the configuration class instance to a plain dictionary (see the last line).
# To re-run this cell, first re-run the cell that calls generate_default_multiday_ops().

# Provides the paths to the two sessions processed as part of the single-day demonstration notebook's runtime
ops.io.session_directories = [
    "/home/cyberaxolotl/Desktop/sl_suite2p_demos/Tyche/A7/2022-02-02-17-16-40-000000",
    "/home/cyberaxolotl/Desktop/sl_suite2p_demos/Tyche/A7/2022-02-03-17-23-54-000000",
]

# Provides the path to the root directory under which to generate the output folder. This is the same as the
# save_path0 parameter of the single-day pipeline.
ops.io.multiday_save_path = "/home/cyberaxolotl/Desktop/sl_suite2p_demos/Tyche/A7/"

# Defines the NAME of the output folder created under 'multiday_save_path'. This is the same as the save_folder
# parameter of the single-day pipeline, so it must be a folder name rather than a full path. The resulting output
# directory is <multiday_save_path>/<multiday_save_folder>.
ops.io.multiday_save_folder = "s2p_multiday"

# Since the processed data is a mesoscope recording, provides the x-coordinates of the ROI stripe borders.
# This is an important parameter for mesoscope recordings that ensures ROIs close to stripe borders are not processed
# multiple times (once for each neighboring stripe).
ops.cell_selection.mesoscope_stripe_borders = [462, 924]

# This is the only single-day processing parameter that is also explicitly present in the multi-day configuration class.
# During multi-day registration, this determines how many sessions are processed in parallel. During multi-day trace
# extraction (roi processing), this determines how many CPU cores are used to process each session, regardless of
# whether the sessions themselves are processed sequentially or in parallel.
ops.main.parallel_workers = 10

# Converts the configuration class to the dictionary format expected by downstream functions.
ops = ops.to_ops()

## Overriding single-day pipeline parameters

**Critical!** The multi-day pipeline acts as an extension of the single-day pipeline. It requires the data to be
pre-processed with the single-day pipeline and **reuses some single-day configuration parameters stored in the
ops.npy file generated as part of single-day processing**. Each multi-day runtime merges the parameters from the
MultiDayS2PConfiguration instance with the parameters loaded from the single-day ops.npy file for each of the
processed sessions.

The MultiDayS2PConfiguration provides an explicit way for overriding most 'shared' parameters, such as those that
control signal extraction and spike deconvolution processing steps. However, some 'implicit' parameters, such as the
dimensions of each processed plane, are loaded from the single-day ops.npy file and are NOT modified as part of
resolving the multi-day ops.npy file for each session.

However, it is possible to override these parameters by specifying them either as part of the `ops` or, as demonstrated
here, the `db` dictionary passed to the `resolve_multiday_ops` function (see the following cell).

In [None]:
# Like the single-day pipeline, the multi-day pipeline builds its 'ops.npy' file from two dictionaries: 'ops' and
# 'db'. The 'db' dictionary is the recommended place for overriding single-day parameters during multi-day runtimes.
#
# progress_bars: running several multi-day pipelines in parallel from the same terminal is uncommon, but if you do,
# switch this to 'False' so that the terminal is not flooded with progress bar updates.
db = dict(progress_bars=True)

### Generating the ops.npy file
To integrate the parameters from the `db` dictionary into the `ops` dictionary, use the `resolve_multiday_ops` function.
This function works similarly to the single-day one, but carries out some additional processing steps.

First, it merges the multi-day configuration parameters with the single-day configuration parameters stored
inside the single-day ops.npy configuration file of one of the sessions to be processed. Second, it generates the
output data hierarchy using the 'multiday_save_path' and 'multiday_save_folder' values. In that hierarchy, for each
processed session, it generates a single_day and a multi_day subfolder. Finally, it copies the output of the 'combined'
folder for each processed session into the single_day output folder, so that single_day and multi_day outputs are always
stored together. Note, however, that it does NOT copy the binary files.

**Note!** The multi-day ops.npy file, whose path is returned by the `resolve_multiday_ops` function, is stored under
the root output folder ('multiday_save_folder'). Like the single-day pipeline, the `resolve_multiday_ops` function also
generates a .yaml representation of the resolved multi-day configuration parameters.

In [None]:
# Merges the 'ops' and 'db' dictionaries into the final runtime configuration, saves it to disk as an .npy file,
# and reports where the file was written.
ops_path = suite2p.multi_day.resolve_multiday_ops(db=db, ops=ops)
print(ops_path)

# Reads the saved configuration back from disk. The file stores a pickled dictionary wrapped in a 0-dimensional
# array, so allow_pickle=True and .item() are needed to recover the dictionary.
final_ops = np.load(ops_path, allow_pickle=True).item()

# Lists the resolved configuration keys. Note that both the multi-day and the single-day keys are present.
print(final_ops.keys())

## Running the multi-day suite2p pipeline
The default way to run the multi-day pipeline is by calling the `run_s2p_multiday` function from the `multi_day` module.
This function executes all steps of the multi-day processing pipeline in a sequence, using the processing parameters
from the multi-day ops.npy file created via the `resolve_multiday_ops` function.

**Note!** The function will use up to 'parallel_workers' CPU cores when processing each session and may require a
significant amount of memory (RAM), depending on the input dataset size. However, compared to the single-day pipeline,
the multi-day pipeline is overall less memory and CPU intensive.

**Advanced users with access to powerful compute machines or distributed clusters should consult the sections at the
end of this notebook to learn about running pipeline steps in parallel.**

In [None]:
# Runs the multi-day pipeline using the configuration file resolved above ('ops_path').
suite2p.multi_day.run_s2p_multiday(ops_path=ops_path)

## Multi-day pipeline outputs
This section goes over the outputs generated by the multi-day pipeline and briefly mentions how each can be used when
working with your own datasets.

**Note!** We highly recommend consulting the [OSM manuscript](https://www.nature.com/articles/s41586-024-08548-w)
to understand the principles behind the multi-day processing pipeline and outputs discussed below.

### Registration (images)
The multi-day registration works similarly to the single-day registration, but instead of frames, it works with sessions.
Specifically, it tries to counteract the across-day session imaging field drift by shifting (deforming) the session
imaging fields to align them in the same deformed space, using 'DiffeomorphicDemonsRegistration' adapted from
[pirt](https://github.com/almarklein/pirt) library.

After aligning all sessions in this 'deformed' (registered) space, the algorithm clusters the cells from different
sessions based on their distance in the deformed space to track them across sessions. This relies on the assumption
that a cell would appear in roughly similar location in the deformed visual space across all sessions. The discovered
cell clusters are then used to generate a set of 'template' masks, one for each of the cells tracked across sessions.

Finally, the algorithm transforms the 'template' masks back to the original (non-deformed) visual space of each session,
so that they can be used to extract the fluorescence of the tracked cells for each session during the `roi processing`
pipeline step.

Unlike with the single-day pipeline, we currently do not offer a way of quantifying across-day registration metrics.
Instead, you can use the interactive GUI below to visually inspect how the registration pipeline transforms
the (registered) reference images and single-day detected cell masks from each session during the 'registration' step.

**Note!** Any image with 'transformed_' in its name is an image in the deformed (registered) visual space, while any
image without is in the original visual space of that session. There are 4 sets of cell masks that can be rendered:
'unregistered' (single-day pipeline output), 'registered' (single-day masks converted to deformed visual space),
'shared_multiday' masks (all template masks in the deformed space) and 'session_multiday' masks
(all template masks in the original session visual space).

In [None]:
# Switches matplotlib to the interactive widget backend, so that the figure below responds to user input.
%matplotlib widget
# Opens the interactive viewer for the images and cell masks produced using the configuration stored at 'ops_path'.
# The trailing semicolon suppresses the echo of the call's return value in the cell output.
suite2p.show_images_with_masks(ops_path=ops_path);

### Signal Extraction
The second (and final) step of the multi-day pipeline extracts the fluorescence traces for cells tracked across days
from each session. This generates the same set of files as the single-day pipeline, stored inside the 'multi_day' output
folder of each session: the raw activity data for cells (F.npy), the surrounding neuropil (Fneu.npy), and the
deconvolved cell activity (spks.npy).

The graphs below visualize the activity of a single chosen cell (ROI) across all processed sessions. The multi-day
pipeline allows tracking the same cells even during the sessions when that particular cell is silent (not firing), so
it is possible for the visualized ROI to not have any meaningful spike activity for some sessions.

In [None]:
# Clears the widget figure from the previous code cell
plt.close("all")

%matplotlib inline

# Selects the index of the cell (ROI) to be visualized. Adjust this to visualize different multi-day tracked cells.
roi_index = 1500

# Precreates temporary storage lists for fluorescence data
f_cells_sessions = []
f_neuropils_sessions = []
spks_sessions = []

# Reconstructs the path to the output folder from 'ops' parameters
output_folder = Path(ops["multiday_save_path"]).joinpath(ops["multiday_save_folder"])

# The output folder contains .npy and .yaml files and directories named after each processed session ID.
# This re-generates the list of session IDs from the directories stored in the output folder.
session_ids = [folder.stem for folder in output_folder.glob("*") if folder.is_dir()]

# Sorts session IDs for consistency
session_ids = natsorted(session_ids)

# Resolves paths to the multi-day output for each session
session_directories = [output_folder.joinpath(session_id) for session_id in session_ids]

# Loops over processed sessions and loads the data to be visualized below
for session in session_directories:
    f_cells_sessions.append(np.load(session.joinpath("F.npy")))
    f_neuropils_sessions.append(np.load(session.joinpath("Fneu.npy")))
    spks_sessions.append(np.load(session.joinpath("spks.npy")))

plt.figure(figsize=(20.0, 20.0))
plt.suptitle(f"Fluorescence and Deconvolved Traces for ROI {roi_index} Across Sessions", y=0.92)

# Assigns distinct color to visualized traces
colors = ["#1f77b4", "#2ca02c", "#d62728"]  # Blue, Green, Red

# Loops over sessions, extracts the fluorescence data for the specified cell from each session, and plots it
# on the canvas.
for i, (f_cells, f_neuropils, spks, session_name) in enumerate(
    zip(f_cells_sessions, f_neuropils_sessions, spks_sessions, session_ids)
):
    plt.subplot(len(f_cells_sessions), 1, i + 1)

    # Extracts data for the specific ROI from this session
    f = f_cells[roi_index]
    f_neu = f_neuropils[roi_index]
    sp = spks[roi_index]

    # Adjust range to match fluorescence traces
    fmax = np.maximum(f.max(), f_neu.max())
    fmin = np.minimum(f.min(), f_neu.min())
    frange = fmax - fmin

    # Normalizes spikes
    if sp.max() > 0:
        sp = sp / sp.max() * frange + fmin
    else:
        sp = np.zeros_like(sp) + fmin

    plt.plot(f, color=colors[0], label="Cell Fluorescence")
    plt.plot(f_neu, color=colors[1], label="Neuropil Fluorescence")
    plt.plot(sp, color=colors[2], label="Deconvolved")

    plt.xticks(np.arange(0, f.shape[0], f.shape[0] // 10))

    # Add title for session name above each plot
    plt.title(f"Session {session_name}")

    # Add y-axis label for fluorescence/pixel intensity
    plt.ylabel("fluorescence")

    plt.xlabel("frame")
    plt.grid(True, linestyle=":", alpha=0.6)

    if i == 0:
        plt.legend(bbox_to_anchor=(1.01, 1), loc="upper left")

plt.tight_layout()
plt.subplots_adjust(top=0.9)

## Parallelizing multi-day pipeline steps
This section demonstrates a more advanced workflow which consists of manually calling the two functions that jointly
form the multi-day pipeline. This is the preferred way for running the pipeline on large datasets and powerful
machines, offering more advanced users a flexible way to fine-tune the processing to match their specific needs.

**Note!** In contrast to the single-day pipeline, only the second step of the multi-day pipeline can be explicitly
parallelized across sessions.

### Preparation

In this demonstration, we will reuse the same parameters as used during the 'sequential' pipeline execution
demonstrated above. Since the multi-day pipeline always overwrites any existing pipeline outputs, we can
reuse the `ops_path` generated above to re-run all processing steps.

**Tip!** If you want to generate multiple partially overlapping multi-day datasets (using different combinations of
sessions from the same pool), change the 'multiday_save_folder' configuration parameter in the 'ops' configuration file
for each unique set. All sets can be processed in parallel, as the processing does not modify the single-day pipeline
outputs.

### Step 1: Tracking cells across sessions (days)
First, all processed sessions need to be registered to the same visual space to track the cells active across sessions
(days). See the discussion of multi-day pipeline outputs (above) for details on specific transformations carried out at
this processing step.

**Tip.** This step necessarily works with all sessions processed as part of the multi-day runtime. It automatically
parallelizes session processing according to the 'parallel_workers' parameter loaded from the 'ops' configuration file.
It is generally possible and safe to further parallelize this step by running multiple multi-day runtimes, as it
requires comparably minimal memory (RAM) resources.

In [None]:
# Step 1 of the multi-day pipeline: registers all sessions listed in the configuration file at 'ops_path' and tracks
# cells across them.
suite2p.multi_day.discover_multiday_cells(ops_path=ops_path)

### Step 2: Extracting across-day tracked cell fluorescence from each session
While the default approach is to process all sessions sequentially, powerful machines can run ROI signal extraction
in parallel to reduce the overall processing time. Note that each parallel session will use the number of CPU cores
specified in the configuration file ('parallel_workers' field) to parallelize certain computations. Therefore, the
total number of CPU cores used at the same time will be equal to
`number_of_parallel_sessions * number_of_parallel_workers`.

**Tip.** This is the most computationally- and memory-intensive step of the multi-day pipeline. Since this step uses a
combination of multi-thread and single-thread operations, increasing the number of parallel session workers is
detrimental to the processing speed beyond a certain threshold. We recommend using no more than **20** threads per
session and advise testing lower thread numbers to determine the optimal number of parallel workers. However, compared
to the single-day pipeline, this step uses less RAM, so it is safe to run more sessions in parallel than single-day
planes.

In [None]:
# Extracts the list of session IDs from the 'ops' configuration file to run ROI data extraction for all sessions in
# parallel
ops = np.load(ops_path, allow_pickle=True).item()
session_ids = ops["session_ids"]

# Upper limit on the number of sessions processed at the same time. Each session additionally uses up to
# 'parallel_workers' CPU cores, so the total core usage is roughly (parallel sessions * parallel_workers).
max_parallel_sessions = 10

# Never starts more worker processes than there are sessions to process. The lower bound of 1 is required because
# ProcessPoolExecutor rejects max_workers values below 1.
worker_count = max(1, min(max_parallel_sessions, len(session_ids)))

# Extracts fluorescence from across-day tracked cells for each processed session in parallel
with ProcessPoolExecutor(max_workers=worker_count) as executor:
    futures = [
        executor.submit(suite2p.multi_day.extract_multiday_fluorescence, ops_path, session_id)
        for session_id in session_ids
    ]

    # Waits for every session to finish. Calling result() re-raises any exception raised inside a worker process,
    # so failures are not silently ignored.
    for future in futures:
        future.result()

## Running suite2p via terminal
All suite2p functions used in the demonstration above can be called via the terminal Command-Line Interface (CLI)
installed into the python environment together with the sl-suite2p package. Specifically, use the `ss2p-config` command
to generate the human-readable pipeline configuration .yaml files (another way of specifying 'ops' parameters). Use `ss2p-run` to
execute the pipeline either as a sequence of all steps or as specific pipeline step(s). See our
[repository](https://github.com/Sun-Lab-NBB/suite2p) for more details and the API documentation for these CLI commands.

**Note!** The CLI commands are especially beneficial for running code on remote compute servers using shell scripts
or similar methods. Complex suite2p runtimes can be entirely coded and executed via shell scripts using the CLI
interface and precreated configuration files. For more details about running sl-suite2p on remote compute servers,
see the [sl-forgery](https://github.com/Sun-Lab-NBB/sl-forgery) repository.

---