In [None]:
# Loads the autoreload extension and sets it to mode 2, so edited modules are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2
# NOTE(review): the 'qt' backend selected here is replaced by 'inline' on the very next line, and selecting it
# presumably requires Qt bindings to be installed in the kernel environment - confirm this line is still needed.
%matplotlib qt
%matplotlib inline
# Imports necessary libraries
import json
from typing import Any
from pathlib import Path
import warnings
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import matplotlib.pyplot as plt

import suite2p

In [None]:
# Notebook-wide figure styling. The overrides below hide every axes spine, the legend frame, and the left y-axis
# ticks, shrink the spacing between subplots, and enlarge the default figure size.
import matplotlib as mpl

figure_style = {
    "axes.spines.left": False,
    "axes.spines.bottom": False,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "legend.frameon": False,
    "figure.subplot.wspace": 0.01,
    "figure.subplot.hspace": 0.01,
    "figure.figsize": (18, 13),
    "ytick.major.left": False,
}
mpl.rcParams.update(figure_style)

# Fetches the 'jet' colormap object and configures it to draw invalid (masked / NaN) values in black.
jet = mpl.colormaps.get_cmap("jet")
jet.set_bad(color="k")

# Running the single-day suite2p pipeline on example data

This notebook will guide you through the various stages and outputs of the suite2p single-day pipeline. It is intended
to run on your own dataset, but, if you do not have a dataset to run this pipeline, you can explore dataset generation
options from the [original suite2p repository](https://github.com/MouseLand/suite2p/tree/main). This demonstration is
written using the data featured in the [OSM manuscript](https://www.nature.com/articles/s41586-024-08548-w).


## Setting the runtime parameters

All single-day runtime parameters are stored in a SingleDayS2PConfiguration dataclass instance. To generate an instance
filled with default parameters, use the `generate_default_ops` function. You can find an explanation of most single-day
parameters in the original suite2p documentation [here](https://suite2p.readthedocs.io/en/latest/settings.html).
Alternatively, you can check our repository and API documentation [here](https://github.com/Sun-Lab-NBB/suite2p).
Note, the default single-day parameters are preconfigured for mesoscope-acquired data used in the Sun lab and will
likely need tuning to work in other contexts.

In [None]:
# Generates the default single-day configuration. With 'as_dict=False', the result is a configuration class instance
# whose fields are edited through attribute access in the following cell.
ops = suite2p.generate_default_ops(as_dict=False)  # If 'as_dict' is true, this returns the 'ops' dictionary.
print(ops)

## Overriding specific configuration parameters

It is common to keep most processing parameters the same between different datasets and projects. However, some
processing parameters need to be adjusted for each processed dataset (recording session). Most of these parameters are
addressable via the configuration class.

***Important!*** When running this demonstration on your dataset, adjust the parameters below to match your dataset.

In [None]:
# NOTE: this cell expects 'ops' to be the configuration class instance created in the previous cell. Because the
# last line rebinds 'ops' to a dictionary, re-running this cell requires re-running the previous cell first.

# Input directory stores the data to be processed. Typically, this would be a folder filled with .tif or .tiff files.
# If you are working with mesoscope data, this folder would also typically contain the ops.json file generated from
# ScanImage TIFs by the helper script.
ops.file_io.data_path = ["/home/cyberaxolotl/Desktop/sl_suite2p_demos/Tyche/A7/2022-02-02-17-16-40-000000/data"]

# The output directory determines where the processed data is saved.
ops.file_io.save_path0 = "/home/cyberaxolotl/Desktop/sl_suite2p_demos/Tyche/A7/2022-02-02-17-16-40-000000/"

# The type of the input dataset. In this demonstration, the processed data is a Mesoscope scan.
ops.file_io.mesoscan = True

# Limits the number of parallel workers to 10 per plane.
ops.main.parallel_workers = 10

# Enables registration metrics computation. The 'Registration (metrics)' section below reads the resulting data.
ops.registration.compute_registration_metrics = True

# This improves terminal output during sequential processing, but breaks during parallel processing.
ops.main.progress_bars = True

# Converts the configuration class to the dictionary format expected by downstream functions. From this point on,
# 'ops' is a dictionary and no longer supports attribute access.
ops = ops.to_ops()

## Setting parameters not addressable through the configuration class

In addition to the configuration-class-addressable parameters, there are other suite2p parameters typically resolved
automatically during runtime. However, it is also possible to provide these parameters manually. To do so, we define an
additional shallow dictionary `db` that stores these parameters and their values.

**Note!** Most of the additional parameters for mesoscope recordings are resolved through helper scripts, such as the
one listed in the original suite2p repository or the one used in our
[sl-experiment](https://github.com/Sun-Lab-NBB/sl-experiment) library. For example, when working with mesoscope
recordings, suite2p automatically searches for the ops.json file(s) inside all data folders and resolves
these additional parameters as necessary. Here, we load this data manually to demonstrate how to write the `db`
dictionary.

In [None]:
# Locates the ops.json file inside the first data directory and parses it into a dictionary
ops_path = Path(ops["data_path"][0]) / "ops.json"
with ops_path.open() as f:
    precursor: dict[str, Any] = json.load(f)

# Builds 'db' from the subset of parameters stored in the parsed JSON file
db = dict(
    fs=precursor["fs"],
    nplanes=precursor["nplanes"],
    nchannels=precursor.get("nchannels", 1),  # Falls back to 1 when ops.json has no 'nchannels' entry
    nrois=precursor["nrois"],
    dx=precursor["dx"],
    dy=precursor["dy"],
    lines=precursor["lines"],
)

## Generating the ops.npy file
To integrate the parameters from the `db` dictionary into the `ops` dictionary, use the `resolve_ops` function.
This function also translates the resultant dictionary into the ops.npy file and returns the path to this file.

**Tip!** For specifying runtime parameters, `ops` and `db` are functionally equivalent. The only difference is
precedence: parameters from `db` override parameters from `ops`, and parameters from `ops` override the default
configuration parameters. If your use case does not require one or both of these dictionaries, set them to `{}`
(empty) when calling `resolve_ops`.

In [None]:
# Merges 'ops' and 'db' into the final runtime configuration and writes it to disk as an ops.npy file. The returned
# path is the input to every pipeline call made later in this notebook.
ops_path = suite2p.single_day.resolve_ops(ops=ops, db=db)
print(ops_path)

# Reads the configuration back from disk to display its final state. The dictionary is unpickled from the .npy file
# and unwrapped with .item(). Only load files from trusted sources with allow_pickle=True.
stored_ops = np.load(ops_path, allow_pickle=True)
final_ops = stored_ops.item()
print(final_ops)

**Note!** In sl-suite2p versions 2.0.0+, the resolve_ops() function now also generates a '.yaml' snapshot of the
resolved configuration parameters. Since .yaml files are human-readable and do not rely on unsafe pickling, saving the
final parameters as a 'yaml' file is primarily intended as an additional safety feature. It prevents data loss if you
lose the ability to load the 'ops.npy' file, for example, due to silent changes to the underlying pickling protocol.
The snapshot is saved to the same folder as the 'ops.npy' file.

## Running the single-day suite2p pipeline
The default way to run the single-day pipeline is by calling the `run_s2p` function from the `single_day` module.
This function executes all steps of the single-day processing pipeline in a sequence, using the processing parameters
from the ops.npy file created via the `resolve_ops` function.

**Note!** The function will use up to 'parallel_workers' CPU cores when processing each plane and may require a
significant amount of memory (RAM), depending on the input dataset size. However, it will only process a single plane
at a time, keeping both RAM and CPU requirements generally manageable for most use cases.

**Advanced users with access to powerful compute machines or distributed clusters should consult the sections at the
end of this notebook to learn about running pipeline steps in parallel.**

In [None]:
# Executes all single-day pipeline steps in sequence, using the parameters stored in the ops.npy file at 'ops_path'.
suite2p.single_day.run_s2p(ops_path=ops_path)

## Single-day pipeline outputs
This section goes over the outputs generated by the single-day pipeline and briefly mentions how each can be used when
working with your own datasets.

**Note!** We highly recommend consulting the original suite2p documentation and / or watching this
[video](https://www.youtube.com/watch?v=HpL5XNtC5wU) to understand the principles behind suite2p outputs discussed
below.

### Ops dictionaries
During processing, the pipeline caches various output data inside the `ops.npy` file for each processed plane. If
planes are combined during post-processing, suite2p also generates a 'combined' `ops.npy` file. Each output ops file
contains all keys that went into the pipeline, plus new keys that contain additional metrics/outputs calculated during
the pipeline runtime:

In [None]:
# Plane folders live next to the resolved ops.npy file. Despite its name, 'combined_path' points to the ops.npy file
# of plane 0, not to the 'combined' folder.
plane0_directory = Path(ops_path).parent / "plane0"
combined_path = plane0_directory / "ops.npy"
output_ops = np.load(combined_path, allow_pickle=True).item()

# Keys present in the plane's output ops but absent from the input 'ops' dictionary.
runtime_keys = set(output_ops.keys()).difference(ops.keys())
print(runtime_keys)

### Registration (images)
During registration, the pipeline tries to register every frame of the processed plane movie to the 'reference image.'
A good registration result would therefore generate a 'mean image' that is visually similar to the 'reference image.'

**Important!** The best way to verify registration results is to use the suite2p GUI to render and evaluate the data
generated during registration metrics computation (see below). For well-registered data, the rendered video should have
no significant shift of cells in the X and Y axes.

In [None]:
# Each panel pairs the 'output_ops' key that stores an image with the title shown above it. Panels are drawn
# left-to-right in a single row.
registration_panels = (
    ("refImg", "Reference Image for Registration"),
    ("max_proj", "Registered Image, Max Projection"),
    ("meanImg", "Mean registered image"),
    ("meanImgE", "High-pass filtered Mean registered image"),
)

for panel_number, (image_key, panel_title) in enumerate(registration_panels, start=1):
    plt.subplot(1, 4, panel_number)
    plt.imshow(output_ops[image_key], cmap="gray")
    plt.title(panel_title)

### Registration (metrics)
Additionally, the pipeline (optionally) computes the registration quality metrics. The metrics quantify the rigid and
non-rigid offsets (shifts) that need to be applied to register the top and bottom of the processed movie's n
Principal Components (here, we use 10) to each other. Note, the PCs are extracted from a subset of frames evenly sampled
across the entire movie. For well-registered movies, the shifts should be close to zero.

In [None]:
# Extracts the offsets computed during registration metrics evaluation. As labelled in the printout below, the three
# columns hold the rigid, average non-rigid (NR), and maximum non-rigid offsets; each row is reduced along axis 0.
offsets = output_ops["regDX"]
mean_offsets = np.mean(offsets, axis=0)
max_offsets = np.max(offsets, axis=0)

# Prints one summary line for the averages and one for the maxima. The lines are assembled from adjacent f-strings
# rather than an indented triple-quoted block, so the output carries no stray leading whitespace or blank lines (the
# previous '.replace()' call searched for a 12-space indent that never occurred in the 4-space-indented literal).
print(
    f"Avg_Rigid: {mean_offsets[0]:.6f} \tAvg_Average NR: {mean_offsets[1]:.6f} \tAvg_Max NR: {mean_offsets[2]:.6f}\n"
    f"Max_Rigid: {max_offsets[0]:.6f} \tMax_Average NR: {max_offsets[1]:.6f} \tMax_Max NR: {max_offsets[2]:.6f}"
)

### ROI Detection
During ROI detection, the pipeline first discovers cell ROI candidates and then passes them through a classifier to
determine whether these ROIs are valid cells. The output of the classifier is stored in the `iscell.npy` file and the
ROI statistics, including cell masks, are stored in the `stat.npy` file. Both files contain one entry per detected
ROI, so their first dimensions always match.

**Note!** While this demonstration uses a simple built-in classifier, suite2p can be configured to use the
[cellpose](https://www.cellpose.org/) classifier to further augment ROI detection and classification.

In [None]:
# Both ROI detection outputs are stored in the plane's save directory.
plane_directory = Path(output_ops["save_path"])
stats_file = plane_directory / "stat.npy"

# Keeps only the first column of iscell.npy and converts it to a boolean mask used below to split ROIs into cells
# and non-cells.
iscell_table = np.load(plane_directory / "iscell.npy", allow_pickle=True)
iscell = iscell_table[:, 0].astype(bool)

stats = np.load(stats_file, allow_pickle=True)
stats.shape, iscell.shape

The purpose of the ROI detection (extraction) step is to identify the cells from which to extract the fluorescence
signals (see below). A well-configured ROI extraction step should discover most cells observable in the registered image
and discard most outlier objects a human rater would not consider a cell. The accuracy of the ROI extraction can be
assessed by comparing the discovered and rejected ROIs to the registered maximum projection image.

In [None]:
# Converts ROI statistics stored as a list of dictionaries to a renderable NumPy array with one 2D image per ROI
# (the array is indexed by ROI along its first axis below).
im = suite2p.ROI.stats_dicts_to_3d_array(stats, Ly=output_ops["Ly"], Lx=output_ops["Lx"], label_id=True)
# Replaces zero-values (pixels outside the ROI) with NaNs, so they are treated as 'bad' values during rendering.
im[im == 0] = np.nan

# Suppresses RuntimeWarnings generated by np.nanmax() calls when they receive all-NaN slices.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="All-NaN slice encountered")
    plt.subplot(1, 4, 1)
    plt.imshow(output_ops["max_proj"], cmap="gray")
    plt.title("Registered Image, Max Projection")

    # The ROI panels use the 'jet' colormap object created in the figure style cell at the top of the notebook, not
    # the "jet" name. Looking the colormap up by name returns a fresh copy from the matplotlib registry, which would
    # ignore the black 'bad' color configured on the notebook's instance and leave NaN pixels unpainted.
    plt.subplot(1, 4, 2)
    plt.imshow(np.nanmax(im, axis=0), cmap=jet)
    plt.title("All ROIs Found")

    plt.subplot(1, 4, 3)
    plt.imshow(np.nanmax(im[~iscell], axis=0), cmap=jet)
    plt.title("All Non-Cell ROIs")

    plt.subplot(1, 4, 4)
    plt.imshow(np.nanmax(im[iscell], axis=0), cmap=jet)
    plt.title("All Cell ROIs")

### Signal Extraction
As a final step, suite2p extracts the fluorescence traces from each cell ROI and generates multiple output files.
Available files contain the raw activity data for cells (F.npy), the surrounding neuropil (Fneu.npy), and the
deconvolved cell activity (spks.npy). All extracted arrays have the same shape: number of ROIs x number of movie frames.

In [None]:
# All signal extraction outputs are stored in the plane's save directory.
traces_directory = Path(output_ops["save_path"])
f_cells = np.load(traces_directory / "F.npy")
f_neuropils = np.load(traces_directory / "Fneu.npy")
spks = np.load(traces_directory / "spks.npy")
f_cells.shape, f_neuropils.shape, spks.shape

The graphs below visualize the activity of some ROIs over the course of the processed plane movie. For this
demonstration, we use a small sample of all ROIs, but the entire dataset is available for further analysis by
loading the same .npy files.

In [None]:
plt.figure(figsize=(20.0, 20.0))
plt.suptitle("Fluorescence and Deconvolved Traces for Different ROIs", y=0.92)

# Evenly samples the required number of ROIs. Adjusting the step rate here allows controlling the number of
# rendered ROIs
rois = np.arange(len(f_cells))[::500]

# Assigns distinct color to visualized traces
colors = ["#1f77b4", "#2ca02c", "#d62728"]  # Blue, Green, Red

# Tick positions are identical for every subplot, so they are computed once. The step is clamped to at least 1 to
# keep np.arange() valid for movies shorter than 10 frames, where the integer division would otherwise yield 0.
frame_count = f_cells.shape[1]
frame_ticks = np.arange(0, frame_count, max(1, frame_count // 10))

for i, roi in enumerate(rois):
    plt.subplot(len(rois), 1, i + 1)
    f = f_cells[roi]
    f_neu = f_neuropils[roi]
    sp = spks[roi]

    # Determines the value range jointly covered by the cell and neuropil fluorescence traces
    fmax = np.maximum(f.max(), f_neu.max())
    fmin = np.minimum(f.min(), f_neu.min())
    frange = fmax - fmin

    # Rescales the deconvolved trace to span the fluorescence range, so all three traces share one y-axis. Traces
    # without any positive value are drawn as a flat line at the bottom of the range.
    if sp.max() > 0:
        sp = sp / sp.max() * frange + fmin
    else:
        sp = np.zeros_like(sp) + fmin

    plt.plot(f, color=colors[0], label="Cell Fluorescence")
    plt.plot(f_neu, color=colors[1], label="Neuropil Fluorescence")
    plt.plot(sp, color=colors[2], label="Deconvolved")

    plt.xticks(frame_ticks)
    plt.ylabel(f"ROI {roi}", rotation=0, labelpad=10)
    plt.xlabel("frame")
    plt.grid(True, linestyle=":", alpha=0.6)  # Add grid for easier comparison

    # The legend is identical for all subplots, so it is only drawn next to the first one
    if i == 0:
        plt.legend(bbox_to_anchor=(1.01, 1), loc="upper left")

plt.tight_layout()
plt.subplots_adjust(top=0.9)

## Parallelizing single-day pipeline steps
This section demonstrates a more advanced workflow which consists of manually calling the three functions that jointly
form the single-day pipeline. This is the preferred way for running the pipeline on large datasets and powerful
machines, offering more advanced users a flexible way to fine-tune the processing to match their specific needs.

**Note!** Each of the functions discussed below can be executed in parallel for multiple datasets (sessions) to increase
the overall processing speed. Additionally, the 'process_plane' function can be called in parallel for each plane
(of every session), resulting in an even faster processing speed.

### Preparation

The steps to prepare for a parallel runtime are the same as for the sequential runtime.

In [None]:
# The parallel demonstration processes a different session, so it starts from a fresh default configuration.
ops = suite2p.generate_default_ops(as_dict=False)

# Points the configuration to the second session's input and output directories
ops.file_io.data_path = ["/home/cyberaxolotl/Desktop/sl_suite2p_demos/Tyche/A7/2022-02-03-17-23-54-000000/data"]
ops.file_io.save_path0 = "/home/cyberaxolotl/Desktop/sl_suite2p_demos/Tyche/A7/2022-02-03-17-23-54-000000/"

# Like the first session, this one is a mesoscope recording
ops.file_io.mesoscan = True

# Critical! Caps the worker count at 10 per plane. Since planes are processed at the same time below, the total
# number of workers is this value multiplied by the number of planes. Tune it to match your system.
ops.main.parallel_workers = 10

# Progress bars cannot be displayed properly during parallel processing, so they are turned off.
ops.main.progress_bars = False

# From here on, 'ops' is the dictionary representation of the configuration
ops = ops.to_ops()

# Builds 'db' from the session's ops.json file, exactly as in the sequential example above
ops_path = Path(ops["data_path"][0]) / "ops.json"
with ops_path.open() as f:
    precursor: dict[str, Any] = json.load(f)
db = dict(
    fs=precursor["fs"],
    nplanes=precursor["nplanes"],
    nchannels=precursor.get("nchannels", 1),
    nrois=precursor["nrois"],
    dx=precursor["dx"],
    dy=precursor["dy"],
    lines=precursor["lines"],
)

# Writes the ops.npy file. Note, this rebinds 'ops_path' from the ops.json path to the ops.npy path.
ops_path = suite2p.single_day.resolve_ops(ops=ops, db=db)

### Step 1: Generating plane binary files and folders
First, the processed dataset needs to be converted into binary (.bin) files. Regardless of the input raw data format,
this step generates a separate output folder for each imaging plane in the dataset. Then, it writes the data.bin
(plane data) and ops.npy (plane processing parameters and intermediate pipeline outputs) files to the folder.

**Tip.** Since this step only uses a single core and memory mapping, it is safe to parallelize it for many sessions
without limiting the number of parallel workers or assigning significant memory allocations. We typically use 1 or 2
cores and 5 or 10 GB RAM per parallel session when running this on a compute cluster.

In [None]:
# Converts the session's raw data to per-plane binary files, creating one 'plane*' folder per imaging plane
suite2p.single_day.resolve_binaries(ops_path=ops_path)

# Gathers every 'plane*' entry stored next to the ops.npy file. These paths are used in the next step.
planes = list(ops_path.parent.glob("plane*"))

### Step 2: Processing planes
While the default approach is to process all planes sequentially, powerful machines can run plane processing in parallel
to reduce the overall processing time. Note that each parallel plane will use the number of CPU cores specified in the
configuration file ('parallel_workers' field) to parallelize certain computations. Therefore, the total number of CPU
cores used at the same time will be equal to `number_of_parallel_planes * number_of_parallel_workers`.

**Tip.** This is the most computationally- and memory-intensive step of the single-day pipeline. Since this step uses a
combination of multi-thread and single-thread operations, increasing the number of parallel plane workers is detrimental
to the processing speed beyond a certain threshold. We recommend using no more than **20** threads per plane and advise
testing lower thread numbers to determine the optimal number of parallel workers.

In [None]:
# Although using 'range' would have worked here too, extracts the indices using plane folder names
# for demonstration and added safety. The entire numeric suffix of each 'plane<index>' name is parsed, so multi-digit
# folders (plane10, plane11, ...) resolve to their own index instead of being truncated to the last digit. Entries
# whose names do not follow the 'plane<index>' pattern are skipped.
plane_indices = sorted(
    int(suffix) for plane in planes if (suffix := plane.name.removeprefix("plane")).isdecimal()
)

# Fails early with an actionable message; otherwise, the executor below would reject max_workers=0.
if not plane_indices:
    raise RuntimeError(
        f"No plane folders found in {ops_path.parent}. Run the 'resolve_binaries' step before processing planes."
    )

# Sets up a ProcessPool executor and processes each plane in parallel. This demonstrates parallelizing single-day
# processing steps and can be applied to all steps across sessions (datasets).
with ProcessPoolExecutor(max_workers=len(plane_indices)) as executor:
    # ops_path is the same for all processing steps. Each plane has a unique index.
    futures = [executor.submit(suite2p.single_day.process_plane, ops_path, index) for index in plane_indices]

    # Waits for the processing to complete. Calling result() also re-raises any exception raised by a worker, so
    # failed planes do not go unnoticed.
    for future in futures:
        _ = future.result()

### Step 3: Post-processing the data

This is a comparatively minor step that optionally assembles a unified output dataset. Specifically, it allows
assembling all processed data into an .nwb file or a `combined` folder. The latter type of processing is required if
you intend to also process the session with the multi-day pipeline. It is safe to 'combine' sessions even if they only
have a single plane.

In [None]:
# Runs the post-processing step for the session described by the ops.npy file at 'ops_path', assembling the
# processed plane data into a unified output.
suite2p.single_day.combine_planes(ops_path=ops_path)

## Running suite2p via terminal
All suite2p functions used in the demonstration above can be called via the terminal Command-Line Interface (CLI)
installed into the Python environment together with the sl-suite2p package. Specifically, use the `ss2p-config` command
to generate the human-readable pipeline configuration .yaml files (another way of specifying 'ops' parameters). Use
`ss2p-run` to execute the pipeline either as a sequence of all steps or as specific pipeline step(s). See our
[repository](https://github.com/Sun-Lab-NBB/suite2p) for more details and the API documentation for these CLI commands.

**Note!** The CLI commands are especially beneficial for running code on remote compute servers using shell scripts
or similar methods. Complex suite2p runtimes can be entirely coded and executed via shell scripts using the CLI
and precreated configuration files. For more details about running sl-suite2p on remote compute servers,
see the [sl-forgery](https://github.com/Sun-Lab-NBB/sl-forgery) repository.

---