# Bruker Preprocessing Pipeline for Pilot Girder

Libraries

In [None]:
# Built-ins
from pathlib import *
import itertools
import json
import os
import time
# VIP
from vip_client.utils import vip
# Girder
from girder_client import GirderClient, HttpError

## Parameters

**User variables**: should be checked and supplied at each execution

In [None]:
# API keys, read from the environment of the notebook kernel.
# A missing variable raises `KeyError` as soon as this cell is run.
VIP_KEY = os.environ["VIP_API_KEY"] # VIP API key, handed to `vip.setApiKey()` in the main procedure
GIRDER_KEY = os.environ["GIRDER_API_KEY"] # Girder API key, handed to `GIRDER_CLIENT.authenticate()` in the main procedure

# Extra parameters sent to the preprocessing pipeline, in addition to the rawdata files.
# They are merged into the VIP inputs of every workflow and recorded in the output metadata.
PARAMETERS = {
    # [Argument]: [value]
    "outname"   : "sigproc",
    # Put additional parameters below
}

**Constant variables**: should not be changed unless the dataset structure or the pipeline have been modified

In [None]:
# Identifier of the VIP pipeline that performs the preprocessing
PIPELINE_ID = "Bruker-preproc/0.3"
# Rawdata files looked up in each acquisition, paired with the pipeline argument
# that receives them (the declaration order is preserved)
RAWDATA_FILES = dict([
    # ([File name], [Argument for `PIPELINE_ID`])
    ("acqp", "acqp"),
    ("fid", "fid"),
    ("method", "method"),
    ("rawdata.job0", "rawjob0"),
    ("fid.refscan", "refscan"),
])
# Upper bound on the number of signals grouped in a single VIP submission
MAX_VIP_JOBS = 10
# Girder client bound to the Pilot warehouse (authentication happens in the main procedure)
GIRDER_CLIENT = GirderClient(apiUrl='https://pilot-warehouse.creatis.insa-lyon.fr/api/v1')
# Prefix put in front of Girder IDs when they are passed to VIP
GIRDER_PREFIX = "pilotGirder:"
# Girder path of the MRS dataset to browse
DATASET_PATH = "/collection/ishc-myeline"
# Sequence name to look for in the parent folder names (partial, case-insensitive match)
TARGET_SEQUENCE = "STEAM"
# Name of the Girder folder holding the raw data inside each sequence folder
RAWDATA_FOLDER = "Raw"
# Name of the Girder folder created for the results inside each sequence folder;
# the timestamp is the local time at which this cell is run
OUTPUT_FOLDER = "Proc_" + time.strftime("%y-%m-%d_%H-%M-%S")
# Metadata attached to every output folder
OUTPUT_METADATA = dict(
    pipeline_id=PIPELINE_ID,
    parameters=PARAMETERS,
)
# Description attached to every output folder
OUTPUT_DESCRIPTION = """
Preprocessing with the Crea-PASTIS algorithm on the Virtual Imaging Platform.
https://gitlab.in2p3.fr/pilot/rmn/processing/bruker-spectro-processing-pipeline
https://vip.creatis.insa-lyon.fr
"""

## Procedure

### Main Procedure

Search under `DATASET_PATH` (*ishc-myeline*) for every `RAWDATA_FOLDER` (*Raw*) folder whose parent folder includes the `TARGET_SEQUENCE` (*STEAM*) in its name.

For each new group of `MAX_VIP_JOBS` (*e.g.*, 10) signals found in the dataset : 
- Launch the preprocessing pipeline on those signals, using all `RAWDATA_FILES` found for each signal ;
- While the pipeline is still running, look for the next group of signals to process ;
- When the pipeline is over, save the results in the `OUTPUT_FOLDER` (*Proc_[timestamp]*) of each signal.

Save the signal paths and processing metadata in a local JSON file (`OUTPUT_FOLDER`.json).

In [None]:
# Criterion to detect the target rawdata folder
def is_rawdata_folder(folder: dict) -> bool:
    """
    Tells whether `folder` is a rawdata folder of the target sequence.

    `folder` is a dictionary with at least:
    - "path": a path object (its `name` and `parent.name` are read);
    - "id": the Girder ID of the folder.

    Returns True when all of the following hold:
    - the folder is named exactly `RAWDATA_FOLDER`;
    - the name of its parent contains `TARGET_SEQUENCE` (case-insensitive);
    - every file name listed in `RAWDATA_FILES` is an item of the folder.

    The Girder item listing is only requested when the two name checks pass.
    """
    path = folder["path"]
    if path.name != RAWDATA_FOLDER:
        return False
    if TARGET_SEQUENCE.lower() not in path.parent.name.lower():
        return False
    # TODO: SUPPRESS BELOW WHEN OPTIONAL FILES WILL BE SUPPORTED BY VIP ------------------------------------
    # Check that all required input files are present.
    # `<=` (subset) and not `<` (proper subset): a folder that contains exactly
    # the required files and nothing else must be accepted too.
    item_names = {item["name"] for item in GIRDER_CLIENT.listItem(folderId=folder["id"])}
    return set(RAWDATA_FILES) <= item_names
    # ------------------------------------------------------------------------------------------------------
# ------------------------------------------------  

# Generator to find each rawdata folder matching the criterion
def all_rawdata_folders(node) -> dict:
    """
    Generator walking the Girder tree below `node`, depth first.

    `node` is a dictionary with the keys "path", "id" and "type"; every yielded
    value is a dictionary with the keys "path", "id", "parentId" and "type"
    describing one folder accepted by `is_rawdata_folder()`.
    Accepted folders are not explored any further; the others are searched recursively.
    """
    # Short pause before each listing to avoid flooding Girder with requests
    time.sleep(0.1)
    children = GIRDER_CLIENT.listFolder(parentId=node["id"], parentFolderType=node["type"])
    for child in children:
        # Keep only the information needed downstream
        child_data = dict(
            path=node["path"] / child["name"],
            id=child["_id"],
            parentId=child["parentId"],
            type=child["_modelType"],
        )
        if not is_rawdata_folder(child_data):
            # Not a rawdata folder: look deeper
            yield from all_rawdata_folders(child_data)
            continue
        yield child_data
# ------------------------------------------------  

# Function to extract the preprocessing files from an MRS acquisition folder
def get_files(folder: dict) -> dict:
    """
    Collects the rawdata files available in `folder` (read through `folder["id"]`).
    Returns a dictionary mapping each pipeline argument to the Girder file to use,
    written as `GIRDER_PREFIX` followed by the file ID.
    Items whose name is not listed in `RAWDATA_FILES` are ignored.
    """
    files_by_argument = {}
    for item in GIRDER_CLIENT.listItem(folderId=folder["id"]):
        item_name = item["name"]
        if item_name not in RAWDATA_FILES:
            continue
        # At most one file is requested for each item
        for girder_file in GIRDER_CLIENT.listFile(itemId=item["_id"], limit=1):
            # [Argument for the pipeline]: [path of the Girder file]
            files_by_argument[RAWDATA_FILES[item_name]] = GIRDER_PREFIX + girder_file["_id"]
    return files_by_argument
# ------------------------------------------------

# Generator to make lists of rawdata files with respect to the maximum number of jobs on VIP
def get_data():
    """
    Generator grouping the rawdata folders of the dataset in batches of at most `MAX_VIP_JOBS`.

    Each iteration yields a tuple `(signals, rawdata_files)` where:
    - `signals` is the list of folder dictionaries of the batch;
    - `rawdata_files` maps each pipeline argument of `RAWDATA_FILES` to a list with
      one entry per signal (the Girder file path, or "" when the file was not found).
    """
    # Resolve the dataset root and start the (lazy) folder search from there
    root = GIRDER_CLIENT.resourceLookup(DATASET_PATH)
    folders = all_rawdata_folders({
        "path": PurePosixPath(DATASET_PATH),
        "id": root["_id"],
        "parentId": "",
        "type": root["_modelType"],
    })
    # Each pass of this loop consumes one whole batch from `folders`
    for head in folders:
        signals = []
        rawdata_files = {param: [] for param in RAWDATA_FILES.values()}
        # The folder already pulled by the `for` statement, then up to MAX_VIP_JOBS - 1 more
        batch = itertools.chain((head,), itertools.islice(folders, MAX_VIP_JOBS - 1))
        for folder in batch:
            signals.append(folder)
            found = get_files(folder)
            # One column per signal: missing files are replaced by an empty string
            for param, values in rawdata_files.items():
                values.append(found.get(param, ""))
        yield signals, rawdata_files
# ------------------------------------------------

# Function to create a Girder folder
def new_output_dir(parentId: str) -> str:
    """
    Creates the folder `OUTPUT_FOLDER` under the Girder parent `parentId`, with
    `OUTPUT_METADATA` as metadata and `OUTPUT_DESCRIPTION` as description.
    Returns the Girder ID of the new folder.

    Any `HttpError` raised by Girder is reported to the user as an already existing
    output folder, then re-raised.
    """
    try:
        created = GIRDER_CLIENT.createFolder(
            parentId=parentId,
            name=OUTPUT_FOLDER,
            metadata=OUTPUT_METADATA,
            description=OUTPUT_DESCRIPTION,
        )
    except HttpError:
        print(f"""(!) Output folder '{OUTPUT_FOLDER}' already exists in parent: '{parentId}' on Girder.
            **Please restart the Notebook or delete the folder on Girder.***""")
        raise
    return created["_id"]
# ------------------------------------------------

# Generator to launch executions on VIP and return the Girder ID of the created folders
def launch_executions() -> dict:
    """
    Generator submitting the dataset to VIP, one batch of signals at a time.

    For each batch returned by `get_data()`:
    - waits until the previously launched workflow is no longer "Running";
    - creates one output folder per signal on Girder;
    - yields a dictionary mapping the sequence folder path (string) to the
      Girder ID of its new output folder;
    - launches a VIP workflow on the batch once the caller resumes the generator.
    After the last batch, waits for the last workflow before ending.
    """
    def wait_for(workflow, message):
        # Poll VIP every 5 seconds while the workflow is reported as running
        print(message, end="", flush=True)
        while vip.execution_info(workflow)["status"] == "Running":
            time.sleep(5)
        print("Done.")

    # Identifier of the workflow currently running on VIP (none yet)
    workflow_id = None
    print("Looking for the first acquisitions... ", end="", flush=True)
    for signals, rawdata_files in get_data():
        # Closes the "Looking for ..." message printed before the search
        print("Done.")
        # Only one workflow at a time: wait for the previous one
        if workflow_id:
            wait_for(
                workflow_id,
                "- Waiting for the current workflow to end (this can be monitored on the VIP website)... ",
            )
        sequence_paths = [str(sig["path"].parent) for sig in signals]
        print("\nNew signals:\n\t", "\n\t".join(sequence_paths), sep="")
        # One output folder per signal, created next to its rawdata folder
        print("- Creating the output folders on Girder... ", end="", flush=True)
        output_dirs = {
            sequence_path: new_output_dir(sig["parentId"])
            for sequence_path, sig in zip(sequence_paths, signals)
        }
        print("Done.")
        # Hand the folder IDs to the caller before launching anything on VIP
        yield output_dirs
        # Complete the VIP inputs with the user's parameters and the output directories
        rawdata_files.update(PARAMETERS)
        rawdata_files["results-directory"] = [
            GIRDER_PREFIX + folder_id for folder_id in output_dirs.values()
        ]
        # Launch a new workflow on VIP
        print("- Launching a new workflow on VIP... ", end="", flush=True)
        workflow_id = vip.init_exec_without_resultsLocation(
            pipeline=PIPELINE_ID,
            name=OUTPUT_FOLDER,
            inputValues=rawdata_files,
        )
        print("Done. Current workflow:", workflow_id)
        # The next batch is searched while the workflow runs
        print("- Looking for new signals to process... ", end="", flush=True)
    # The dataset has been entirely browsed
    print("No more signals to process.")
    # Wait for the last workflow, if any was launched
    if workflow_id:
        wait_for(workflow_id, "Waiting for the current workflow to end ... ")
    print("END")
# ------------------------------------------------

# Main procedure
# Main procedure: authenticate, run every batch on VIP, and always save the run metadata
if __name__ == "__main__":
    # Handshake with VIP
    vip.setApiKey(VIP_KEY)
    # Handshake with Girder
    GIRDER_CLIENT.authenticate(apiKey=GIRDER_KEY)
    # Launch executions on VIP
    all_outputs = {} # [sequence folder path]: [Girder ID of its output folder]
    try:
        # Outputs are recorded at each iteration so that they are known even if a later batch fails
        for output_dirs in launch_executions():
            all_outputs.update(output_dirs)
    finally:
        # The metadata are saved in a JSON file in every case (errors still propagate),
        # so that the created folders can be handled later automatically.
        # A new dictionary is built instead of updating OUTPUT_METADATA in place:
        # that constant is also sent as metadata of the Girder folders and must not
        # accumulate the "name" and "outputs" entries of a previous run.
        run_metadata = {
            **OUTPUT_METADATA,
            "name": OUTPUT_FOLDER,
            "outputs": all_outputs,
        }
        with open(OUTPUT_FOLDER + ".json", 'w') as save_file:
            json.dump(run_metadata, save_file, indent=4)

### Database Cleanup
Run the following cell *only* to **erase previous results on Girder**

In [None]:
# Name of the output folder to suppress under every signal
output_name = "Proc_23-09-04_15-18-34" # ex: "Proc_23-07-25_17-03-59"
# Delete the folders listed in the JSON file saved by the main procedure
if output_name:
    metadata_file = output_name + ".json"
    # Open the JSON file
    with open(metadata_file, 'r') as save_file:
        metadata = json.load(save_file)
    # Browse outputs: [sequence folder path]: [Girder ID of the output folder]
    print("Deleting the output folders on Girder...")
    for folderPath, folder_id in metadata["outputs"].items():
        print(folderPath, ": ", end="")
        # Delete the folder
        try:
            rep = GIRDER_CLIENT.delete(path="folder/" + folder_id)
        except HttpError as e:
            # Errors other than 400 are not expected here: let them propagate
            if e.status != 400:
                raise
            # `rep` is not assigned when the request fails, so the error itself is reported.
            # NOTE(review): girder_client's HttpError is expected to carry the server reply
            # in `responseText`; `str(e)` is used as a fallback -- confirm against the client version.
            print(getattr(e, "responseText", None) or str(e))
        else:
            print(rep["message"])
    print("Done.")
    # The metadata file is removed once every folder has been handled
    os.remove(metadata_file)