In [16]:
import os
import pandas as pd
import suite2p
import numpy as np

In [17]:
def search_for_files(listOfFactorNames, path):
    """Lazily yield the entries of ``path`` whose name contains a given substring.

    Parameters
    ----------
    listOfFactorNames : iterable of str
        Substrings to look for in each entry name.
    path : str
        Directory whose immediate entries (as returned by ``os.listdir``) are
        inspected; the search is not recursive.

    Yields
    ------
    str
        The bare entry name (not joined with ``path``). An entry is yielded at
        most once, even when several substrings match it.
    """
    for entry_name in os.listdir(path):
        # any() stops at the first matching substring, so each entry is
        # reported a single time.
        if any(substring in entry_name for substring in listOfFactorNames):
            yield entry_name

In [266]:
def pre_process_calcium(parent_directory, file_substring, ops_substring=None, identifiers=None, binary_file='data.bin', refined_file='Fall.mat'):
    """Sort movie directories by suite2p processing state and optionally run suite2p.

    Every directory under ``parent_directory`` that directly contains a file
    whose name includes ``file_substring`` is treated as a movie directory.
    Each movie directory (including everything below it) is then categorized:

    * refined     - some folder holds both ``binary_file`` and ``refined_file``
    * processed   - some folder holds ``binary_file`` but not ``refined_file``
    * unprocessed - no folder holds ``binary_file``

    The unprocessed directories are printed, and after the user confirms by
    typing ``1`` at the prompt, ``suite2p.run_s2p`` is called on each of them.

    Parameters
    ----------
    parent_directory : str
        Top-level directory that is walked recursively.
    file_substring : str
        Substring present in every movie file name (e.g. ``'.tiff'``).
    ops_substring : str, optional
        Substring selecting the ops file to use. Only entries located directly
        in ``parent_directory`` whose name contains ``'ops'`` are considered;
        when several match, the last one listed is used. If ``None``, suite2p
        is run without an explicit ``ops`` argument.
    identifiers : dict, optional
        Accepted for backward compatibility but currently unused: the previous
        implementation computed a match flag from it and discarded the result.
        TODO: print or return the matched identifiers.
    binary_file : str, optional
        File name that marks a folder as processed.
    refined_file : str, optional
        File name that, alongside ``binary_file``, marks a folder as refined.

    Returns
    -------
    dict
        ``'all'``, ``'refined'``, ``'processed'`` and ``'unprocessed'`` map to
        dictionaries keyed by directory path. Values are empty dictionaries,
        except in ``'unprocessed'`` where the value becomes the ops file path
        when one was found. ``'duplicate'`` is a list of the directories that
        were categorized as both refined and processed.

    Raises
    ------
    FileNotFoundError
        If the run was confirmed and ``ops_substring`` was given, but no
        matching ops file exists in ``parent_directory``.
    """
    ## Find all movie directories ##
    movie_directories = {
        root: {}
        for root, _, files in os.walk(parent_directory)
        if any(file_substring in f for f in files)
    }

    ## Categorize every movie directory by the suite2p outputs found below it ##
    refined_directories = {}
    processed_directories = {}
    unprocessed_directories = {}
    for directory in movie_directories:
        categorized = False
        # os.walk already lists the files of each folder, so no extra listing is needed
        for _, _, sd_files in os.walk(directory):
            if binary_file not in sd_files:
                continue
            categorized = True
            if refined_file in sd_files:  # binary + refined file -> refined
                refined_directories[directory] = {}
            else:  # binary only -> processed but not refined
                processed_directories[directory] = {}
        if not categorized:
            unprocessed_directories[directory] = {}

    ## Flag directories that appeared in multiple processing categories ##
    # An unprocessed directory is by construction in no other category, so the
    # only possible overlap is refined vs. processed (e.g. one plane refined,
    # another not). Computed without pandas: DataFrame.value_counts() changed
    # its result name in pandas 2.0, which broke the former `dir_counts[0]`.
    duplicates = [d for d in refined_directories if d in processed_directories]

    ## Find the ops file (it is the same for every unprocessed directory) ##
    ops_path = None
    if ops_substring is not None and unprocessed_directories:
        for f in search_for_files(['ops'], parent_directory):  # search for suite2p ops file
            if ops_substring in f:
                ops_path = os.path.join(parent_directory, f)  # last match wins
        if ops_path is not None:
            for upd in unprocessed_directories:
                unprocessed_directories[upd] = ops_path

    ## Run suite2p pipeline in unprocessed directories using ops files ##
    print('Suite2p will run in the following unprocessed directories:')
    for upd in unprocessed_directories:
        print(upd)
    run = input('Continue? Enter 1 if yes: ')
    if run == '1':
        for upd, upd_ops in unprocessed_directories.items():
            if upd in refined_directories:  # defensive: never re-run a refined directory
                continue
            db = {'data_path': [upd]}
            if ops_substring is not None:
                if ops_path is None:
                    raise FileNotFoundError(
                        "No ops file containing %r was found in %r"
                        % (ops_substring, parent_directory)
                    )
                # NOTE: allow_pickle=True can execute arbitrary code stored in
                # the file; only load ops files from a trusted source.
                # The file is re-read for every directory so that each run
                # starts from an unmodified ops dictionary.
                ops = np.load(upd_ops, allow_pickle=True).item()
                suite2p.run_s2p(ops=ops, db=db)
            else:  # run suite2p with default ops if ops_substring is not defined
                suite2p.run_s2p(db=db)

    return {'all': movie_directories,
            'refined': refined_directories, 'processed': processed_directories,
            'unprocessed': unprocessed_directories,
            'duplicate': duplicates}

### Parameters to set ###
- ***parent_directory (str):*** The top level directory you want to start at, to find subdirectories to run the pipeline in
- ***file_substring (str):*** Substring that is found in all of the movie file names (e.g. '.tiff')

### Keyword arguments ###
- ***ops_substring (str):*** A substring that is found in your ops file that you want to use. If not defined, then the default ops parameters will be used.
- ***identifiers (dict):*** A nested dictionary in which each sub-dictionary describes one kind of identifier of your choice (e.g. recording channel). Each sub-dictionary maps an identifier name (e.g. "Red" for the red channel) to a list of substrings that may appear in a directory name and would mark that directory as belonging to that identifier (e.g. the "Red" channel).
- ***binary_file (str):*** The name of the binary output file of suite2p. If not defined, the default "data.bin" will be used
- ***refined_file (str):*** The name of the refined file of suite2p, after manual validation of ROI selection. If not defined, it will assume the "Fall.mat" file is what defines if the ROIs have been "refined" in a given directory

### Returns ###
A dictionary containing all movie directories **(key: 'all')**, refined directories **(key: 'refined')**, processed directories **(key: 'processed')**, unprocessed directories **(key: 'unprocessed')**, and directories that were found in multiple processing categories **(key: 'duplicate')**

In [268]:
# Top-level folder that is walked recursively to find movie directories.
parent_directory = r'D:\three'
# Substring shared by every movie file name.
file_substring = '.tiff'

# Optional overrides; left disabled so the function defaults apply.
# ops_substring = 'expert'
# binary_file = 'data.bin'
# refined_file = 'Fall.mat'

# Substrings that may appear in a directory path, grouped by channel name.
identifiers = {
    'channel': {
        'Green': ['reen', 'REEN', 'XON', 'xon'],
        '  Red': ['ed', 'ED', 'END', 'end', 'OMA', 'oma'],
    },
}

# Lists the unprocessed directories and asks for confirmation before running suite2p.
directories = pre_process_calcium(
    parent_directory=parent_directory,
    file_substring=file_substring,
    identifiers=identifiers,
)

Suite2p will run in the following unprocessed directories:
D:\three\221204\Red\FOV2
D:\three\221205\Green\FOV3
D:\three\221205\Green\FOV4
D:\three\221205\Red\FOV3
D:\three\221205\Red\FOV4
D:\three\221206\Green\FOV3
D:\three\221206\Green\FOV4
D:\three\221206\Red\FOV3
D:\three\221206\Red\FOV4
D:\three\221207\Red\FOV4
D:\three\221209\Green\FOV3_NOVICE_PROBABILISTIC_DETECTION
D:\three\221209\Green\FOV3_NOVICE_PROBABILISTIC_DETECTION2
D:\three\221209\Green\FOV4
D:\three\221209\Red\FOV3_NOVICE_PROBABILISTIC_DETECTION
D:\three\221209\Red\FOV3_NOVICE_PROBABILISTIC_DETECTION2
D:\three\221209\Red\FOV4
D:\three\221221\Green\FOV5
D:\three\221221\Red\FOV5
D:\three\221222\Green\FOV4.2
D:\three\221222\Green\FOV4_EXPERT_PROBABILISTIC_DETECTION
D:\three\221222\Red\FOV4.2
D:\three\221222\Red\FOV4.3
D:\three\221222\Red\FOV4_EXPERT_PROBABILISTIC_DETECTION
D:\three\221223\Green\FOV4
D:\three\221223\Green\FOV6
D:\three\221223\Red\FOV4
D:\three\221223\Red\FOV6
D:\three\221224\Green\FOV4
D:\three\221224\Green