In [2]:
import np_tools
import pathlib
import npc_lims
import npc_session

In [3]:
# Folder whose immediate sub-folders are scanned for sessions to transfer.
SRC = pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/PilotEphys/Task 2 pilot')

# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
MIN_TB_TO_MOVE = 20

# IDs of sessions whose `issues` include the tracked npc_lims issue below;
# these sessions are left out of the transfer.
sessions_to_skip = [
    record.id
    for record in npc_lims.get_session_info()
    if "https://github.com/AllenInstitute/npc_lims/issues/5" in record.issues
]
sessions_to_skip.sort()

In [8]:
# Show the session IDs that get_session_dir_info will exclude.
sessions_to_skip

['628801_2022-09-20',
 '670180_2023-07-25',
 '670181_2023-07-20',
 '681532_2023-10-19',
 '686176_2023-12-06',
 '702131_2024-02-27']

In [4]:
import concurrent.futures    
import dataclasses

@dataclasses.dataclass
class Info:
    """Summary of one session folder found while scanning SRC."""
    # Folder size as returned by np_tools.dir_size_gb (presumably GB, going by the helper's name).
    size: float
    # Path to the session folder itself.
    path: pathlib.Path
    # Value of the session record's `.date` attribute.
    date: str
    # Session record parsed from the folder path.
    session: npc_session.SessionRecord
    
def get_session_dir_info(session_dir: pathlib.Path) -> "Info | None":
    """Describe one candidate session folder, or return None if it should be ignored.

    Args:
        session_dir: a path found directly under the source folder.

    Returns:
        An `Info` holding the folder's size, path, session date and session
        record, or None when the path is not a directory, cannot be parsed
        into a session record, or belongs to a session in `sessions_to_skip`.
    """
    if not session_dir.is_dir():
        return None
    try:
        session = npc_session.SessionRecord(session_dir)
    except Exception:
        # Folder name does not describe a session. Catch `Exception` rather
        # than using a bare `except`, so KeyboardInterrupt / SystemExit still
        # stop a long-running scan instead of being silently swallowed.
        return None
    if session in sessions_to_skip:
        return None
    return Info(
        size=np_tools.dir_size_gb(session_dir),
        path=session_dir,
        date=session.date,
        session=session,
    )
    
# Scan every entry under SRC in parallel; entries that get_session_dir_info
# rejects come back as None and are dropped.
with concurrent.futures.ThreadPoolExecutor() as executor:
    dirs = list(filter(None, executor.map(get_session_dir_info, SRC.iterdir())))

In [5]:
# Keep only session folders whose size lies in the inclusive range 10-200
# (units are whatever np_tools.dir_size_gb reports).
test_ephys_sessions = [entry for entry in dirs if 10 <= entry.size <= 200]
# Total size of the selected sessions.
sum(entry.size for entry in test_ephys_sessions)

7628.800000000002

In [6]:
import datetime

# Destination root passed to rsync.
DEST = pathlib.Path('//allen/aind/scratch/dynamic-routing/Task 2 pilot')

# Folder that holds the per-session manifest (.txt), rsync log (.log) and sbatch script (.sh).
scratch = pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/ben/vast_transfers')
scratch.mkdir(parents=True, exist_ok=True)

# Captured once so that every file written by this run shares the same timestamp.
dt = datetime.datetime.now()

def get_file_manifest(info: Info) -> list[pathlib.Path]:
    """Return every path found recursively below the session folder.

    The result contains whatever `rglob('*')` yields, so sub-directories are
    listed alongside files.
    """
    return [entry for entry in info.path.rglob('*')]

def write_manifest(info: Info) -> None:
    """Write the session's manifest file: one SRC-relative POSIX path per line."""
    manifest_path = get_manifest_path(info)
    relative_paths = [
        entry.relative_to(SRC).as_posix() for entry in get_file_manifest(info)
    ]
    # Force '\n' line endings regardless of the platform the notebook runs on.
    manifest_path.write_text('\n'.join(relative_paths), newline='\n')

def get_hpc_output_path(info: Info) -> pathlib.Path:
    """Return the `.log` path in `scratch` for this session and this run.

    The file name is the session record followed by the run timestamp `dt`
    (minute resolution). The manifest and shell-script paths are derived from
    this one by swapping the suffix.
    """
    timestamp = f'{dt:%Y-%m-%d_%H%M}'
    return scratch / f'{info.session}_{timestamp}.log'

def get_manifest_path(info: Info) -> pathlib.Path:
    """Return the session's manifest (`.txt`) path, creating the file if it is missing."""
    manifest_path = get_hpc_output_path(info).with_suffix('.txt')
    manifest_path.touch(exist_ok=True)
    return manifest_path

def get_log_path(info: Info) -> pathlib.Path:
    """Return the session's rsync log (`.log`) path, creating the file if it is missing."""
    log_path = get_hpc_output_path(info)
    log_path.touch(exist_ok=True)
    return log_path

def get_shell_script_path(info: Info) -> pathlib.Path:
    """Return the session's sbatch script (`.sh`) path, creating the file if it is missing."""
    script_path = get_hpc_output_path(info).with_suffix('.sh')
    script_path.touch(exist_ok=True)
    return script_path

def get_rsync_cmd(info: Info) -> str:
    """Build the rsync command line that moves one session's files to DEST.

    The command reads the list of paths to transfer from the session's
    manifest file and writes its log to the session's log file. Calling this
    function creates both files if they do not exist yet, because the path
    helpers touch them.

    Args:
        info: the session to transfer.

    Returns:
        The complete rsync command as a single string.
    """
    # Manifest entries are written relative to SRC (see write_manifest), and
    # rsync resolves every --files-from entry against the source argument.
    # The source therefore has to be SRC: using the session folder itself
    # would make rsync look for "<session>/<session>/..." and find nothing.
    src = SRC.as_posix()
    dest = DEST.as_posix()
    rsync_cmd = f'rsync -Larv --remove-source-files --log-file={get_log_path(info).as_posix()} --files-from={get_manifest_path(info).as_posix()} "{src}" "{dest}"'
    # -a archive mode
    # -r recursive (for dirs)
    # -v verbose
    # -L copy the data that symlinks point to
    # --remove-source-files deletes source files after copying, but not dirs
    return rsync_cmd

def get_shell_script_cmd(info: Info) -> str:
    """Return the text of the bash script submitted with `sbatch` for one session.

    The script starts with `#SBATCH` directives (job name, mail settings,
    task count, memory, time limit, output log, partition, tmp space), then
    prints the working directory, host name and date, runs the command built
    by `get_rsync_cmd(info)`, and prints the date again.
    """
    script = f"""#!/bin/bash
#SBATCH --job-name=npexp_to_incoming                        # Job name
#SBATCH --mail-type=FAIL                                    # Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=ben.hardcastle@alleninstitute.org       # Where to send mail  
#SBATCH --ntasks=1                                          # Run on a single CPU
#SBATCH --mem=4gb                                           # Job memory request (per node)
#SBATCH --time=20:00:00                                     # Time limit hrs:min:sec
#SBATCH --output=dynamicrouting_to_vast%j.log               # Standard output and error log
#SBATCH --partition braintv                                 # Partition used for processing
#SBATCH --tmp=100M                                          # Request the amount of space your jobs needs on /scratch/fast

pwd; hostname; date

echo 'Running rsync job on a single thread'

{get_rsync_cmd(info)}

date
"""
    return script

def write_shell_script(info: Info) -> None:
    """Write the session's sbatch script to its `.sh` path."""
    script_path = get_shell_script_path(info)
    script_text = get_shell_script_cmd(info)
    # Force '\n' line endings: when writing on Windows the default is '\r\n',
    # which isn't compatible with bash on linux.
    script_path.write_text(script_text, newline='\n')

def submit_job(info: Info) -> None:
    """Submit the session's shell script with `sbatch` through the `np_tools.hpc` connection."""
    with np_tools.hpc as ssh:
        script_path = get_shell_script_path(info).as_posix()
        ssh.run(f'sbatch {script_path}')

def process(info: Info) -> None:
    """Prepare one session for transfer: write its manifest, then its sbatch script."""
    for write_step in (write_manifest, write_shell_script):
        write_step(info)

# Write every session's manifest and sbatch script in parallel threads.
with concurrent.futures.ThreadPoolExecutor() as executor:
    pending = [executor.submit(process, t) for t in test_ephys_sessions]
    for future in concurrent.futures.as_completed(pending):
        _ = future.result() # wait for completion / re-raise any worker exception

# Submit the jobs to the HPC one at a time, to avoid overloading the scheduler.
for info in test_ephys_sessions:
    submit_job(info)

Submitted batch job 21568333
Submitted batch job 21568337
Submitted batch job 21568343
Submitted batch job 21568345
Submitted batch job 21568349
Submitted batch job 21568350
Submitted batch job 21568354
Submitted batch job 21568355
Submitted batch job 21568357
Submitted batch job 21568360
Submitted batch job 21568363
Submitted batch job 21568365
Submitted batch job 21568367
Submitted batch job 21568376
Submitted batch job 21568378
Submitted batch job 21568382
Submitted batch job 21568383
Submitted batch job 21568384
Submitted batch job 21568385
Submitted batch job 21568388
Submitted batch job 21568389
Submitted batch job 21568390
Submitted batch job 21568394
Submitted batch job 21568403
Submitted batch job 21568410
Submitted batch job 21568417
Submitted batch job 21568425
Submitted batch job 21568432
Submitted batch job 21568440
Submitted batch job 21568447
Submitted batch job 21568452
Submitted batch job 21568485
Submitted batch job 21568492
Submitted batch job 21568498
Submitted batc

In [63]:
# Inspect the generated sbatch script for `info`, which at this point is whatever
# the last submission loop left it bound to (the final session submitted).
get_shell_script_cmd(info)

'#!/bin/bash\n#SBATCH --job-name=npexp_to_incoming                        # Job name\n#SBATCH --mail-type=END,FAIL                                # Mail events (NONE, BEGIN, END, FAIL, ALL)\n#SBATCH --mail-user=ben.hardcastle@alleninstitute.org       # Where to send mail  \n#SBATCH --ntasks=1                                          # Run on a single CPU\n#SBATCH --mem=4gb                                           # Job memory request (per node)\n#SBATCH --time=20:00:00                                     # Time limit hrs:min:sec\n#SBATCH --output=dynamicrouting_to_vast%j.log               # Standard output and error log\n#SBATCH --partition braintv                                 # Partition used for processing\n#SBATCH --tmp=100M                                          # Request the amount of space your jobs needs on /scratch/fast\n\npwd; hostname; date\n\necho \'Running rsync job on a single thread\'\n\nrsync -Larv --remove-source-files --log-file=//allen/programs/mindscope/workgro

'rsync -Larv --remove-source-files --log-file=//allen/programs/mindscope/workgroups/dynamicrouting/ben/vast_transfers/2024-10-23_1907.log --files-from=//allen/programs/mindscope/workgroups/dynamicrouting/ben/vast_transfers/2024-10-23_1907.txt "//allen/programs/mindscope/workgroups/dynamicrouting/PilotEphys/Task 2 pilot" "//allen/aind/scratch/dynamic-routing/Task 2 pilot"'

1285

Submitted batch job 21567168
