In [1]:
import pathlib
import shlex

import np_tools
import npc_lims
import npc_session

In [None]:
# Root directory whose sub-directories are scanned as sessions; it is also the rsync
# source root, and manifest entries are written relative to it.
SRC = pathlib.Path('//allen/aind/scratch/dynamic-routing/Task 2 pilot')
# Root directory passed to rsync as the destination.
DEST = pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/PilotEphys/Task 2 pilot')

# Optional cap (in TB) on the cumulative size of sessions selected for moving.
# It is compared against the running total as MAX_TB_TO_MOVE * 1024; None disables the cap.
MAX_TB_TO_MOVE = None

In [21]:
import concurrent.futures    
import dataclasses

@dataclasses.dataclass()
class Info:
    """A session directory together with its measured size.

    Attributes:
        size: size of the directory, as returned by ``np_tools.dir_size_gb``.
        path: location of the session directory.
    """

    size: float
    path: pathlib.Path
    
def get_session_dir_info(session_dir):
    """Measure one candidate session directory.

    Returns an ``Info`` holding the directory's path and its size as reported by
    ``np_tools.dir_size_gb``, or ``None`` when ``session_dir`` is not a directory.
    """
    if session_dir.is_dir():
        return Info(
            path=session_dir,
            size=np_tools.dir_size_gb(session_dir),
        )
    return None
    
# Size every entry under SRC in parallel; non-directories come back as None and are dropped.
with concurrent.futures.ThreadPoolExecutor() as executor:
    dirs = [
        result
        for result in executor.map(get_session_dir_info, SRC.iterdir())
        if result
    ]

In [22]:
# Pick non-empty sessions in scan order, stopping once the running total reaches the
# optional cap (the session that crosses the cap is still included).
move_sessions = []
move_size = 0
for session_info in dirs:
    if session_info.size != 0:
        move_size += session_info.size
        move_sessions.append(session_info)
        if MAX_TB_TO_MOVE is not None and move_size >= MAX_TB_TO_MOVE * 1024:
            break
# Total size of everything selected, displayed as the cell output
sum(selected.size for selected in move_sessions)

5227.500000000004

In [23]:
import datetime

# Directory that receives the per-session manifest (.txt), rsync log (.log)
# and sbatch script (.sh) files generated below.
scratch = pathlib.Path('//allen/programs/mindscope/workgroups/dynamicrouting/ben/vast_transfers')
scratch.mkdir(parents=True, exist_ok=True)
# Captured once, so every file name generated from this value carries the same timestamp.
dt = datetime.datetime.now()

def get_file_manifest(info: Info) -> list[pathlib.Path]:
    """Return every path found recursively beneath the session directory.

    The result comes from ``rglob('*')`` on ``info.path``, so it contains
    sub-directories as well as files.
    """
    return [*info.path.rglob('*')]

def write_manifest(info: Info) -> None:
    """Write the session's manifest: one SRC-relative POSIX path per line.

    The file is written with explicit UTF-8 encoding and LF line endings so
    that it is read identically on the Linux side regardless of the platform
    (and locale code page) the notebook runs on. Without an explicit encoding,
    non-ASCII file names written on Windows would not match the names rsync
    sees on disk.

    Args:
        info: session directory whose contents are listed.
    """
    manifest_path = get_manifest_path(info)
    relative_entries = (
        p.relative_to(SRC).as_posix() for p in get_file_manifest(info)
    )
    manifest_path.write_text(
        '\n'.join(relative_entries),
        encoding='utf-8',
        newline='\n',  # avoid the \r\n default when writing on Windows
    )

def get_hpc_output_path(info: Info) -> pathlib.Path:
    """Return the ``.log`` path in ``scratch`` for this session.

    The file name is the session directory's name followed by the timestamp
    held in ``dt``. The other per-session paths are derived from this one by
    swapping the suffix.
    """
    filename = f'{info.path.name}_{dt:%Y-%m-%d_%H%M}.log'
    return scratch / filename

def get_manifest_path(info: Info) -> pathlib.Path:
    """Return the session's manifest (``.txt``) path, creating the file if absent.

    Note that calling this has a side effect: the file is touched every time.
    """
    manifest_path = get_hpc_output_path(info).with_suffix('.txt')
    manifest_path.touch()  # exist_ok defaults to True
    return manifest_path

def get_log_path(info: Info) -> pathlib.Path:
    """Return the session's rsync log (``.log``) path, creating the file if absent.

    Note that calling this has a side effect: the file is touched every time.
    """
    log_path = get_hpc_output_path(info)
    log_path.touch()  # exist_ok defaults to True
    return log_path

def get_shell_script_path(info: Info) -> pathlib.Path:
    """Return the session's sbatch script (``.sh``) path, creating the file if absent.

    Note that calling this has a side effect: the file is touched every time.
    """
    script_path = get_hpc_output_path(info).with_suffix('.sh')
    script_path.touch()  # exist_ok defaults to True
    return script_path

def get_rsync_cmd(info: Info) -> str:
    """Build the rsync command line that moves one session from SRC to DEST.

    The command transfers the entries listed in the session's manifest file
    (paths relative to SRC) and writes rsync's log to the session's log file.

    Every path is passed through ``shlex.quote`` because the command is
    embedded verbatim in a bash script: session folder names may contain
    spaces or parentheses (e.g. ``'... (2)'``), which would otherwise split
    the ``--log-file`` / ``--files-from`` arguments or be a bash syntax error.

    Args:
        info: session directory to move.

    Returns:
        A single-line shell command string.
    """
    # The leading '//' of the network paths is passed through unchanged.
    src = SRC.as_posix()
    dest = DEST.as_posix()
    log_file = get_log_path(info).as_posix()
    manifest = get_manifest_path(info).as_posix()
    rsync_cmd = ' '.join(
        [
            'rsync',
            '-Larv',
            '--remove-source-files',
            shlex.quote(f'--log-file={log_file}'),
            shlex.quote(f'--files-from={manifest}'),
            shlex.quote(src),
            shlex.quote(dest),
        ]
    )
    # -a archive mode
    # -r recursive (for dirs)
    # -v verbose
    # -L copy the data that symlinks point to
    # --remove-source-files deletes source files after copying, but not dirs
    return rsync_cmd

def get_shell_script_cmd(info: Info) -> str:
    """Return the text of the bash script submitted to the scheduler for one session.

    The script consists of a fixed header of ``#SBATCH`` directives (job name,
    failure e-mail, single task, memory, time limit, output file, partition,
    temporary space), followed by the rsync command from ``get_rsync_cmd(info)``
    bracketed by a few diagnostic commands (``pwd``, ``hostname``, ``date``).

    Args:
        info: session directory the script will move.

    Returns:
        The complete script as a single string.
    """
    script = f"""#!/bin/bash
#SBATCH --job-name=npexp_to_incoming                        # Job name
#SBATCH --mail-type=FAIL                                    # Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=ben.hardcastle@alleninstitute.org       # Where to send mail  
#SBATCH --ntasks=1                                          # Run on a single CPU
#SBATCH --mem=8gb                                           # Job memory request (per node)
#SBATCH --time=20:00:00                                     # Time limit hrs:min:sec
#SBATCH --output=vast_to_dynamicrouting_%j.log              # Standard output and error log
#SBATCH --partition braintv                                 # Partition used for processing
#SBATCH --tmp=100M                                          # Request the amount of space your jobs needs on /scratch/fast

pwd; hostname; date

echo 'Running rsync job on a single thread'

{get_rsync_cmd(info)}

date
"""
    return script

def write_shell_script(info: Info) -> None:
    """Write the session's sbatch script to its ``.sh`` path.

    The script embeds session paths, so it is written with explicit UTF-8
    encoding rather than the platform's locale code page. This keeps
    non-ASCII path characters intact when the script is generated on Windows
    and executed on Linux.

    Args:
        info: session directory the script will move.
    """
    # if writing on Windows, newline==\r\n by default, which isn't compatible with bash on linux
    get_shell_script_path(info).write_text(
        get_shell_script_cmd(info),
        encoding='utf-8',
        newline='\n',
    )

def submit_job(info: Info) -> None:
    """Submit the session's sbatch script to the scheduler.

    The script path is shell-quoted before being placed in the ``sbatch``
    command, because the file name is derived from the session folder name
    and may contain spaces or parentheses that the remote shell would
    otherwise split or reject.

    Args:
        info: session whose previously written script is submitted.
    """
    # NOTE(review): np_tools.hpc is used as a context manager yielding an object
    # with .run(); presumably an SSH connection to the cluster - confirm.
    with np_tools.hpc as ssh:
        script_path = get_shell_script_path(info).as_posix()
        ssh.run(f'sbatch {shlex.quote(script_path)}')

def process(info: Info) -> None:
    """Write the files a session's job needs: first the manifest, then the sbatch script."""
    for write_step in (write_manifest, write_shell_script):
        write_step(info)

# Write every session's manifest and script in parallel.
with concurrent.futures.ThreadPoolExecutor() as executor:
    pending = [executor.submit(process, session_info) for session_info in move_sessions]
    for finished in concurrent.futures.as_completed(pending):
        finished.result()  # wait for completion / re-raise any exception from the worker

# submit jobs to HPC in series to avoid overloading the scheduler
for info in move_sessions:
    submit_job(info)

Submitted batch job 22041515
Submitted batch job 22041516
Submitted batch job 22041517
Submitted batch job 22041518
Submitted batch job 22041519
Submitted batch job 22041520
Submitted batch job 22041521
Submitted batch job 22041522
Submitted batch job 22041523
Submitted batch job 22041524
Submitted batch job 22041525
Submitted batch job 22041526
Submitted batch job 22041527
Submitted batch job 22041528
Submitted batch job 22041529
Submitted batch job 22041530
Submitted batch job 22041531
Submitted batch job 22041532
Submitted batch job 22041533
Submitted batch job 22041534
Submitted batch job 22041535
Submitted batch job 22041536
Submitted batch job 22041537
Submitted batch job 22041538
Submitted batch job 22041539
Submitted batch job 22041540
Submitted batch job 22041541
Submitted batch job 22041542
Submitted batch job 22041543
Submitted batch job 22041640
Submitted batch job 22041647
Submitted batch job 22041648
Submitted batch job 22041649
Submitted batch job 22041650
Submitted batc

In [17]:
# Inspect whatever `info` is currently bound to in the kernel
# (the submission loop leaves it on the last session it iterated over).
info

Info(size=44.3, path=WindowsPath('//allen/aind/scratch/dynamic-routing/Task 2 pilot/DRpilot_668755_20230829_surface_channels (2)'))

In [9]:
# Show the manifest path for `info` (note: the call also touches the file).
get_manifest_path(info)

WindowsPath('//allen/programs/mindscope/workgroups/dynamicrouting/ben/vast_transfers/741148_2024-10-18_2024-11-26_1032.txt')

In [None]:
# Preview the sbatch script text generated for `info`.
get_shell_script_cmd(info)