# Clear raw and sorted .dat files on np-exp based on filesize comparison alone

Assumes that transfers to lims have been performed with checksum validation at
every stage.

Does not touch npx2 files - some of which pre-date checksum validated transfers,
and some are known to have bad copies

In [7]:
import contextlib
import pathlib

import np_config
import np_session

NPEXP_ROOT = pathlib.Path(
    "//allen/programs/mindscope/workgroups/np-exp"
)
assert NPEXP_ROOT.exists()

DRYRUN = True
unlinked_paths = []

def size(path: pathlib.Path) -> int:
    return sum(
        f.stat().st_size
        for f in path.rglob('*')
        if f.is_file()
    ) if path.is_dir() else path.stat().st_size
    
def sizes_match(p1, p2):
    return size(p1) == size(p2)

def unlink(path):
    if DRYRUN:
        unlinked_paths.append(path)
        return
    if path.is_dir():
        while path.exists():
            for _ in path.iterdir():
                with contextlib.suppress(OSError):
                    unlink(_)
                    unlinked_paths.append(path)
            path.rmdir()
    else:
        path.unlink()
        unlinked_paths.append(path)

Delete raw ephys data that's already on lims (judging by file size):

In [4]:
import itertools


total_bytes = 0  
for npexp_path in NPEXP_ROOT.iterdir():
    try:
        session = np_session.PipelineSession(npexp_path)
    except np_session.SessionError:
        continue
    
    if not session.lims_path:
        continue
    
    for npexp_file in itertools.chain(npexp_path.rglob('*.npx2'), npexp_path.rglob('*.dat')):
        if (
            not npexp_file.is_symlink()
            and (lims_copy := session.lims_path / npexp_file.relative_to(npexp_path)).exists()
            and sizes_match(npexp_file, lims_copy)
        ):
            total_bytes += size(npexp_file)
            unlink(npexp_file)
            print(f"{'Dryrun: ' if DRYRUN else ''}Cleared {total_bytes/1024**3:,.0f} GB\r", end='')

Delete sorted probe data that's already on lims (judging by file size and
last-modified time, in case multiple probe uploads have been made):

In [13]:
total_bytes = 0  
for probe in 'ABCDEF':
    for probe_folder in NPEXP_ROOT.rglob(f'*probe{probe}_sorted'):
        
        try:
            session = np_session.Session(probe_folder)
        except np_session.SessionError:
            continue
        
        if not session.lims_path:
            continue
        
        for npexp_path in probe_folder.rglob('*'):
            if npexp_path.is_dir():
                continue
            if (existing := tuple(session.lims_path.rglob(f'*/*_probe{probe}/{npexp_path.relative_to(probe_folder)}') )):
                for lims_path in existing:
                    if (
                        not npexp_file.is_symlink()
                        and sizes_match(lims_path, npexp_path) 
                        and lims_path.stat().st_mtime == npexp_path.stat().st_mtime
                    ):
                        total_bytes += size(npexp_path)
                        unlink(npexp_path)
                        print(f"Cleared {total_bytes/1024**3:,.0f} GB\r", end='')
                        break
                        

Cleared 90 GB

Mismatch between session folder strings - file may be in the wrong folder: '\\\\allen\\programs\\mindscope\\workgroups\\np-exp\\1170937835_604910_20220415\\1170937835_366122_20220415_probeA_sorted'
Mismatch between session folder strings - file may be in the wrong folder: '\\\\allen\\programs\\mindscope\\workgroups\\np-exp\\1170937835_604910_20220415\\1170937835_366122_20220415_probeA_sorted'


Cleared 195 GB

Mismatch between session folder strings - file may be in the wrong folder: '\\\\allen\\programs\\mindscope\\workgroups\\np-exp\\1170937835_604910_20220415\\1170937835_366122_20220415_probeB_sorted'
Mismatch between session folder strings - file may be in the wrong folder: '\\\\allen\\programs\\mindscope\\workgroups\\np-exp\\1170937835_604910_20220415\\1170937835_366122_20220415_probeB_sorted'


Cleared 276 GB

Mismatch between session folder strings - file may be in the wrong folder: '\\\\allen\\programs\\mindscope\\workgroups\\np-exp\\1170937835_604910_20220415\\1170937835_366122_20220415_probeC_sorted'
Mismatch between session folder strings - file may be in the wrong folder: '\\\\allen\\programs\\mindscope\\workgroups\\np-exp\\1170937835_604910_20220415\\1170937835_366122_20220415_probeC_sorted'


Cleared 362 GB

Mismatch between session folder strings - file may be in the wrong folder: '\\\\allen\\programs\\mindscope\\workgroups\\np-exp\\1170937835_604910_20220415\\1170937835_366122_20220415_probeD_sorted'
Mismatch between session folder strings - file may be in the wrong folder: '\\\\allen\\programs\\mindscope\\workgroups\\np-exp\\1170937835_604910_20220415\\1170937835_366122_20220415_probeD_sorted'


Cleared 413 GB

AttributeError: 'DRPilotSession' object has no attribute 'lims_path'

In [None]:
unlinked_paths

True