In [3]:
%reload_ext autoreload
%autoreload 2

import os
import shutil
import numpy as np

import matplotlib.pyplot as plt
plt.set_cmap('gray')
plt.rcParams['figure.figsize'] = (12, 10)

from glob import glob
from tqdm.notebook import tqdm
import pydicom
import pandas as pd
import uuid

def get_dcm_metadata(dpath_dcm):
    """Parse one DICOM file located anywhere beneath ``dpath_dcm``.

    The directory tree is searched recursively for ``*.dcm`` files and the
    first path returned by ``glob`` is read with ``pydicom.dcmread``.

    Raises:
        IndexError: if no ``.dcm`` file exists under ``dpath_dcm``.
    """
    pattern = '{}/**/*.dcm'.format(dpath_dcm)
    matches = glob(pattern, recursive=True)
    return pydicom.dcmread(matches[0])

def rename_dcm(fpath_dcm, suffix):
    """Append ``suffix`` to the SeriesDescription of a DICOM file, in place.

    The file at ``fpath_dcm`` is read, its SeriesDescription becomes
    ``'<old description> <suffix>'`` and the dataset is saved back to the
    same path.

    The operation is idempotent: when the description already ends with
    ``' <suffix>'`` the file is left untouched, so re-running the notebook
    cell does not stack the suffix (``'... DNE DNE'``).
    """
    dcm = pydicom.dcmread(fpath_dcm)
    current = '{}'.format(dcm.SeriesDescription)
    tail = ' {}'.format(suffix)
    if current.endswith(tail):
        # Already tagged by a previous run; nothing to rewrite.
        return
    dcm.SeriesDescription = current + tail
    dcm.save_as(fpath_dcm)
    
def cleanup_series_desc(fpath_dcm):
    """Strip processing markers from a DICOM file's SeriesDescription.

    Every occurrence of ``'SubtleGad: '`` and ``' SMRboost_reader_study'``
    is removed from the description, and the dataset is saved back to
    ``fpath_dcm``.
    """
    dcm = pydicom.dcmread(fpath_dcm)
    cleaned = dcm.SeriesDescription
    for marker in ('SubtleGad: ', ' SMRboost_reader_study'):
        cleaned = cleaned.replace(marker, '')
    dcm.SeriesDescription = cleaned
    dcm.save_as(fpath_dcm)
    
def get_dcm_ser_nums(dpath_root):
    """Return ``(series_dir, series_number)`` pairs sorted by series number.

    Each sub-directory directly under ``dpath_root`` is treated as one DICOM
    series; its SeriesNumber is read from one ``*.dcm`` file inside it.

    Args:
        dpath_root: directory whose immediate sub-directories hold the series.

    Returns:
        List of ``(directory path, int SeriesNumber)`` tuples in ascending
        SeriesNumber order.

    Raises:
        IndexError: if a series directory contains no ``.dcm`` file.
    """
    ser_nums = []
    # Sorted so that ties on SeriesNumber come out in a reproducible order.
    for dpath in sorted(glob('{}/*'.format(dpath_root))):
        if not os.path.isdir(dpath):
            # Stray files next to the series folders cannot be a series.
            continue
        fp_dcms = sorted(glob('{}/*.dcm'.format(dpath)))
        if not fp_dcms:
            # Fail loudly rather than skip: callers pick series by position,
            # so silently dropping one would select the wrong series.
            raise IndexError('no .dcm files found in series directory {}'.format(dpath))
        # Only the header is needed; skip decoding the pixel data.
        dcm = pydicom.dcmread(fp_dcms[0], stop_before_pixels=True)
        ser_nums.append((dpath, int(dcm.SeriesNumber)))
    ser_nums.sort(key=lambda r: r[1])
    return ser_nums

def generate_uuid():
    """Create a random numeric UID string under a fixed root prefix.

    A random ``uuid4`` integer is reduced modulo a power of ten so that the
    prefix plus the decimal suffix never exceeds 64 characters in total.
    """
    root = "1.2.826.0.1.3680043.10.221."
    max_suffix_digits = 64 - len(root)
    suffix = uuid.uuid4().int % (10 ** max_suffix_digits)
    return '{}{}'.format(root, suffix)

def _set_anon_tags(dcm_ds, case_num, study_uid, series_idx, description):
    """Overwrite the identifying/descriptive header fields of ``dcm_ds`` in place."""
    dcm_ds.PatientID = case_num
    # 'PatientsName' (pre-1.0 pydicom spelling) is intentionally not set: it is
    # not a DICOM keyword, so assigning it never reached the written file.
    dcm_ds.PatientName = case_num
    dcm_ds.StudyInstanceUID = study_uid
    dcm_ds.StudyID = case_num
    dcm_ds.AccessionNumber = case_num
    dcm_ds.SeriesNumber = str(series_idx + 100)
    dcm_ds.OtherPatientIDs = case_num
    # Sequence elements take a list of datasets; an empty list clears it.
    dcm_ds.OtherPatientIDsSequence = []

    dcm_ds.SeriesDescription = description
    dcm_ds.StudyDescription = description
    dcm_ds.ProtocolName = description


def anonymize_study(dirpath_study, sidebyside=False, uid_ref=None):
    """Relabel every DICOM series inside one case directory.

    The series directories under ``dirpath_study`` are ordered by
    SeriesNumber, renamed to ``NN_<description>``, and every ``*.dcm`` file in
    them is rewritten as ``<InstanceNumber:04d>.dcm`` with patient/study
    identifiers replaced by the case directory name.

    Only the tags set in ``_set_anon_tags`` are rewritten; everything else in
    the header (e.g. series/SOP instance UIDs, dates) is left as it was.
    NOTE(review): confirm that this is sufficient for the reader study.

    Args:
        dirpath_study: case directory; its base name is used as the new ID.
        sidebyside: when true, the first series is kept first and series
            2-4 are randomly permuted and labelled ``A``/``B``/``C``.
        uid_ref: optional directory tree; the StudyInstanceUID is copied from
            the first ``*.dcm`` beneath it whose path contains the case ID.
            When omitted, a fresh UID is generated.

    Returns:
        An empty list when ``sidebyside`` is false; otherwise a numpy array
        naming (``'soc'``, ``'gad'``, ``'gad+smr'``) what was placed in
        positions A, B and C.

    Raises:
        ValueError: if the number of series does not fit the description list.
        IndexError: if ``uid_ref`` holds no DICOM file for this case.
    """
    # normpath drops a trailing slash, which would otherwise give an empty ID.
    case_num = os.path.basename(os.path.normpath(dirpath_study))

    ser_desc = ['3D T1', '3D T1 +C']
    if sidebyside:
        ser_desc = ['3D T1', '3D T1 +C - A', '3D T1 +C - B', '3D T1 +C - C']

    ser_ord = get_dcm_ser_nums(dirpath_study)
    shuffle_info = []
    if sidebyside:
        # Validate before touching the disk so a bad case is not left half-renamed.
        if len(ser_ord) < len(ser_desc):
            raise ValueError('{}: side-by-side needs {} series, found {}'.format(
                dirpath_study, len(ser_desc), len(ser_ord)))
        id_str = np.array(['pre', 'soc', 'gad', 'gad+smr'])
        rnd_idx = np.arange(1, 4)
        np.random.shuffle(rnd_idx)
        ser_ord = [ser_ord[0]] + [ser_ord[i] for i in rnd_idx]
        shuffle_info = id_str[rnd_idx]
    elif len(ser_ord) > len(ser_desc):
        raise ValueError('{}: expected at most {} series, found {}'.format(
            dirpath_study, len(ser_desc), len(ser_ord)))

    if not uid_ref:
        study_uid = generate_uuid()
    else:
        ref_files = [
            fp for fp in glob('{}/**/*.dcm'.format(uid_ref), recursive=True)
            if case_num in fp
        ]
        if not ref_files:
            raise IndexError('no reference DICOM for {} under {}'.format(case_num, uid_ref))
        # Header-only read: just the StudyInstanceUID is needed.
        study_uid = pydicom.dcmread(ref_files[0], stop_before_pixels=True).StudyInstanceUID

    for idx, (ser, _) in enumerate(ser_ord):
        dir_name = '{:02d}_{}'.format(idx + 1, ser_desc[idx].replace(' ', '_'))
        new_path = os.path.join(dirpath_study, dir_name)

        shutil.move(ser, new_path)

        # Source and target names live in the same directory. Writing straight
        # to '<InstanceNumber>.dcm' could overwrite a file that has not been
        # read yet, or be deleted again right after writing when a file already
        # carries its target name. Stage under a temporary name first.
        staged = []
        for fp in glob('{}/*.dcm'.format(new_path)):
            dcm_ds = pydicom.dcmread(fp)
            inst_num = int(dcm_ds.InstanceNumber)
            new_fname = os.path.join(new_path, '{:04d}.dcm'.format(inst_num))

            _set_anon_tags(dcm_ds, case_num, study_uid, idx, ser_desc[idx])

            tmp_fname = fp + '.anon'
            pydicom.dcmwrite(tmp_fname, dcm_ds)
            os.remove(fp)
            staged.append((tmp_fname, new_fname))

        for tmp_fname, new_fname in staged:
            os.replace(tmp_fname, new_fname)
    return shuffle_info

<Figure size 864x720 with 0 Axes>

### Copy cases from source

In [None]:
src_path = '/home/srivathsa/projects/studies/gad/stanford/data'
dest_path = '/home/srivathsa/projects/studies/gad/boost/0_cases'

# Numeric suffixes of the study cases, expanded below into 'Patient_NNNN' IDs.
_case_numbers = (
    86, 131, 133, 167, 171, 173, 184,
    194, 199, 201, 206, 214, 215, 224,
    247, 256, 267, 269, 271, 274, 275,
    280, 289, 294, 298, 303, 304, 320,
    323, 329, 336, 341, 350, 352, 353,
    360, 362, 363, 378, 385, 408, 409,
    412, 417, 419, 429, 441, 443, 462,
    463, 468, 474, 477, 479, 481, 492,
    526, 532, 561, 563, 568,
)
cases = ['Patient_{:04d}'.format(num) for num in _case_numbers]

# The copy step below is disabled and kept for reference only. As written it
# would, for each case, pick the series folders whose name contains `kw`,
# order them by SeriesNumber and copy the first and the last one to dest_path.

# kw = 'MPRAGE'

# for cnum in tqdm(cases, total=len(cases)):
#     try:
#         dcm_dirs = [d for d in glob('{}/{}/*'.format(src_path, cnum)) if kw.lower() in d.lower()]
#         dcm_dirs = sorted(dcm_dirs, key=lambda f: int(get_dcm_metadata(f).SeriesNumber))

#         shutil.copytree(dcm_dirs[0], os.path.join(dest_path, cnum, dcm_dirs[0].split('/')[-1]))
#         shutil.copytree(dcm_dirs[-1], os.path.join(dest_path, cnum, dcm_dirs[-1].split('/')[-1]))
#     except Exception as exc:
#         print('Error in {}:{}'.format(cnum, exc))

### Rename series with DNE suffix for SMR processing

In [None]:
# Tag the SeriesDescription of every DICOM below dest_path with 'DNE'.
dcm_files = glob('{}/**/*.dcm'.format(dest_path), recursive=True)
for fpath_dcm in tqdm(dcm_files):
    rename_dcm(fpath_dcm, 'DNE')

### Group SubtleMR output into cases

In [None]:
smr_output = '/home/srivathsa/projects/studies/gad/boost/3_cases_mr_gad_mr/dicoms'
dcm_dest = '/home/srivathsa/projects/studies/gad/boost/3_cases_mr_gad_mr'

# StudyInstanceUID -> case ID, taken from one source DICOM per case. It is
# used afterwards to assign each SubtleMR output series back to its case.
study_uid_map = {}

for cnum in tqdm(cases, total=len(cases)):
    case_dcms = glob('{}/{}/**/*.dcm'.format(src_path, cnum), recursive=True)
    if not case_dcms:
        raise IndexError('no .dcm files found for case {} under {}'.format(cnum, src_path))
    fpath_dcm = case_dcms[0]
    # Only the header is needed; skip decoding the pixel data.
    study_uid = pydicom.dcmread(fpath_dcm, stop_before_pixels=True).StudyInstanceUID
    study_uid_map[study_uid] = cnum

In [None]:
# Presumably SubtleMR writes its output with SeriesNumber = source + 1000;
# the offset is removed again for the folder name. TODO confirm.
SMR_SERIES_OFFSET = 1000

dcm_outs = sorted(glob('{}/*'.format(smr_output)))
for dpath_dcm_ser in tqdm(dcm_outs, total=len(dcm_outs)):
    fpath_dcm = glob('{}/*.dcm'.format(dpath_dcm_ser))[0]
    # Only header fields are used below; skip decoding the pixel data.
    dcm = pydicom.dcmread(fpath_dcm, stop_before_pixels=True)
    ser_desc = dcm.SeriesDescription.replace(' DNE SMR', ' GAD+SMR').replace(' ', '_')
    study_uid = dcm.StudyInstanceUID
    ser_num = int(dcm.SeriesNumber) - SMR_SERIES_OFFSET
    ser_desc = '{}_{}'.format(ser_num, ser_desc)
    if study_uid not in study_uid_map:
        # Same exception type as the bare dict lookup, but says which folder failed.
        raise KeyError('StudyInstanceUID {} of {} does not belong to any listed case'.format(
            study_uid, dpath_dcm_ser))
    cnum = study_uid_map[study_uid]

    new_path = '{}/{}/{}'.format(dcm_dest, cnum, ser_desc)

    shutil.copytree(dpath_dcm_ser, new_path)

### Clean up series descriptions of SubtleGad-processed cases

In [None]:
dpath_gad_proc = '/home/srivathsa/projects/studies/gad/boost/2_cases_mr_gad'

# Strip the processing markers from every DICOM below dpath_gad_proc.
dcm_files = glob('{}/**/*.dcm'.format(dpath_gad_proc), recursive=True)
for fpath_dcm in tqdm(dcm_files):
    cleanup_series_desc(fpath_dcm)

### Group input and output series

In [None]:
dp_input = '/home/srivathsa/projects/studies/gad/boost/0_cases'
dp_output = '/home/srivathsa/projects/studies/gad/boost/3_cases_mr_gad_mr'
dp_dest = '/home/srivathsa/projects/studies/gad/boost/final_cases'

for cnum in tqdm(cases):
    # The whole input case folder seeds the final case folder ...
    case_src = '{}/{}'.format(dp_input, cnum)
    case_dst = '{}/{}'.format(dp_dest, cnum)
    shutil.copytree(case_src, case_dst)

    # ... then the first series folder found in the output tree is added to it.
    dp_ser = glob('{}/{}/*'.format(dp_output, cnum))[0]
    shutil.copytree(dp_ser, '{}/{}/{}'.format(dp_dest, cnum, os.path.basename(dp_ser)))

In [None]:
dp_gad = '/home/srivathsa/projects/studies/gad/boost/0_cases/7acf03_386e1d'
for cnum in tqdm(cases):
    # The pattern has no wildcard: glob yields the folder if it exists, and
    # the [0] lookup raises IndexError when it does not.
    gad_matches = glob('{}/{}/{}_SubtleGad'.format(dp_gad, cnum, cnum))
    dp_ser = gad_matches[0]
    ser_desc = '1000_AX_BRAVO_+C_GAD'
    shutil.copytree(dp_ser, '{}/{}/{}'.format(dp_dest, cnum, ser_desc))

### Randomize cases

In [None]:
# Same case list as in the "Copy cases from source" section, restated here so
# that `cases` starts from the plain patient IDs before any randomization.
cases = ['Patient_{}'.format(num) for num in '''
    0086 0131 0133 0167 0171 0173 0184
    0194 0199 0201 0206 0214 0215 0224
    0247 0256 0267 0269 0271 0274 0275
    0280 0289 0294 0298 0303 0304 0320
    0323 0329 0336 0341 0350 0352 0353
    0360 0362 0363 0378 0385 0408 0409
    0412 0417 0419 0429 0441 0443 0462
    0463 0468 0474 0477 0479 0481 0492
    0526 0532 0561 0563 0568
'''.split()]

# Disabled first version of the randomization, kept for reference: it would
# add a '<id>_boost' twin for every case, shuffle, and number them Case0001...

# cases.extend(['{}_boost'.format(c) for c in cases])
# np.random.shuffle(cases)

# case_randomization_map = {}

# for idx, cnum in enumerate(cases):
#     rcnum = 'Case{:04d}'.format(idx+1)
#     case_randomization_map[rcnum] = cnum

In [143]:
case_mapping = pd.read_csv('/home/srivathsa/projects/studies/gad/boost/case_mapping.csv').to_dict(orient='records')
dp_dest = '/home/srivathsa/projects/studies/gad/boost/anon_cases'
dp_src = '/home/srivathsa/projects/studies/gad/boost/final_cases'

for drow in tqdm(case_mapping):
    case_dir = os.path.join(dp_dest, drow['Case ID'])
    # exist_ok=False: refuse to write into a case folder from an earlier run.
    os.makedirs(case_dir, exist_ok=False)
    orig_id = drow['Original ID']
    ref_id = orig_id.replace('_boost', '')

    snums = get_dcm_ser_nums(os.path.join(dp_src, ref_id))

    # The lowest-numbered series is always included.
    first_dir = snums[0][0]
    shutil.copytree(first_dir, os.path.join(case_dir, os.path.basename(first_dir)))

    # '_boost' entries get the highest-numbered series, all others the second one.
    second_dir = snums[-1][0] if '_boost' in orig_id else snums[1][0]
    shutil.copytree(second_dir, os.path.join(case_dir, os.path.basename(second_dir)))

  0%|          | 0/100 [00:00<?, ?it/s]

### Anonymize Studies

In [144]:
# Anonymize every CaseNNNN folder under dp_dest, in name order.
dpath_cases = sorted(glob('{}/Case*'.format(dp_dest)))

for dpath in tqdm(dpath_cases):
    anonymize_study(dpath)

  0%|          | 0/100 [00:00<?, ?it/s]



### Process flagged cases

In [137]:
df_flag = pd.read_csv('/home/srivathsa/projects/studies/gad/boost/case_flagged.csv')

# Keep only the rows that were not flagged for removal.
keep_mask = df_flag['Flagged for removal'] == 0
dict_cases = df_flag[keep_mask].to_dict(orient='records')
case_ids = sorted(r['Answer key'] for r in dict_cases)

# Split by the '_boost' marker in the answer key.
soc_set = [c for c in case_ids if '_boost' not in c]
boost_set = [c for c in case_ids if '_boost' in c]

# Draw 10 repeat reads per arm; a boost repeat is never drawn for a patient
# whose non-boost case is already among the repeats.
rpt_set_soc = np.random.choice(soc_set, size=10, replace=False)
boost_candidates = [c for c in boost_set if c.replace('_boost', '') not in rpt_set_soc]
rpt_set_boost = np.random.choice(boost_candidates, size=10, replace=False)

for rpt_set in (rpt_set_soc, rpt_set_boost):
    case_ids.extend(rpt_set)

In [138]:
wsize = 3
case_ids = np.array(case_ids)
repeat_occurs = True

# Keep reshuffling until no case has its pair (same ID without '_boost') or
# its repeat inside the window of neighbouring positions around it.

while repeat_occurs:
    np.random.shuffle(case_ids)
    last_pos = len(case_ids) - 1
    repeats = 0
    for idx, cur_case in enumerate(case_ids):
        # Positions idx-wsize .. idx+wsize-1, clamped to the array bounds and
        # excluding idx itself. The missing idx+wsize neighbour is covered
        # when that later position looks back by wsize.
        window = np.clip(np.arange(idx - wsize, idx + wsize), 0, last_pos)
        idxs = [i for i in window if i != idx]
        vis_cases = [c.replace('_boost', '') for c in case_ids[idxs]]
        repeats += int(cur_case.replace('_boost', '') in vis_cases)
    repeat_occurs = repeats != 0

In [140]:
# Map sequential reader IDs (Case0001, Case0002, ...) onto the shuffled case IDs.
case_randomization_map = {
    'Case{:04d}'.format(pos): cnum
    for pos, cnum in enumerate(case_ids, start=1)
}

### Prepare side-by-side cases

In [5]:
case_mapping = pd.read_csv('/home/srivathsa/projects/studies/gad/boost/case_mapping.csv').to_dict(orient='records')
# Side-by-side reading uses one entry per patient: drop the '_boost' rows.
case_mapping = [r for r in case_mapping if '_boost' not in r['Original ID']]

dp_dest = '/home/srivathsa/projects/studies/gad/boost/anon_sidebyside'
dp_src = '/home/srivathsa/projects/studies/gad/boost/final_cases'

for drow in tqdm(case_mapping):
    case_dir = os.path.join(dp_dest, drow['Case ID'])
    # exist_ok=False: refuse to write into a case folder from an earlier run.
    os.makedirs(case_dir, exist_ok=False)
    orig_id = drow['Original ID']
    ref_id = orig_id.replace('_boost', '')

    # Copy every series of the patient, in SeriesNumber order.
    snums = get_dcm_ser_nums(os.path.join(dp_src, ref_id))
    for ser_dir, _ in snums:
        shutil.copytree(ser_dir, os.path.join(case_dir, os.path.basename(ser_dir)))

  0%|          | 0/50 [00:00<?, ?it/s]

In [6]:
dpath_cases = sorted(glob('{}/Case*'.format(dp_dest)))
shuffle_info = []

# Study UIDs are taken from the already anonymized single-read cases.
uid_ref_root = '/home/srivathsa/projects/studies/gad/boost/anon_cases'

for dpath in tqdm(dpath_cases):
    sinfo = anonymize_study(dpath, sidebyside=True, uid_ref=uid_ref_root)
    # Appended per case so the A/B/C order of finished cases survives a
    # failure later in the loop.
    shuffle_info.append((os.path.basename(dpath), sinfo))

  0%|          | 0/50 [00:00<?, ?it/s]

In [13]:
# One CSV-style line per case: reader case ID, original ID, then what was
# shown in positions A, B and C.
for cnum, sinfo in shuffle_info:
    matching_rows = [row for row in case_mapping if row['Case ID'] == cnum]
    or_row = matching_rows[0]
    fields = (cnum, or_row['Original ID'], ','.join(sinfo))
    print('{},{},{}'.format(*fields))

Case0001,Patient_0256,soc,gad,gad+smr
Case0003,Patient_0443,gad,soc,gad+smr
Case0004,Patient_0329,soc,gad+smr,gad
Case0005,Patient_0477,gad+smr,gad,soc
Case0007,Patient_0336,gad+smr,gad,soc
Case0008,Patient_0479,gad+smr,gad,soc
Case0011,Patient_0215,soc,gad,gad+smr
Case0012,Patient_0224,soc,gad+smr,gad
Case0013,Patient_0303,soc,gad+smr,gad
Case0017,Patient_0224,gad,soc,gad+smr
Case0018,Patient_0298,gad+smr,gad,soc
Case0021,Patient_0320,soc,gad+smr,gad
Case0023,Patient_0563,gad,gad+smr,soc
Case0027,Patient_0267,soc,gad,gad+smr
Case0028,Patient_0247,soc,gad+smr,gad
Case0029,Patient_0441,gad,gad+smr,soc
Case0030,Patient_0492,gad+smr,soc,gad
Case0031,Patient_0304,soc,gad,gad+smr
Case0033,Patient_0352,gad+smr,gad,soc
Case0037,Patient_0336,soc,gad,gad+smr
Case0038,Patient_0412,gad,gad+smr,soc
Case0039,Patient_0408,gad,soc,gad+smr
Case0041,Patient_0341,soc,gad+smr,gad
Case0043,Patient_0280,soc,gad,gad+smr
Case0044,Patient_0378,gad,soc,gad+smr
Case0045,Patient_0267,gad,soc,gad+smr
Case0046,Pat

In [18]:
def move_cases_into_batches(bpath, n_batches=5):
    """Distribute the ``Case*`` folders under ``bpath`` into ``Batch<k>`` folders.

    Cases are taken in name order and split into ``n_batches`` consecutive
    chunks; chunk ``k`` (1-based) is moved into ``<bpath>/Batch<k>``.

    The batch folder is created before anything is moved into it. Without
    that, ``shutil.move`` renames the first case of each chunk to ``Batch<k>``
    instead of placing it inside. ``np.array_split`` is used so a case count
    that is not a multiple of ``n_batches`` gives near-equal chunks rather
    than an error.

    Returns:
        ``(case_list, case_chunks)``: the sorted case names and the list of
        numpy arrays they were split into.
    """
    case_list = sorted(os.path.basename(c) for c in glob('{}/Case*'.format(bpath)))
    case_chunks = np.array_split(np.array(case_list), n_batches)

    for chunk_num, chunk_list in enumerate(case_chunks):
        if len(chunk_list) == 0:
            # Nothing to move; do not leave empty batch folders behind.
            continue
        batch_dir = '{}/Batch{}'.format(bpath, chunk_num + 1)
        os.makedirs(batch_dir, exist_ok=True)
        for cnum in chunk_list:
            shutil.move('{}/{}'.format(bpath, cnum), batch_dir)
    return case_list, case_chunks


bpath = '/home/srivathsa/projects/studies/gad/boost/anon_sidebyside'
case_list, case_chunks = move_cases_into_batches(bpath)