In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from pathlib import Path
import shutil
import os
from PIL import Image

Note: The ROI mask and the image file are stored under different folder names; when generating large patches, we save them under the ROI mask folder names.

In [11]:
def _first_roi_mask_path(MANIFEST_DF, path_column, fold1, fold2, abnorm_mask):
    """Return the ROI mask path of the first manifest row matching both folder names.

    A row matches when its ``path_column`` value contains ``fold1`` and ``fold2``
    as substrings and ``abnorm_mask`` is True for that row. Returns ``None`` when
    no row matches.
    """
    paths = MANIFEST_DF[path_column].str
    # regex=False: folder names are matched literally (dots must not act as
    # wildcards); na=False: a missing path can never match.
    hits = (
        paths.contains(fold1, regex=False, na=False)
        & paths.contains(fold2, regex=False, na=False)
        & abnorm_mask
    )
    if not hits.any():
        return None
    return MANIFEST_DF.loc[hits, "ROI mask file path"].iloc[0]


def process_one_patch_path(patch_name,MANIFEST_DF):
    """Translate a small-patch file name into the matching large-patch file name.

    Parameters
    ----------
    patch_name : pathlib.Path
        File name whose stem has the form ``<fold1>-<fold2>-abnorm_<id>``.
    MANIFEST_DF : pandas.DataFrame
        Manifest with the columns ``"image file path"``, ``"ROI mask file path"``
        and ``"abnormality id"``.

    Returns
    -------
    str or float
        ``"<roi_fold1>-<roi_fold2>-abnorm_<id><suffix>"``, where the two folder
        names are components 1 and 2 of the matching row's ROI mask path.
        ``np.nan`` when the name cannot be parsed or no manifest row matches.
    """
    parts = patch_name.stem.split('-')
    if len(parts) != 3:
        # Not a "<fold1>-<fold2>-abnorm_<id>" name: report "no match" instead of
        # raising, so one odd file does not abort a whole mapping run.
        return np.nan
    fold1, fold2, abnorm_id = parts

    try:
        abnorm_number = int(abnorm_id.replace("abnorm_",""))
    except ValueError:
        return np.nan
    abnorm_mask = MANIFEST_DF["abnormality id"] == abnorm_number

    # The folder names may come from either the image path or the ROI mask path;
    # try the image path first, then fall back to the ROI mask path. The second
    # column is only scanned when the first one yields nothing.
    for path_column in ("image file path", "ROI mask file path"):
        roi_path = _first_roi_mask_path(MANIFEST_DF, path_column, fold1, fold2, abnorm_mask)
        if roi_path is not None:
            name_components = roi_path.split('/')
            return '-'.join([name_components[1],name_components[2],abnorm_id]) + patch_name.suffix
    return np.nan

def create_mapping(valid_mm,MANIFEST_DF):
    """Build the small-patch -> large-patch file name mapping.

    Parameters
    ----------
    valid_mm : iterable of pathlib.Path
        Paths of the small context patches.
    MANIFEST_DF : pandas.DataFrame
        Manifest forwarded unchanged to ``process_one_patch_path``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``small_patches_name`` (the patch file name without its
        last ``-``-separated token, de-duplicated) and ``large_patches_name``
        (whatever ``process_one_patch_path`` returns for that name).
    """
    # Only the file name is kept, not the directory; everything after the last
    # '-' in the stem (the per-patch suffix) is dropped.
    trimmed_names = [Path(p.stem.rpartition('-')[0] + p.suffix) for p in valid_mm]
    unique_small = pd.Series(trimmed_names, name="small_patches_name").drop_duplicates()

    mapping = unique_small.to_frame()
    mapping["large_patches_name"] = mapping["small_patches_name"].apply(
        lambda small_name: process_one_patch_path(small_name, MANIFEST_DF))
    return mapping

Do for Mass Patches

In [7]:
# Move the large MALIGNANT_MASS patches whose small-patch counterparts are in
# the validation split from the large-patch train folder to its valid folder.
MANIFEST_DF = pd.read_csv('metadata/mass_case_description_train_set.csv')
label_class = "MALIGNANT_MASS"

destination_dir = Path('../cbis-ddsm-large-patch/valid')
src_dir = Path('../cbis-ddsm-large-patch/train')

# Small patches of this class that already sit in the validation split.
valid_mm = list(Path(f'../../cbis-ddsm-patches/valid/{label_class}').glob('*'))
valid_mm_s = create_mapping(valid_mm,MANIFEST_DF)

for large_name in valid_mm_s["large_patches_name"]:
    # NaN marks a small patch with no manifest match; nothing to move.
    if pd.isnull(large_name):
        continue
    src = src_dir / label_class / large_name
    if src.exists():
        destination = destination_dir / label_class / large_name
        if not destination.parent.exists():
            destination.parent.mkdir(parents=True)
        print(destination)
        # Path.replace renames the file, i.e. moves it out of the train folder.
        src.replace(destination)



Do for Calcification Patches

In [9]:
# Same procedure as for the mass patches, using the calcification manifest:
# large patches matching the validation small patches leave train for valid.
MANIFEST_DF = pd.read_csv('metadata/calc_case_description_train_set.csv')
label_class = "MALIGNANT_CALCIFICATION"

destination_dir = Path('../cbis-ddsm-large-patch/valid')
src_dir = Path('../cbis-ddsm-large-patch/train')

# Small patches of this class that already sit in the validation split.
valid_mm = list(Path(f'../../cbis-ddsm-patches/valid/{label_class}').glob('*'))
valid_mm_s = create_mapping(valid_mm,MANIFEST_DF)

for large_name in valid_mm_s["large_patches_name"]:
    # NaN marks a small patch with no manifest match; nothing to move.
    if pd.isnull(large_name):
        continue
    src = src_dir / label_class / large_name
    if src.exists():
        destination = destination_dir / label_class / large_name
        if not destination.parent.exists():
            destination.parent.mkdir(parents=True)
        print(destination)
        # Path.replace renames the file, i.e. moves it out of the train folder.
        src.replace(destination)



In [15]:
# Root folder holding the patch datasets. It defaults to the original local
# checkout; set the CBIS_DDSM_ROOT environment variable to run on another machine.
course_dir = Path(os.environ.get('CBIS_DDSM_ROOT', '/Users/Ryan/HarvardCodes/MIT6862/'))
src_dir = course_dir / 'cbis-ddsm-large-patch/train'
destination_dir = course_dir / 'cbis-ddsm-large-patch/valid'
label_class = "MALIGNANT_MASS"

# Small context patches of this class that are in the validation split.
valid_mm = list((course_dir / f'cbis-ddsm-patches/valid/{label_class}').glob('*'))
# Keep only the file name (not the path), drop the trailing '-'-separated patch
# token from the stem, and de-duplicate.
valid_mm_s = (
    pd.Series(valid_mm)
    .apply(lambda x: Path('-'.join(x.stem.split('-')[:-1]) + x.suffix))
    .rename("small_patches_name")
    .drop_duplicates()
    .to_frame()
)

# Here the large patch is looked up under the same (trimmed) file name as the
# small patch; names with no file in the train folder are skipped.
for fname_to_move_to_valid in valid_mm_s["small_patches_name"]:
    src = src_dir/label_class/fname_to_move_to_valid
    if not src.exists():
        continue
    destination = destination_dir/label_class/fname_to_move_to_valid
    # exist_ok=True replaces the separate exists() check before mkdir.
    destination.parent.mkdir(parents=True, exist_ok=True)
    print(destination)
    # Path.replace renames the file, i.e. moves it from train to valid.
    src.replace(destination)

/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch/valid/MALIGNANT_MASS/1.3.6.1.4.1.9590.100.1.2.338795663112734693134356811890051999302-1.3.6.1.4.1.9590.100.1.2.245398881011643929625650254320825052239-abnorm_1.png
/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch/valid/MALIGNANT_MASS/1.3.6.1.4.1.9590.100.1.2.155477128311691748030352170780111788170-1.3.6.1.4.1.9590.100.1.2.293728369211975468236490743672836489355-abnorm_1.png
/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch/valid/MALIGNANT_MASS/1.3.6.1.4.1.9590.100.1.2.193352127112436491420177929451789935595-1.3.6.1.4.1.9590.100.1.2.168263165711078144037408664653348842349-abnorm_1.png
/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch/valid/MALIGNANT_MASS/1.3.6.1.4.1.9590.100.1.2.73761162312191593237702232411750515552-1.3.6.1.4.1.9590.100.1.2.60156917810914737525620819822954849568-abnorm_1.png
/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch/valid/MALIGNANT_MASS/1.3.6.1.4.1.9590.100.1.2.2014272624116298602066955

In [19]:
# Root folder holding the patch datasets. It defaults to the original local
# checkout; set the CBIS_DDSM_ROOT environment variable to run on another machine.
course_dir = Path(os.environ.get('CBIS_DDSM_ROOT', '/Users/Ryan/HarvardCodes/MIT6862/'))
src_dir = course_dir / 'cbis-ddsm-large-patch-contrast/train'
destination_dir = course_dir / 'cbis-ddsm-large-patch-contrast/valid'
label_classes = ["MALIGNANT_MASS","MALIGNANT_CALCIFICATION","BENIGN_MASS","BENIGN_CALCIFICATION"]

for label_class in label_classes:
    # Small context patches of this class that are in the validation split.
    valid_mm = list((course_dir / f'cbis-ddsm-patches/valid/{label_class}').glob('*'))
    # Keep only the file name (not the path), drop the trailing '-'-separated
    # patch token from the stem, and de-duplicate.
    valid_mm_s = (
        pd.Series(valid_mm)
        .apply(lambda x: Path('-'.join(x.stem.split('-')[:-1]) + x.suffix))
        .rename("small_patches_name")
        .drop_duplicates()
        .to_frame()
    )

    # The contrast patch is looked up under the same (trimmed) file name as the
    # small patch; names with no file in the train folder are skipped.
    for fname_to_move_to_valid in valid_mm_s["small_patches_name"]:
        src = src_dir/label_class/fname_to_move_to_valid
        if not src.exists():
            continue
        destination = destination_dir/label_class/fname_to_move_to_valid
        # exist_ok=True replaces the separate exists() check before mkdir.
        destination.parent.mkdir(parents=True, exist_ok=True)
        print(destination)
        # Path.replace renames the file, i.e. moves it from train to valid.
        src.replace(destination)

/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch-contrast/valid/MALIGNANT_MASS/1.3.6.1.4.1.9590.100.1.2.338795663112734693134356811890051999302-1.3.6.1.4.1.9590.100.1.2.245398881011643929625650254320825052239-abnorm_1.png
/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch-contrast/valid/MALIGNANT_MASS/1.3.6.1.4.1.9590.100.1.2.155477128311691748030352170780111788170-1.3.6.1.4.1.9590.100.1.2.293728369211975468236490743672836489355-abnorm_1.png
/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch-contrast/valid/MALIGNANT_MASS/1.3.6.1.4.1.9590.100.1.2.193352127112436491420177929451789935595-1.3.6.1.4.1.9590.100.1.2.168263165711078144037408664653348842349-abnorm_1.png
/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch-contrast/valid/MALIGNANT_MASS/1.3.6.1.4.1.9590.100.1.2.73761162312191593237702232411750515552-1.3.6.1.4.1.9590.100.1.2.60156917810914737525620819822954849568-abnorm_1.png
/Users/Ryan/HarvardCodes/MIT6862/cbis-ddsm-large-patch-contrast/valid/MALIGNANT_MASS/1.3.6