In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.cloud import storage

## Reading/Writing to Bucket

In [3]:
project_id = "final-307422"
!gsutil ls -p $project_id

gs://cbis-ddsm-png/
gs://cbis-ddsm-test/
gs://cbis-ddsm-train/
gs://cbis_ddsm_raw/


In [4]:
# Storage client bound to the project; every bucket lookup below goes through it.
client = storage.Client(project=project_id)

In [20]:
# Which split to process; `mode` selects the bucket here and is reused in later cells
# to build the metadata and data paths.
mode = 'train'
bucket_name = f'cbis-ddsm-{mode}'
bucket = client.get_bucket(bucket_name)

# Echo a few bucket attributes as a quick sanity check that the right bucket was opened.
print(f"Bucket name: {bucket.name}")
print(f"Bucket location: {bucket.location}")
print(f"Bucket storage class: {bucket.storage_class}")

Bucket name: cbis-ddsm-train
Bucket location: US
Bucket storage class: STANDARD


##### Listing files in our bucket

In [21]:
# Walk every object in the bucket and collect the object names into `filepaths`.
blobs = bucket.list_blobs()
print(f"Blobs in {bucket.name}:")
filepaths = []
for item in blobs:
    # `item` is a plain loop variable, so it stays bound to the last blob after the loop.
    filepaths.append(item.name)
len(filepaths)

Blobs in cbis-ddsm-train:


## Map the downloaded image paths to the image paths specified in the label tables

In [None]:
# Build a table of the bucket's file paths, split into their path components, plus a
# `for_join` key that can be matched against the paths listed in the label CSVs.
fdf = pd.Series(filepaths, name='fullpath').str.strip().to_frame()

# Split each path on '/' into one column per component. The column assignment raises if
# the paths do not split into exactly four columns, which surfaces unexpected layouts early.
fdf_parsed = fdf['fullpath'].str.split('/', expand=True)
fdf_parsed.columns = ['patient_id', 'folder_1', 'folder_2', 'image_name']
fdf = fdf.join(fdf_parsed)

# Only the last five characters of each folder name take part in the join key.
fdf['folder_1_last5'] = fdf['folder_1'].str[-5:]
fdf['folder_2_last5'] = fdf['folder_2'].str[-5:]

# Translate the downloaded file names (1-1.dcm / 1-2.dcm) to the 000000.dcm / 000001.dcm
# names used when building the key. regex=False is explicit because the default differs
# between pandas versions, and as a regex the '.' would match any character.
fdf['new_image_name'] = (
    fdf['image_name']
    .str.replace('1-1.dcm', '000000.dcm', regex=False)
    .str.replace('1-2.dcm', '000001.dcm', regex=False)
)

fdf['for_join'] = (
    fdf['patient_id'] + '/' + fdf['folder_1_last5'] + '/'
    + fdf['folder_2_last5'] + '/' + fdf['new_image_name']
)

fdf.shape

Read file paths from label csvs

In [25]:
def _parse_label_paths(label_df, path_column):
    """Split one path column of the label table into its four path components.

    Parameters
    ----------
    label_df : pd.DataFrame
        The concatenated label table.
    path_column : str
        Name of the column holding '/'-separated file paths.

    Returns
    -------
    pd.DataFrame
        Indexed like ``label_df`` with columns ``patient_id``, ``folder_1``,
        ``folder_2``, ``image_name`` and ``old_image_filepath`` (the
        whitespace-stripped original path).

    Raises
    ------
    ValueError
        If the paths do not split into exactly four columns.
    """
    stripped = label_df[path_column].str.strip()
    parsed = stripped.str.split('/', expand=True)
    parsed.columns = ['patient_id', 'folder_1', 'folder_2', 'image_name']
    parsed['old_image_filepath'] = stripped
    return parsed


# Stack the calcification and mass label tables for the current split.
labels = pd.concat(
    [pd.read_csv(f'metadata/calc_case_description_{mode}_set.csv'),
     pd.read_csv(f'metadata/mass_case_description_{mode}_set.csv')],
    ignore_index=True,
)

# The label table references three files per row; parse each path column the same way.
old_img_filepath_parsed = _parse_label_paths(labels, 'image file path')
old_crop_filepath_parsed = _parse_label_paths(labels, 'cropped image file path')
old_roi_filepath_parsed = _parse_label_paths(labels, 'ROI mask file path')

old_parsed = pd.concat(
    [old_img_filepath_parsed, old_crop_filepath_parsed, old_roi_filepath_parsed],
    ignore_index=True,
)

# Build the same join key as for the bucket paths: patient id, the last five characters
# of each folder name, and the image name.
old_parsed['folder_1_last5'] = old_parsed['folder_1'].str[-5:]
old_parsed['folder_2_last5'] = old_parsed['folder_2'].str[-5:]

old_parsed['for_join'] = (
    old_parsed['patient_id'] + '/' + old_parsed['folder_1_last5'] + '/'
    + old_parsed['folder_2_last5'] + '/' + old_parsed['image_name']
)

old_parsed.shape

Join the two tables on the `for_join` column to get the path mapping

In [27]:
# Inner-join the bucket paths with the label-table paths on the shared key, so each
# matched row pairs a bucket path (`fullpath`) with a label-table path (`old_image_filepath`).
train_mappings = pd.merge(
    left=fdf,
    right=old_parsed[['for_join', 'old_image_filepath']],
    how='inner',
    on='for_join',
)

# Save the mapping next to the label CSVs, then show its shape.
train_mappings.to_csv(f'metadata/{mode}_set_path_mapping.csv', index=False)
train_mappings.shape

(8592, 10)

Check that each directory contains at most 2 images

In [28]:
# Count the files under each (patient_id, folder_1, folder_2) directory and confirm
# that no directory holds more than two.
files_per_dir = fdf.groupby(['patient_id', 'folder_1', 'folder_2']).size()
files_per_dir.le(2).all()

True

Copy the PNGs to the old file paths

In [29]:
from pathlib import Path
import shutil

In [31]:
# Root of the locally downloaded PNGs for the current split.
DATA_DIR = f'/home/jupyter/data_png/cbis-ddsm-{mode}/cbis-ddsm-{mode}/'

# Local PNG path expected for every row of the mapping table.
src_paths = train_mappings.fullpath.apply(lambda x: DATA_DIR + x.replace('.dcm','.png'))

# Build a source-path -> label-table-path lookup once, instead of scanning the whole
# mapping table for every file. setdefault keeps the FIRST mapping row for a source path.
dest_by_src = {}
for src, old_path in zip(src_paths, train_mappings['old_image_filepath']):
    dest_by_src.setdefault(src, old_path)

for p in Path(DATA_DIR).glob('**/*.png'):
    if p.is_dir():
        continue

    old_path = dest_by_src.get(str(p))
    if old_path is None:
        # No label-table entry for this file: report it and move on.
        print(p)
        continue

    path_to_mv_to = Path('/home/jupyter/data_png_reorganized/' + old_path.replace('.dcm', '.png'))
    path_to_mv_to.parent.mkdir(parents=True, exist_ok=True)
    # Remove whatever a previous run left at the destination before copying.
    # NOTE(review): presumably this also avoids writing through a leftover symlink - confirm.
    if path_to_mv_to.exists():
        path_to_mv_to.unlink()
    # The source file is copied, not moved, so DATA_DIR is left untouched.
    shutil.copyfile(str(p), str(path_to_mv_to))

Download to file

In [12]:
# Download a single blob to a local file as a smoke test.
# Take the last listed path explicitly instead of reading the `item` loop variable left
# over from the listing loop: it is the same value, without the hidden dependency.
blob_name = filepaths[-1]
blob = bucket.get_blob(blob_name)
if blob is None:
    # get_blob returns None for a missing object; fail with a clear message
    # rather than an AttributeError on the download call.
    raise FileNotFoundError(f'Blob {blob_name!r} not found in bucket {bucket.name!r}')
output_file_name = 'data/test_downloald.dcm'
blob.download_to_filename(output_file_name)

In [13]:
# Open the downloaded file for binary reading. The handle is not closed in this cell.
output_file = open(output_file_name,'rb')