In [1]:
!pip install pydicom

Collecting pydicom
  Downloading pydicom-2.3.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 5.1 MB/s 
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.3.0


In [2]:
import numpy as np
import os
import pydicom as pdm
import skimage as skm
import pandas as pd
from skimage.transform import resize

In [3]:
#mount drive
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir("/content/gdrive/MyDrive/ColabNotebooks/data")

Mounted at /content/gdrive


In [4]:
def resize_png(image, new_size):
  read_image = skm.io.imread(image)
  return resize(read_image, new_size, anti_aliasing=False)

def resize_dcm(image, new_size):
  read_image = pdm.dcmread(image, defer_size=False, force=True)
  pixels = read_image.pixel_array
  return resize(pixels, new_size, anti_aliasing=False) 


#def save_to_numpy(path, file):
#  np.save(path, file)

def patient_id(subject_id):
  if "Mass-Test_" in subject_id:
    start = len("Mass-Test_")
    return subject_id[start:start+7]
  else:
    start = len("Mass-Training_")
    return subject_id[start:start+7]

def left_or_right(subject_id):
  if "LEFT" in subject_id:
    return "LEFT"
  else:
    return "RIGHT"

def breast_density(patient_id, left_right, descriptors):
  patient = descriptors[descriptors["patient_id"] == patient_id]
  breast = patient[patient["left or right breast"] == left_right]
  density = breast["breast_density"].values[0] 
  return density

def numpyz_file(path, new_loc, bd_info):
  loaded_file = np.load(file=path + '.npy')
  subject_id = path[path.index("Mass-"):]
  id_patient = patient_id(subject_id)
  which_breast = left_or_right(subject_id)
  density = breast_density(id_patient, which_breast, bd_info)
  density_arr = np.array([density])
  np.savez(new_loc, arr_0=loaded_file, arr_1=density_arr)

def main_helper(test_csv, train_csv, meta_csv, new_size):
  test = pd.read_csv(test_csv)
  train = pd.read_csv(train_csv)
  meta = pd.read_csv(meta_csv)
  descriptors = pd.concat([test, train], axis=0) 

  f_locations = meta["File Location"].str[2:]
  patient_ids = meta["Subject ID"]
  
  for i in range(meta.shape[0]):

    # putting together file path 
    location = f_locations[i]
    subject_id = patient_ids[i]
    f_name = os.listdir(location)[0]
    fpath = os.path.join(location, f_name)

    # resize data
    if f_name.endswith("png"):
      resized = resize_png(fpath, new_size)
    else: 
      resized = resize_dcm(fpath, new_size)

    # save as numpy files
    np_name = os.path.join('numpy_files', subject_id)
    np.save(np_name, resized)

    # find breast density and combine to create numpyz file
    new_loc = os.path.join("numpyz_files", subject_id)
    numpyz_file(np_name, new_loc, descriptors)

    # print progress
    p_id = patient_id(subject_id)
    print(p_id + " Complete")

In [None]:
main_helper("mass_case_description_test_set.csv","mass_case_description_train_set.csv","metadata.csv",(229, 229))

P_01090 Complete
P_01090 Complete
P_01090 Complete
P_01090 Complete
P_01090 Complete
P_01090 Complete
P_01090 Complete
P_01090 Complete
P_01090 Complete
P_01101 Complete
P_01101 Complete
P_01108 Complete
P_01106 Complete
P_01106 Complete
P_01108 Complete
P_01108 Complete
P_01108 Complete
P_01110 Complete
P_01110 Complete
P_00058 Complete
P_00058 Complete
P_00059 Complete
P_01114 Complete
P_00059 Complete
P_00059 Complete
P_00059 Complete
P_00059 Complete
P_00061 Complete
P_00061 Complete
P_00059 Complete
P_00061 Complete
P_00061 Complete
P_00064 Complete
P_00064 Complete
P_00065 Complete
P_00065 Complete
P_00068 Complete
P_00065 Complete
P_00065 Complete
P_00068 Complete
P_00068 Complete
P_00068 Complete
P_00074 Complete
P_00074 Complete
P_00074 Complete
P_00074 Complete
P_00074 Complete
P_00074 Complete
P_00076 Complete
P_00076 Complete
P_00076 Complete
P_00076 Complete
P_00079 Complete
P_00079 Complete
P_00079 Complete
P_00079 Complete
P_00080 Complete
P_00080 Complete
P_00080 Comple

In [None]:
def densities_csv(fpath, new_loc):
  file_names = np.array([])
  densities = np.array([])

  for f in os.listdir(fpath):
    path = os.path.join(fpath, f)
    file_names = np.append(file_names, f)
    density = np.load(path)['arr_1'][0]
    if density == 1 or density == 2:
      density = 0
    else:
      density = 1
    densities = np.append(densities, density)



  results_tbl = pd.DataFrame({"Example": file_names, 
                              "Densities": densities})
  results_tbl.to_csv(new_loc, index=False)
  return results_tbl

In [None]:
densities_csv("numpyz_files", "densities.csv")

Unnamed: 0,Example,Densities
0,Mass-Test_P_01090_LEFT_CC_2.npz,0.0
1,Mass-Test_P_01090_LEFT_CC_3.npz,0.0
2,Mass-Test_P_01090_LEFT_MLO.npz,0.0
3,Mass-Test_P_01090_LEFT_MLO_2.npz,0.0
4,Mass-Test_P_01090_LEFT_MLO_1.npz,0.0
...,...,...
115,Mass-Training_P_00109_LEFT_CC_1.npz,1.0
116,Mass-Training_P_00109_LEFT_CC.npz,1.0
117,Mass-Training_P_00110_LEFT_CC_1.npz,1.0
118,Mass-Training_P_00110_LEFT_CC.npz,1.0
