In [1]:
import os 
import pandas as pd 
import pydicom as dicom
import matplotlib.pyplot as plt

In [4]:
def curated_data():
    
    """
    This function retrieves the path of all masks (3568 masks) in the CBIS-DDSM.
    You need to import the original csv files from 
    "https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=22516629".
    This  function outputs 4 csv files.Each csv file include the following columns:
    img id. Corresponds to the folder name where the mask is located. 
    img_path: Image path on your local machine.
    label: Image pathology (Benign or Malignant). BENIGN-WHITOUT-CALLBACK is replaced to Benign. 
    
    """
    
    img_root = '/home/sposso22/Desktop/Breast_Cancer/CBIS-DDSM'
    csv_root = '/home/sposso22/Desktop/DATA_REVIEW/original_csv_files'
    csv_output_path = '/home/sposso22/Desktop/DATA_REVIEW/final_MASK_csv_files'
    folders = os.listdir(img_root)
    cases_dict = dict()  # save image id and path
    count =0
    for f in folders:
        if f[-1].isdigit():
            path =list()#get mask 
            for roots, dirs, files in os.walk(img_root + "/"+f):
                if len(dirs)>1: #cropped images and masks are stored in a different directory .
                    for i in dirs:
                        for root,dirs,file in os.walk(img_root + "/"+f+"/"+i):
                            for final in dirs:
                                if final.startswith("1.000000-ROI"):

                                    path.append(final)
                                    path.append('1-1.dcm')

                                    img_path = root + '/' + '/'.join(path)  # generate image path

                                    cases_dict[f] = img_path


                else: #cropped images and masks are stored in a single directory . 

                    for d in dirs:
                        path.append(d)                    
                    for filenames in files:
                        if filenames.endswith('2.dcm'): #There is no consistent mask naming convention. 
                            path.append(filenames)

                            img_path = img_root + '/' + f + '/' + '/'.join(path)  # generate image path

                            img = dicom.dcmread(img_path).pixel_array
                            if img.dtype == 'uint16':

                                img_path = img_path[:-5]+"1.dcm" #There is no consistent mask naming convention
                                cases_dict[f] = img_path

                            else:
                                cases_dict[f] = img_path
                            
    df = pd.DataFrame(list(cases_dict.items()), columns=['img', 'roi_path']) #dictionary is converted to dataframe
    #original csv files 
    type_dict = {'Calc-Test': 'calc_case_description_test_set.csv',
                 'Calc-Training': 'calc_case_description_train_set.csv',
                 'Mass-Test': 'mass_case_description_test_set.csv',
                 'Mass-Training': 'mass_case_description_train_set.csv'}


    
    for t in type_dict.keys():  # handle images based on the type
            df_subset = df[df['img'].str.startswith(t)]

            df_csv = pd.read_csv(csv_root + '/' + type_dict[t],
                                 usecols=['pathology', 'ROI mask file path'])  # read original csv file
            df_csv['img'] = df_csv.apply(lambda row: row['ROI mask file path'].split('/')[0],
                                         axis=1)  # extract image id from the path
            df_csv['pathology'] = df_csv.apply(
                lambda row: 'BENIGN' if row['pathology'] == 'BENIGN_WITHOUT_CALLBACK' else row['pathology'],
                axis=1)  # replace pathology 'BENIGN_WITHOUT_CALLBACK' to 'BENIGN'

            df_subset_new = pd.merge(df_subset, df_csv, how='inner',
                                     on='img') # Merge image path and pathology in img id. 
            df_subset_new = df_subset_new.drop(columns=["ROI mask file path"]) #Drop original file path 

            df_subset_new.to_csv(csv_output_path + '/' + t.lower() + '.csv',
                                 index=False)  # output merged dataframe in csv format

            print(t)
            print('data_cnt: %d' % len(df_subset_new))
        
    print('Finished retrieval of masks!')
                            
                            
                            
                        
                                      
        

In [5]:
# Run the retrieval: writes one merged csv file per CBIS-DDSM subset and prints each subset's row count.
curated_data()

Calc-Test
data_cnt: 326
Calc-Training
data_cnt: 1546
Mass-Test
data_cnt: 378
Mass-Training
data_cnt: 1318
Finished retrieval of masks!
