In [1]:
import os 
import pandas as pd 
import pydicom as dicom
import matplotlib.pyplot as plt


In [2]:
#This function is based on the code developed by Adam Jaamour in 
#https://github.com/Adamouization/Breast-Cancer-Detection-Mammogram-Deep-Learning
def mammograms_retrieval(img_root: str = '/home/sposso22/Desktop/Breast_Cancer/CBIS-DDSM',
                         csv_root: str = '/home/sposso22/Desktop/DATA_REVIEW/original_csv_files',
                         csv_output_path: str = '/home/sposso22/Desktop/DATA_REVIEW/final_Full_mam_csv_files') -> None:
    """
    Retrieve the local path of every full mammogram in CBIS-DDSM and pair it with its pathology.

    The original case-description csv files must first be downloaded from
    "https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=22516629".

    One csv file is written per subset (calc-test, calc-training, mass-test,
    mass-training), i.e. 4 files in total (3103 mammograms for the full dataset).
    Each csv file contains the following columns:
    img: Image id. Corresponds to the name of the folder that holds the mammogram.
    img_path: Image path on your local machine.
    label (column 'pathology'): Image pathology (BENIGN or MALIGNANT).
        BENIGN_WITHOUT_CALLBACK is replaced by BENIGN.

    Parameters
    ----------
    img_root : str
        Folder that contains the CBIS-DDSM case folders (one folder per image).
    csv_root : str
        Folder that contains the four original ``*_case_description_*_set.csv`` files.
    csv_output_path : str
        Folder where the new csv files are saved. It is created if it does not exist.

    Returns
    -------
    None
        The results are written to disk; a short summary is printed per subset.

    Raises
    ------
    FileNotFoundError
        If ``img_root`` or one of the original csv files does not exist.
    """
    import warnings  # local import: keeps this cell runnable without touching the import cell

    cases_dict = dict()  # image id (folder name) -> path of the mammogram file

    # sorted() makes the row order of the output files reproducible across machines.
    for f in sorted(os.listdir(img_root)):
        # Full mammogram folders end with the view name; cropped/ROI folders
        # carry an extra "_<n>" suffix and are filtered out here.
        if not (f.endswith('_CC') or f.endswith('_MLO')):
            continue

        img_path = _find_image_file(os.path.join(img_root, f))
        if img_path is None:
            # An entry without a file would only fail later when the image is loaded.
            warnings.warn('No image file found for case folder %r; skipping it.' % f)
            continue
        cases_dict[f] = img_path

    df = pd.DataFrame(list(cases_dict.items()), columns=['img', 'img_path'])

    type_dict = {'Calc-Test': 'calc_case_description_test_set.csv',
                 'Calc-Training': 'calc_case_description_train_set.csv',
                 'Mass-Test': 'mass_case_description_test_set.csv',
                 'Mass-Training': 'mass_case_description_train_set.csv'}

    os.makedirs(csv_output_path, exist_ok=True)  # to_csv does not create missing folders

    for t, csv_name in type_dict.items():  # handle images based on the type
        df_subset = df[df['img'].str.startswith(t)]

        # read original csv file
        df_csv = pd.read_csv(os.path.join(csv_root, csv_name),
                             usecols=['pathology', 'image file path'])

        # The image id is the first component of the path stored in the csv file.
        df_csv['img'] = df_csv['image file path'].str.split('/').str[0]
        # replace pathology 'BENIGN_WITHOUT_CALLBACK' with 'BENIGN'
        df_csv['pathology'] = df_csv['pathology'].replace('BENIGN_WITHOUT_CALLBACK', 'BENIGN')

        # Remove duplicate mammograms (the original csv files list a mammogram
        # once per abnormality, each with its own mask).
        # NOTE(review): only the first row is kept, so a mammogram whose
        # abnormalities have different pathologies gets the first one listed.
        df_csv = df_csv.drop_duplicates(subset=["img"])

        # merge image paths and pathology on the image id
        df_subset_new = pd.merge(df_subset, df_csv, how='inner', on='img')
        df_subset_new = df_subset_new.drop(columns=["image file path"])

        # output merged dataframe in csv format
        df_subset_new.to_csv(os.path.join(csv_output_path, t.lower() + '.csv'),
                             index=False)

        print(t)
        print('data_cnt: %d' % len(df_subset_new))

    print('Finished retrieval of mammogram paths!')


def _find_image_file(case_dir):
    """Return the path of the image file stored below ``case_dir``, or None if there is none.

    The folder tree is walked in sorted order. The first file whose name ends
    with '.dcm' (case-insensitive) wins; if there is no such file, the first
    file of any kind is returned so that unusual file names still resolve.
    """
    fallback = None
    for root, dirs, files in os.walk(case_dir):
        dirs.sort()  # in-place sort makes os.walk descend in a deterministic order
        for filename in sorted(files):
            full_path = os.path.join(root, filename)
            if filename.lower().endswith('.dcm'):
                return full_path
            if fallback is None:
                fallback = full_path
    return fallback

In [3]:
# Build the path/pathology csv files (one per case-description subset) and print each subset's row count.
mammograms_retrieval()

Calc-Test
data_cnt: 284
Calc-Training
data_cnt: 1227
Mass-Test
data_cnt: 361
Mass-Training
data_cnt: 1231
Finished retrieval of mammogram paths!
