### csv로 이미지를 읽어와서, 좌표값과 클래스가 들어가있는 csv를 생성  
-> path/to/image.jpg,x1,y1,x2,y2,class_name

예:  
/data/imgs/img_001.jpg,837,346,981,456,cow  
/data/imgs/img_002.jpg,215,312,279,391,cat  
/data/imgs/img_002.jpg,22,5,89,84,bird

In [8]:
import pandas as pd
import cv2
import os
from glob import glob
import pydicom as dicom
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline

In [9]:
cbis_base_path = "/home/huray/data/CBIS-DDSM"
dicom_base_path = os.path.join(cbis_base_path, "dicom")

img_save_path = "/home/huray/data/CBIS-DDSM/img"

WIDTH = 1400
HEIGHT = 1750

In [10]:
calc_csv_files = glob("{}/csv/calc_*.csv".format(cbis_base_path))
mass_csv_files = glob("{}/csv/mass_*.csv".format(cbis_base_path))
print(calc_csv_files)
print(mass_csv_files)

## to see examples
df = pd.read_csv(mass_csv_files[1], header=0)
df.head()

['/home/huray/data/CBIS-DDSM/csv/calc_case_description_train_set.csv', '/home/huray/data/CBIS-DDSM/csv/calc_case_description_test_set.csv']
['/home/huray/data/CBIS-DDSM/csv/mass_case_description_train_set.csv', '/home/huray/data/CBIS-DDSM/csv/mass_case_description_test_set.csv']


Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00016,4,LEFT,CC,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...
1,P_00016,4,LEFT,MLO,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....
2,P_00017,2,LEFT,CC,1,mass,ROUND,CIRCUMSCRIBED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...
3,P_00017,2,LEFT,MLO,1,mass,ROUND,ILL_DEFINED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....
4,P_00032,3,RIGHT,CC,1,mass,ROUND,OBSCURED,0,BENIGN,2,Mass-Test_P_00032_RIGHT_CC/1.3.6.1.4.1.9590.10...,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....


### 이미지와 마스크를 읽어오기(마스크인지 crop된 패치인지 구분해줘야 함)

In [11]:
# 일단 테스트셋 하나만 가지고 할까.
# cbis_df = pd.read_csv('/home/huray/data/CBIS-DDSM/csv/mass_case_description_test_set.csv', header=0)

# cbis_df = cbis_df[:100]

train_img_path = []
train_x1 = []
train_y1 = []
train_x2 = []
train_y2 = []
train_class_name = []

test_img_path = []
test_x1 = []
test_y1 = []
test_x2 = []
test_y2 = []
test_class_name = []

errored_data_counter = 0

for csv in calc_csv_files+mass_csv_files:
    if 'calc' in csv:
        if 'train_set' in csv:
            img_sub_path = 'calc_train'
        elif 'test_set' in csv:
            img_sub_path = 'calc_test'
        else:
            raise
    elif 'mass' in csv:
        if 'train_set' in csv:
            img_sub_path = 'mass_train'
        elif 'test_set' in csv:
            img_sub_path = 'mass_test'
        else:
            raise
        
    
    cbis_df = pd.read_csv(csv, header=0)

    for i in range(len(cbis_df)):
        if 'BENIGN_WITHOUT_CALLBACK' in cbis_df['pathology']:
            continue

        # Image file path의 split[0]이 root 경로.
        mammo_relative_root_path = cbis_df['image file path'][i].split('/')[0]
        mammo_root_path = os.path.join(dicom_base_path, mammo_relative_root_path)
        mammo_dcm_path = glob(mammo_root_path + "/**/**/*.dcm")
        if len(mammo_dcm_path) == 0:
            print("NO DCM HERE!", mammo_root_path)
            errored_data_counter += 1
            continue
        mammo_dcm_path = mammo_dcm_path[0]
    #     print(mammo_dcm_path, '\n -------------------')

        mask_relative_root_path = cbis_df['ROI mask file path'][i].split('/')[0]
        mask_root_path = os.path.join(dicom_base_path, mask_relative_root_path)
        tmp_mask_dcm_path = sorted(glob(mask_root_path + "/**/**/*.dcm"))
        if len(tmp_mask_dcm_path) == 0:
            print("NO DCM HERE! ", tmp_mask_dcm_path)
            errored_data_counter += 1
            continue
        if len(tmp_mask_dcm_path) == 1:
            print("only 1 DCM in mask path:", tmp_mask_dcm_path )
            if os.path.getsize(tmp_mask_dcm_path[0])/1024/1024 >= 3 :
                mask_dcm_path = tmp_mask_dcm_path[0]
            else:
                print("NO MASK! ", tmp_mask_dcm_path)
                errored_data_counter += 1
                continue
        else: # mask와 패치가 둘 다 잘 들어있는 경우.
            for f in tmp_mask_dcm_path: # 패치와 mask가 파일 뒤섞인 경우 있으므로 사이즈 비교로 확인: 
                if os.path.getsize(tmp_mask_dcm_path[1])/1024/1024 >= 3 : # 일반적으론 000001이 mask. 얘가 5메가 이상이면 마스크인걸로.
                    mask_dcm_path = tmp_mask_dcm_path[1]
                elif os.path.getsize(tmp_mask_dcm_path[0])/1024/1024 >= 3 :
                    mask_dcm_path = tmp_mask_dcm_path[0]
                else:
                    print('both files are not bigger than 3MB')
                    raise
            
            
    #     print(tmp_mask_dcm_path, '\n -------------------')

    #     print(mask_dcm_path, '\n -------------------')


        mammo_dcm = dicom.read_file(mammo_dcm_path)
        mask_dcm = dicom.read_file(mask_dcm_path)

        try:
            mammo_arr = mammo_dcm.pixel_array
            mammo_arr = mammo_arr.astype(np.uint16)
        except AttributeError: # 종종 파일 자체가 문제가 있는 경우 있음.
            try:
                pixel_data = mammo_dcm[0x7fe0,0x0010].value # 파일 자체에 저장된 pixel_data값
                rows = mammo_dcm[0x0028, 0x0010].value # metadata로 들어있는 row
                cols = mammo_dcm[0x0028, 0x0011].value # metadata로 들어있는col

                mammo_arr = np.fromstring(pixel_data[:-1], dtype=np.uint16)
                mammo_arr = np.reshape(mammo_arr, (rows, cols))
            except ValueError:
                print('corrupted file: ' + mammo_dcm_path[:70])
                errored_data_counter += 1
                continue
            else:
                print('Attribute error" ' + mammo_dcm_path[:70])
        except:
            print('different error: ' + mammo_dcm_path[:70])
            errored_data_counter += 1
            continue

        try:
            mask_arr = mask_dcm.pixel_array
            mask_arr = mask_arr.astype(np.uint8)
        except AttributeError: # 종종 파일 자체가 문제가 있는 경우 있음.
            try:
                pixel_data = mammo_dcm[0x7fe0,0x0010].value # 파일 자체에 저장된 pixel_data값
                rows = mammo_dcm[0x0028, 0x0010].value # metadata로 들어있는 row
                cols = mammo_dcm[0x0028, 0x0011].value # metadata로 들어있는col

                mask_arr = np.fromstring(pixel_data[:-1], dtype=np.uint8)
                mask_arr = np.reshape(mask_arr, (rows, cols, -1))
            except ValueError:
                print('corrupted file: ' + mask_dcm_path[:70])
                errored_data_counter += 1
                continue
            else:
                print('Attribute error" ' + mask_dcm_path[:70])
        except:
            print('different error: ' + mask_dcm_path[:70])
            abnormal += 1
            continue

        # 논문 내용대로 0~255로 리스케일.
        mammo_arr_final = (mammo_arr - np.amin(mammo_arr))/(np.amax(mammo_arr) - np.amin(mammo_arr)) * 255
        mammo_arr_final = mammo_arr_final.astype(np.uint8)
        mammo_arr_final = cv2.resize(mammo_arr_final, (WIDTH, HEIGHT))
        mammo_arr_final = np.asarray(np.dstack((mammo_arr_final, mammo_arr_final, mammo_arr_final)), dtype=np.uint8)

        mask_arr = cv2.resize(mask_arr, (WIDTH, HEIGHT))
        mask_arr[mask_arr >= 1] = 255

        ret, thresh = cv2.threshold(mask_arr, 127, 255, cv2.THRESH_BINARY)
        image, contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

        if len(contours) > 1: # 혹시 annotation이 끊어져있을 경우를 대비.
            contours = [max(contours, key=len)]

        cnt = contours[0]

        x,y,w,h = cv2.boundingRect(cnt) # 좌상단 꼭지점과 거기서부터의 가로길이 w, 세로길이 h가 return 됨.
    #     print(x,y,w,h)

        # 제대로 bounding box 만들고있는지 체크하기 위한 코드
    #     mask_arr_color = cv2.cvtColor(mask_arr, cv2.COLOR_GRAY2BGR)
    #     cv2.rectangle(mask_arr_color, (x, y), (x+w, y+h), (0,0,255), 3)

    #     plt.imshow(mask_arr_color)
    #     plt.title('image')
    #     plt.show() 



        #### mammo를 이미지로 저장
        mammo_jpg_path = os.path.join(img_save_path, img_sub_path, mask_dcm_path.split("/")[6]+".jpg")
        cv2.imwrite(mammo_jpg_path, mammo_arr_final)
        
        if 'train_set' in csv:
            train_img_path.append(mammo_jpg_path)
            train_x1.append(x)
            train_y1.append(y)
            train_x2.append(x+w)
            train_y2.append(y+h)
#             train_class_name.append(cbis_df['pathology'][i][0] + '_' + cbis_df['abnormality type'][i][:4])
            train_class_name.append(cbis_df['pathology'][i])
            
        elif 'test_set' in csv:
            test_img_path.append(mammo_jpg_path)
            test_x1.append(x)
            test_y1.append(y)
            test_x2.append(x+w)
            test_y2.append(y+h)
#             test_class_name.append(cbis_df['pathology'][i][0] + '_' + cbis_df['abnormality type'][i][:4])
            test_class_name.append(cbis_df['pathology'][i])
            
        else:
            raise

corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00011_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00016_LEFT_MLO_2/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00019_RIGHT_CC_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00024_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00034_RIGHT_MLO_2/1.3
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00049_RIGHT_CC_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00105_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00151_RIGHT_MLO_1/1.3
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00162_LEFT_CC_1/1.3.6
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00181_RIGHT_CC_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_00182_LEFT_CC_1/1.3.6
corrupted file: /home/huray/data/CBIS-DDSM/

corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01387_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01422_RIGHT_MLO_1/1.3
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01424_LEFT_CC_1/1.3.6
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01446_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01459_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01497_LEFT_CC_1/1.3.6
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01524_LEFT_CC_1/1.3.6
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01524_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01548_LEFT_CC_1/1.3.6
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01549_LEFT_CC_1/1.3.6
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Training_P_01571_RIGHT_MLO_1/1.3
corrupted file: /home/huray/data/CBIS-DDSM/

corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Test_P_02198_LEFT_CC_1/1.3.6.1.4
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Test_P_02275_LEFT_CC_1/1.3.6.1.4
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Test_P_02275_LEFT_MLO_1/1.3.6.1.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Test_P_02418_RIGHT_MLO_1/1.3.6.1
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Test_P_02420_RIGHT_CC_1/1.3.6.1.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Test_P_02420_RIGHT_MLO_1/1.3.6.1
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Test_P_02464_RIGHT_CC_1/1.3.6.1.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Calc-Test_P_02501_RIGHT_MLO_1/1.3.6.1
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_00004_RIGHT_MLO_1/1.3
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_00023_RIGHT_CC_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_00034_RIGHT_MLO_1/1.3
corrupted file: /home/huray/data/CBIS-DDSM/

corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01394_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01394_LEFT_MLO_2/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01423_RIGHT_CC_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01433_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01439_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01442_RIGHT_CC_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01445_RIGHT_MLO_1/1.3
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01474_RIGHT_MLO_1/1.3
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01485_LEFT_CC_1/1.3.6
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01485_LEFT_MLO_1/1.3.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Training_P_01495_RIGHT_MLO_1/1.3
corrupted file: /home/huray/data/CBIS-DDSM/

corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Test_P_01661_LEFT_CC_1/1.3.6.1.4
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Test_P_01661_LEFT_MLO_1/1.3.6.1.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Test_P_01765_RIGHT_CC_1/1.3.6.1.
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Test_P_01765_RIGHT_MLO_1/1.3.6.1
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Test_P_01807_RIGHT_MLO_1/1.3.6.1
corrupted file: /home/huray/data/CBIS-DDSM/dicom/Mass-Test_P_01865_LEFT_MLO_1/1.3.6.1.


In [12]:
train_data_df = pd.DataFrame({'img_path':train_img_path, 'x1':train_x1, 'y1':train_y1, 'x2':train_x2, 'y2':train_y2, 'class_name':train_class_name})
train_data_df = train_data_df[['img_path', 'x1', 'y1', 'x2', 'y2', 'class_name']]

test_data_df = pd.DataFrame({'img_path':test_img_path, 'x1':test_x1, 'y1':test_y1, 'x2':test_x2, 'y2':test_y2, 'class_name':test_class_name})
test_data_df = test_data_df[['img_path', 'x1', 'y1', 'x2', 'y2', 'class_name']]

whole_data_df = train_data_df.append(test_data_df, ignore_index=True)

print('DATA COUNTS: ', len(train_data_df)+len(test_data_df))
print('Train data COUNTS: ', len(train_data_df))
print('Test data COUNTS: ', len(test_data_df))

DATA COUNTS:  3185
Train data COUNTS:  2579
Test data COUNTS:  606


In [13]:
train_data_df.to_csv('/home/huray/data/CBIS-DDSM/img/data_train.csv', header=False, index=False)
test_data_df.to_csv('/home/huray/data/CBIS-DDSM/img/data_test.csv', header=False, index=False)
whole_data_df.to_csv('/home/huray/data/CBIS-DDSM/img/data_whole.csv', header=False, index=False)

In [14]:
print(errored_data_counter)

383
