In [5]:
import pandas as pd
import cv2
import os
from glob import glob
import pydicom as dicom
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline

In [6]:
# Dataset locations. Every other path in the notebook is derived from the
# dataset root so that relocating the data only requires changing one line.
ncc_base_path = "/home/huray/data/NCC"
dicom_base_path = os.path.join(ncc_base_path, "dicom")

# Built from the root rather than via str.replace('dicom', ...), which would
# rewrite *every* occurrence of 'dicom' in the path, not only the last component.
img_save_path = os.path.join(ncc_base_path, "img_retinanet")

# Size in pixels that every exported image and mask is resized to.
# Passed to cv2.resize as (WIDTH, HEIGHT).
WIDTH = 1400
HEIGHT = 1750

In [7]:
import shutil
import os

# Recursively collect every .dcm file under the DICOM root, in sorted order.
# The pattern is built from dicom_base_path so it follows the config cell
# instead of duplicating the absolute path.
files = sorted(glob(os.path.join(dicom_base_path, "**", "*.dcm"), recursive=True))
print(files[:5])

['/home/huray/data/NCC/dicom/abn_1/_ACD_/00000/0001.dcm', '/home/huray/data/NCC/dicom/abn_1/_ACD_/00000/0002.dcm', '/home/huray/data/NCC/dicom/abn_1/_ACD_/00000/0003.dcm', '/home/huray/data/NCC/dicom/abn_1/_ACD_/00000/0004.dcm', '/home/huray/data/NCC/dicom/abn_1/_ACD_/00001/0001.dcm']


In [8]:
# Convert every DICOM in `files` to an 8-bit, 3-channel JPEG and collect one
# annotation row per bounding box (img_path, x1, y1, x2, y2, class_name) for
# each data split (abn_1 / abn_2 / normal). Images without any usable box get a
# single row whose coordinates and class are empty strings.
abn1_img_path = []
abn1_x1 = []
abn1_y1 = []
abn1_x2 = []
abn1_y2 = []
abn1_class_name = []

abn2_img_path = []
abn2_x1 = []
abn2_y1 = []
abn2_x2 = []
abn2_y2 = []
abn2_class_name = []

normal_img_path = []
normal_x1 = []
normal_y1 = []
normal_x2 = []
normal_y2 = []
normal_class_name = []

errored_data_counter = 0
files_with_wrong_size = []

# Column lists for each split, in row order, so a row can be appended without
# repeating the same six appends per split.
columns_by_split = {
    'abn_1': (abn1_img_path, abn1_x1, abn1_y1, abn1_x2, abn1_y2, abn1_class_name),
    'abn_2': (abn2_img_path, abn2_x1, abn2_y1, abn2_x2, abn2_y2, abn2_class_name),
    'normal': (normal_img_path, normal_x1, normal_y1, normal_x2, normal_y2, normal_class_name),
}

MIN_DCM_SIZE_MB = 1   # DICOM files at or below this size are skipped
MIN_BOX_AREA = 100    # bounding boxes smaller than this (in pixels) are ignored

for mammo_path in files:
    # Skip DICOM files of 1 MB or less. This is checked before parsing so that
    # an undersized file is never handed to the DICOM reader.
    if os.path.getsize(mammo_path) / (1024 * 1024) <= MIN_DCM_SIZE_MB:
        files_with_wrong_size.append(mammo_path)
        print('DCM file is too small ' + mammo_path)
        continue

    # dcmread is the current name of the deprecated read_file alias.
    mammo_dcm = dicom.dcmread(mammo_path)

    try:
        mammo_arr = mammo_dcm.pixel_array.astype(np.uint16)
    except AttributeError:  # occasionally the file itself is malformed
        try:
            pixel_data = mammo_dcm[0x7fe0, 0x0010].value  # raw pixel data stored in the file
            rows = mammo_dcm[0x0028, 0x0010].value  # row count from the metadata
            cols = mammo_dcm[0x0028, 0x0011].value  # column count from the metadata

            # np.frombuffer replaces the deprecated binary mode of np.fromstring.
            # The last byte is dropped exactly as before.
            # NOTE(review): presumably these files carry one trailing byte - confirm.
            mammo_arr = np.frombuffer(pixel_data[:-1], dtype=np.uint16)
            mammo_arr = np.reshape(mammo_arr, (rows, cols))
        except (ValueError, KeyError):
            # ValueError: byte count does not match uint16 / rows x cols.
            # KeyError: one of the tags is missing from the dataset.
            print('corrupted file: ' + mammo_path[:70])
            errored_data_counter += 1
            continue
        else:
            print('Attribute error" ' + mammo_path[:70])
    except Exception as e:
        # Unexpected failure: report it and stop the run rather than hide it.
        print('different error: ' + mammo_path[:70])
        print(e)
        errored_data_counter += 1
        raise

    # Min-max normalise to 0..255. A constant image has zero range and would
    # divide by zero, so it is reported and skipped instead.
    pixel_min = np.amin(mammo_arr)
    pixel_range = np.amax(mammo_arr) - pixel_min
    if pixel_range == 0:
        print('constant image: ' + mammo_path[:70])
        errored_data_counter += 1
        continue

    mammo_arr_final = (mammo_arr - pixel_min) / pixel_range * 255
    mammo_arr_final = mammo_arr_final.astype(np.uint8)
    mammo_arr_final = cv2.resize(mammo_arr_final, (WIDTH, HEIGHT))
    # Replicate the single channel three times to get an H x W x 3 uint8 image.
    mammo_arr_final = np.dstack((mammo_arr_final, mammo_arr_final, mammo_arr_final))

    # The annotation mask is a .jpg stored next to the .dcm with the same stem.
    mask_path = os.path.splitext(mammo_path)[0] + '.jpg'
    if 'normal' in mammo_path:
        # Normal studies have no annotation: use an all-zero mask with the same
        # (rows, cols) layout and dtype as a resized mask image.
        mask_arr = np.zeros((HEIGHT, WIDTH, 3), dtype=np.uint8)
    else:
        mask_arr = cv2.imread(mask_path)
        if mask_arr is None:  # cv2.imread returns None when the file cannot be read
            print("no image size for mask! " + mask_path)
            continue
        mask_arr = cv2.resize(mask_arr, (WIDTH, HEIGHT))

    # Keep pixels whose channels fall in [0..80, 0..80, 81..255]. cv2.imread
    # yields BGR, so this selects strongly red pixels.
    lower = np.array([0, 0, 81], np.uint8)
    upper = np.array([80, 80, 255], np.uint8)
    mask = cv2.inRange(mask_arr, lower, upper)

    ret, thresh = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)
    # findContours returns (image, contours, hierarchy) on OpenCV 3.x and
    # (contours, hierarchy) on 4.x; index -2 is the contour list on both.
    contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

    # One image can carry several annotations, so collect all boxes first.
    boxes = []
    for c in contours:
        # boundingRect returns the top-left corner plus width w and height h.
        x, y, w, h = cv2.boundingRect(c)
        if w * h < MIN_BOX_AREA:
            continue
        boxes.append((x, y, x + w, y + h, 'M'))

    if not boxes:
        # No usable annotation (no contours, or all of them below the area
        # threshold): still export the image once, with an empty annotation.
        # Previously an image whose contours were all filtered out was dropped.
        boxes.append(('', '', '', '', ''))

    for data_sub_path in ('abn_1', 'abn_2', 'normal'):
        if data_sub_path in mammo_path:
            break
    else:
        raise ValueError('cannot determine data split for ' + mammo_path)

    _split = mammo_path.split('/')
    img_file_name = '{}-{}-{}'.format(_split[-3], _split[-2], _split[-1].replace('.dcm', ''))

    # cv2.imwrite does not create directories and only signals failure through
    # its return value, so create the target folder and check the result.
    split_dir = os.path.join(img_save_path, data_sub_path)
    os.makedirs(split_dir, exist_ok=True)

    # NOTE(review): each box is written as its own copy of the image
    # (suffix _0, _1, ...), so every copy carries exactly one annotation row.
    for i, box in enumerate(boxes):
        mammo_jpg_path = os.path.join(split_dir, img_file_name + '_' + str(i) + '.jpg')
        if not cv2.imwrite(mammo_jpg_path, mammo_arr_final):
            raise IOError('failed to write ' + mammo_jpg_path)

        for column, value in zip(columns_by_split[data_sub_path], (mammo_jpg_path,) + box):
            column.append(value)

DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00027/30001.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00027/30002.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00027/30003.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00027/30004.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00033/20001.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00033/20002.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00033/20003.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00033/20004.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00059/10001.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00059/10002.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00059/10003.dcm
DCM file is too small /home/huray/data/NCC/dicom/normal/normal/00059/10004.dcm
DCM file is too small /home/huray/data/NCC/dicom/nor

In [9]:
# Report how many files failed to decode and which files were skipped for
# being too small.
for label, value in (
    ("ERRORED DATA COUNT: ", errored_data_counter),
    ("FILES WITH WRONG SIZE COUNT: ", len(files_with_wrong_size)),
    ("FILES WITH WRONG SIZE: ", files_with_wrong_size),
):
    print(label, value)

ERRORED DATA COUNT:  0
FILES WITH WRONG SIZE COUNT:  16
FILES WITH WRONG SIZE:  ['/home/huray/data/NCC/dicom/normal/normal/00027/30001.dcm', '/home/huray/data/NCC/dicom/normal/normal/00027/30002.dcm', '/home/huray/data/NCC/dicom/normal/normal/00027/30003.dcm', '/home/huray/data/NCC/dicom/normal/normal/00027/30004.dcm', '/home/huray/data/NCC/dicom/normal/normal/00033/20001.dcm', '/home/huray/data/NCC/dicom/normal/normal/00033/20002.dcm', '/home/huray/data/NCC/dicom/normal/normal/00033/20003.dcm', '/home/huray/data/NCC/dicom/normal/normal/00033/20004.dcm', '/home/huray/data/NCC/dicom/normal/normal/00059/10001.dcm', '/home/huray/data/NCC/dicom/normal/normal/00059/10002.dcm', '/home/huray/data/NCC/dicom/normal/normal/00059/10003.dcm', '/home/huray/data/NCC/dicom/normal/normal/00059/10004.dcm', '/home/huray/data/NCC/dicom/normal/normal/00065/20001.dcm', '/home/huray/data/NCC/dicom/normal/normal/00065/20002.dcm', '/home/huray/data/NCC/dicom/normal/normal/00065/20003.dcm', '/home/huray/data/N

In [10]:
_annotation_columns = ['img_path', 'x1', 'y1', 'x2', 'y2', 'class_name']


def _annotation_frame(*column_values):
    """Build an annotation DataFrame from six parallel lists.

    The lists are given in the order img_path, x1, y1, x2, y2, class_name and
    the returned frame has its columns in that same order.
    """
    frame = pd.DataFrame(dict(zip(_annotation_columns, column_values)))
    return frame[_annotation_columns]


# One frame per data split, built from the lists filled by the conversion loop.
abn1_data_df = _annotation_frame(
    abn1_img_path, abn1_x1, abn1_y1, abn1_x2, abn1_y2, abn1_class_name)
abn2_data_df = _annotation_frame(
    abn2_img_path, abn2_x1, abn2_y1, abn2_x2, abn2_y2, abn2_class_name)
normal_data_df = _annotation_frame(
    normal_img_path, normal_x1, normal_y1, normal_x2, normal_y2, normal_class_name)

# Row counts: overall first, then per split.
abn1_count = len(abn1_data_df)
abn2_count = len(abn2_data_df)
normal_count = len(normal_data_df)

print('TOTAL DATA COUNTS: ', abn1_count + abn2_count + normal_count)
print('ABN1 DATA COUNTS: ', abn1_count)
print('ABN2 DATA COUNTS: ', abn2_count)
print('NORMAL DATA COUNTS: ', normal_count)

TOTAL DATA COUNTS:  816
ABN1 DATA COUNTS:  238
ABN2 DATA COUNTS:  225
NORMAL DATA COUNTS:  353


In [11]:
# Write one header-less, index-less CSV per data split into the image output
# folder. The paths are derived from img_save_path so they stay in sync with
# the config cell instead of repeating the absolute path.
os.makedirs(img_save_path, exist_ok=True)
for split_df, csv_name in (
    (abn1_data_df, 'data_abn1.csv'),
    (abn2_data_df, 'data_abn2.csv'),
    (normal_data_df, 'data_normal.csv'),
):
    split_df.to_csv(os.path.join(img_save_path, csv_name), header=False, index=False)