## png 파일명에서 어노테이션과 dcm 파일을 찾아낸 후, csv 파일로 아래 형식대로 저장.
-> path/to/image.jpg,x1,y1,x2,y2,class_name

예:  
/data/imgs/img_001.jpg,837,346,981,456,cow  
/data/imgs/img_002.jpg,215,312,279,391,cat  
/data/imgs/img_002.jpg,22,5,89,84,bird

### 일단은 png로 뽑힌 mass만 사용. calc는 어노테이션 처리가 복잡하고 정확도도 충분하므로.

In [1]:
import pandas as pd
import cv2
import os
from glob import glob
import pydicom as dicom
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline

In [2]:
# Input locations: the raw INbreast folder, its DICOM images, and the
# mass-annotation mask PNGs.
inb_base_path = "/home/huray/data/inbreast/raw_files"
dicom_base_path, anno_base_path = (
    os.path.join(inb_base_path, sub_dir)
    for sub_dir in ("AllDICOMs", "extras/mass")
)

# Output folder for the converted mammogram images.
img_save_path = "/home/huray/data/inbreast/img"

# Size (in pixels) that images and masks are resized to before saving.
WIDTH, HEIGHT = 1400, 1750

In [3]:
# List every DICOM image and every mass-mask PNG, ordered by file name.
dcm_files = sorted(glob(dicom_base_path + "/*.dcm"))
anno_files = sorted(glob(anno_base_path + "/*.png"))

# Show the first few entries of each listing as a sanity check.
for listing in (dcm_files, anno_files):
    print(listing[:5])

['/home/huray/data/inbreast/raw_files/AllDICOMs/20586908_6c613a14b80a8591_MG_R_CC_ANON.dcm', '/home/huray/data/inbreast/raw_files/AllDICOMs/20586934_6c613a14b80a8591_MG_L_CC_ANON.dcm', '/home/huray/data/inbreast/raw_files/AllDICOMs/20586960_6c613a14b80a8591_MG_R_ML_ANON.dcm', '/home/huray/data/inbreast/raw_files/AllDICOMs/20586986_6c613a14b80a8591_MG_L_ML_ANON.dcm', '/home/huray/data/inbreast/raw_files/AllDICOMs/20587054_b6a4f750c6df4f90_MG_R_CC_ANON.dcm']
['/home/huray/data/inbreast/raw_files/extras/mass/20586908_mask.png', '/home/huray/data/inbreast/raw_files/extras/mass/20586934_mask.png', '/home/huray/data/inbreast/raw_files/extras/mass/20586960_mask.png', '/home/huray/data/inbreast/raw_files/extras/mass/20586986_mask.png', '/home/huray/data/inbreast/raw_files/extras/mass/20587612_mask.png']


In [4]:
# Pair every mass-mask PNG with the DICOM file(s) whose file name starts with
# the same ID (the text before the first underscore).
#
# IDs are compared for exact equality: a substring test could silently pair a
# mask with an unrelated DICOM whose ID merely contains the mask's ID.
# Indexing the DICOMs once also avoids rescanning the whole list per mask.
dcm_by_number = {}
for dcm in dcm_files:
    dcm_number = os.path.basename(dcm).split('_')[0]
    dcm_by_number.setdefault(dcm_number, []).append(dcm)

# Parallel lists: dcms_with_anno[i] is the DICOM belonging to matched_anno[i].
dcms_with_anno = []
matched_anno = []

for anno in anno_files:
    anno_number = os.path.basename(anno).split('_')[0]

    # A mask without a matching DICOM contributes nothing; a mask matching
    # several DICOMs contributes one pair per DICOM, in sorted file order.
    for dcm in dcm_by_number.get(anno_number, []):
        dcms_with_anno.append(dcm)
        matched_anno.append(anno)


print(len(dcms_with_anno))
print(dcms_with_anno[:5])
print(matched_anno[:5])

107
['/home/huray/data/inbreast/raw_files/AllDICOMs/20586908_6c613a14b80a8591_MG_R_CC_ANON.dcm', '/home/huray/data/inbreast/raw_files/AllDICOMs/20586934_6c613a14b80a8591_MG_L_CC_ANON.dcm', '/home/huray/data/inbreast/raw_files/AllDICOMs/20586960_6c613a14b80a8591_MG_R_ML_ANON.dcm', '/home/huray/data/inbreast/raw_files/AllDICOMs/20586986_6c613a14b80a8591_MG_L_ML_ANON.dcm', '/home/huray/data/inbreast/raw_files/AllDICOMs/20587612_f4b2d377f43ba0bd_MG_R_CC_ANON.dcm']
['/home/huray/data/inbreast/raw_files/extras/mass/20586908_mask.png', '/home/huray/data/inbreast/raw_files/extras/mass/20586934_mask.png', '/home/huray/data/inbreast/raw_files/extras/mass/20586960_mask.png', '/home/huray/data/inbreast/raw_files/extras/mass/20586986_mask.png', '/home/huray/data/inbreast/raw_files/extras/mass/20587612_mask.png']


### anno 파일에서 바운딩박스 검출

In [5]:
# For every matched (mask, DICOM) pair: save the mammogram as an 8-bit,
# 3-channel image resized to WIDTH x HEIGHT, then record one bounding box per
# external contour found in the equally resized mask.
# The rows are collected in parallel lists, one list per CSV column.
img_path = []
x1 = []
y1 = []
x2 = []
y2 = []
class_name = []

# cv2.imwrite does not create missing directories and only reports failure
# through its return value, so make sure the output folder exists up front.
os.makedirs(img_save_path, exist_ok=True)

for anno_path, dcm_path in zip(matched_anno, dcms_with_anno):
    # dcmread is pydicom's supported reader (read_file is its deprecated alias).
    dcm = dicom.dcmread(dcm_path)
    anno = cv2.imread(anno_path, cv2.IMREAD_GRAYSCALE)
    if anno is None:
        # cv2.imread returns None instead of raising when a file is unreadable.
        raise IOError("Could not read annotation mask: " + anno_path)

    # --- Mammogram image ---
    mammo_arr = dcm.pixel_array
    mammo_arr = mammo_arr.astype(np.uint16)

    # Rescale intensities to 0~255. A constant image has no range to stretch,
    # so it maps to all zeros rather than dividing by zero.
    lowest = np.amin(mammo_arr)
    highest = np.amax(mammo_arr)
    if highest > lowest:
        mammo_arr_final = (mammo_arr - lowest) / (highest - lowest) * 255
    else:
        mammo_arr_final = np.zeros(mammo_arr.shape)
    mammo_arr_final = mammo_arr_final.astype(np.uint8)
    mammo_arr_final = cv2.resize(mammo_arr_final, (WIDTH, HEIGHT))
    # Replicate the single grey channel three times to get a 3-channel image.
    mammo_arr_final = np.asarray(np.dstack([mammo_arr_final] * 3), dtype=np.uint8)

    # Save the mammogram; the file is named after the mask ID
    # ("<id>_mask.png" -> "<id>.jpg").
    mammo_jpg_path = os.path.join(
        img_save_path,
        os.path.basename(anno_path).replace('.png', '.jpg').replace('_mask', ''),
    )
    if not cv2.imwrite(mammo_jpg_path, mammo_arr_final):
        raise IOError("Could not write image: " + mammo_jpg_path)

    # --- Annotation mask ---
    # Resize the mask like the image, then turn every non-zero pixel
    # (including interpolated edge values) into foreground.
    mask_arr = cv2.resize(anno, (WIDTH, HEIGHT))
    ret, thresh = cv2.threshold(mask_arr, 0, 255, cv2.THRESH_BINARY)

    # findContours returns (image, contours, hierarchy) in OpenCV 3.x but
    # (contours, hierarchy) in 2.x/4.x; the contours are always second to last.
    contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

    # Flag masks that do not contain exactly one region so they can be checked
    # by hand (for the mass masks inspected so far these were not a problem).
    if len(contours) != 1:
        print(str(len(contours)) + " CONTOURS! CHECK THIS: " + anno_path)

    for cnt in contours:
        # boundingRect returns the top-left corner plus width w and height h.
        x, y, w, h = cv2.boundingRect(cnt)

        # Store the box as (x1, y1, x2, y2) with the class label 'M'.
        img_path.append(mammo_jpg_path)
        x1.append(x)
        y1.append(y)
        x2.append(x + w)
        y2.append(y + h)
        class_name.append('M')

2 CONTOURS! CHECK THIS: /home/huray/data/inbreast/raw_files/extras/mass/20586908_mask.png
2 CONTOURS! CHECK THIS: /home/huray/data/inbreast/raw_files/extras/mass/22427840_mask.png
2 CONTOURS! CHECK THIS: /home/huray/data/inbreast/raw_files/extras/mass/22579730_mask.png
2 CONTOURS! CHECK THIS: /home/huray/data/inbreast/raw_files/extras/mass/22613770_mask.png
2 CONTOURS! CHECK THIS: /home/huray/data/inbreast/raw_files/extras/mass/51049107_mask.png


In [6]:
# Assemble the collected rows into a table whose column order matches the
# CSV layout: image path, x1, y1, x2, y2, class name.
column_order = ['img_path', 'x1', 'y1', 'x2', 'y2', 'class_name']
column_values = [img_path, x1, y1, x2, y2, class_name]
df = pd.DataFrame(dict(zip(column_order, column_values)), columns=column_order)

print('DATA COUNTS: ', len(df))

DATA COUNTS:  112


In [7]:
# Write the rows as plain CSV, with no header line and no index column.
df.to_csv(
    '/home/huray/data/inbreast/data.csv',
    index=False,
    header=False,
)