In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for current_dir, _, names_in_dir in os.walk('/kaggle/input'):
    for file_path in (os.path.join(current_dir, name) for name in names_in_dir):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the gdcm package from the conda-forge channel (-y skips the confirmation prompt).
# NOTE(review): presumably needed so pydicom can decode compressed pixel data — confirm; the version is not pinned.
!conda install gdcm -c conda-forge -y


In [None]:
import os
import glob
import warnings
warnings.filterwarnings("ignore")

import cv2
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
from tqdm import tqdm

import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
# Root directory of the competition data; every CSV and DICOM path below is built from it.
INPUT_PATH = '/kaggle/input/siim-covid19-detection' 

In [None]:
# Load the image-level training table from train_image_level.csv.
train_image_level = pd.read_csv(f'{INPUT_PATH}/train_image_level.csv')

In [None]:
# Preview the first 10 rows of the image-level table.
train_image_level.head(10)

In [None]:
# Report how many rows the image-level training table contains.
print("Picture-level training set data volume:", len(train_image_level))

In [None]:
# Tally the image-level rows by the first space-separated token of their `label` string.
label_type = dict()
for label_text in train_image_level['label']:
    label = label_text.split(' ')[0]
    label_type[label] = label_type.get(label, 0) + 1
print(label_type)


In [None]:
# Show the raw `boxes` value of the row labelled 0.
train_image_level.loc[0, 'boxes']

In [None]:
# Show the raw `label` value of the row labelled 0.
train_image_level.loc[0, 'label']

In [None]:
# Load the study-level training table from train_study_level.csv.
train_study_level = pd.read_csv(f'{INPUT_PATH}/train_study_level.csv')

In [None]:
# Preview the first 5 rows of the study-level table.
train_study_level.head(5)

In [None]:
# Report how many rows the study-level training table contains.
print("The amount of training data at the research level:", len(train_study_level))

In [None]:
# Values in columns 1-4 of the first study-level row, as a plain Python list.
train_study_level.head(1).values.tolist()[0][1:5]

In [None]:
# Work on a deep copy so `train_study_level` itself is left untouched.
train_study_level1 = train_study_level.copy(deep=True)
# Derive the join key by stripping the '_study' suffix from the study-level id.
train_study_level1['StudyInstanceUID'] = train_study_level1['id'].str.replace('_study', '', regex=False)

# Check whether any study id occurs more than once
flag = train_study_level1['StudyInstanceUID'].duplicated()
print("Whether there is a duplicate study number：", flag.any())

# Drop the study-level id, then merge the image-level and study-level tables
del train_study_level1['id']
train_df = pd.merge(train_image_level, train_study_level1, how='left', on='StudyInstanceUID')

# Attach to every row the number of image-level rows that share its study.
group_col = 'StudyInstanceUID'
count_df = pd.DataFrame(train_df.groupby(group_col)['id'].count())
count_df.columns = [f'{group_col}_count']
train_df = train_df.merge(count_df.reset_index(), on=group_col)

# Special cases: studies that have more than one image-level row.
one_study_multi_image_df = train_df[train_df[f'{group_col}_count'] > 1]
print("Number of special cases：", len(one_study_multi_image_df))

# Remove the special cases, keeping only studies with exactly one image-level row.
# .copy() makes the result an independent frame, so columns added to train_df
# afterwards are not written to a slice of the merged frame.
train_df = train_df[train_df[f'{group_col}_count'] == 1].copy()
print("Sample size after removing special cases：", len(train_df))

In [None]:
# Load sample_submission.csv from the input directory.
sample_submission = pd.read_csv(f'{INPUT_PATH}/sample_submission.csv')

In [None]:
# Display the sample submission table.
sample_submission

In [None]:
# Print a summary of the sample submission table (columns, dtypes, non-null counts).
sample_submission.info()

In [None]:
# For each split, print the number of top-level entries and the number of .dcm files three levels down.
for split_label, split_dir in (("train:", "train"), ("test :", "test")):
    entry_count = len(os.listdir(f'{INPUT_PATH}/{split_dir}'))
    dcm_count = len(glob.glob(f'{INPUT_PATH}/{split_dir}/*/*/*.dcm'))
    print(split_label, entry_count, dcm_count)

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixel data as an 8-bit array.

    Args:
        path: Path of the DICOM file to read.
        voi_lut: If True, pass the raw pixels through ``apply_voi_lut``
            (the VOI LUT, when provided by the DICOM device, transforms raw
            data into a "human-friendly" view); otherwise use
            ``pixel_array`` unchanged.
        fix_monochrome: If True and the file's PhotometricInterpretation is
            "MONOCHROME1", invert the intensities so the X-ray does not
            look inverted.

    Returns:
        ``np.uint8`` array, min-max scaled to 0..255. An image with no
        intensity range (all pixels equal) is returned as all zeros.
    """
    # dcmread is the current reader; read_file is a deprecated alias of it.
    dicom = pydicom.dcmread(path)
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    # Shift so the minimum is 0, then scale so the maximum is 1.
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        # Skip the division for a constant image: 0/0 would yield NaN,
        # which has no defined uint8 value.
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data
        
    
def plot_img(img, size=(7, 7), is_rgb=True, title="", cmap='gray'):
    """Show one image in a new figure.

    Args:
        img: Image array handed to ``imshow``.
        size: Figure size in inches, as (width, height).
        is_rgb: Not used by this function.
        title: Figure-level title.
        cmap: Colormap handed to ``imshow``.
    """
    figure = plt.figure(figsize=size)
    axes = figure.add_subplot(1, 1, 1)
    axes.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()

def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500)):
    """Show a list of images on a grid with ``cols`` columns.

    Args:
        imgs: Sequence of image arrays.
        cols: Number of grid columns.
        size: Side length, in inches, allotted to each grid cell.
        is_rgb: Not used by this function; kept so existing calls still work.
        title: Figure-level title.
        cmap: Colormap handed to ``imshow``.
        img_size: Size passed to ``cv2.resize`` for every image, or None to
            keep each image at its own size.
    """
    # Ceiling division: exactly as many rows as the images need. The former
    # ``len(imgs)//cols + 1`` appended an empty row whenever the image count
    # was a multiple of ``cols``. At least one row is kept so an empty list
    # still yields a valid (blank) figure.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        ax = fig.add_subplot(rows, cols, i+1)
        ax.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()

# Convert the first four training DICOM files found by the glob and show them on a grid.
dicom_paths = glob.glob(f'{INPUT_PATH}/train/*/*/*.dcm')
imgs = list(map(dicom2array, dicom_paths[:4]))
# Note: image sizes are not uniform, e.g. (2336, 2836) vs (3488, 4256).
print(imgs[0].shape, imgs[1].shape)
plot_imgs(imgs)


In [None]:
def pie_plot(train_df, variable):
    """Draw a pie chart of the value distribution of one column.

    Args:
        train_df: DataFrame that contains the column.
        variable: Name of the column to summarize, e.g. "target".

    The chart has one wedge per distinct value, labelled with the value and
    its share in percent, and is titled with the column name.
    """
    # number of rows per distinct value of the column
    value_counts = train_df[variable].value_counts()

    plt.figure(figsize=(20, 10))
    plt.pie(value_counts, labels=value_counts.index, autopct="%1.1f%%")
    # Title follows the plotted column instead of a fixed 'target'.
    plt.title(variable)
    plt.show()
    
# Collapse the study-level flags into one text column, defaulting to the negative class.
train_df['target'] = 'Negative for Pneumonia'
# Later columns in this tuple override earlier ones if more than one flag is set.
for positive_col in ('Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance'):
    train_df.loc[train_df[positive_col] == 1, 'target'] = positive_col
print(train_df['target'].value_counts())
pie_plot(train_df, 'target')

In [None]:
# First `boxes` entry of the filtered frame.
train_df['boxes'].iloc[0]  # x_min, y_min, width, height

In [None]:
# First `label` entry of the filtered frame.
train_df['label'].iloc[0]  # x_min, y_min, x_max, y_max

In [None]:
# The three positive study-level classes.
class_names = ['Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']
# Distinct combinations of the three class flags that occur in train_df.
unique_classes = np.unique(train_df[class_names].to_numpy(), axis=0)
unique_classes

In [None]:
# Both lookups are keyed by the string form of a row's
# [typical, indeterminate, atypical] flag list.

# Drawing colour per class; the negative class has no colour.
label2color = dict([
    ('[1, 0, 0]', [255, 0, 0]),  # Typical Appearance
    ('[0, 1, 0]', [0, 255, 0]),  # Indeterminate Appearance
    ('[0, 0, 1]', [0, 0, 255]),  # Atypical Appearance
    ('[0, 0, 0]', None),         # negative
])

# Short display name per class.
label2target = dict([
    ('[1, 0, 0]', 'typical'),
    ('[0, 1, 0]', 'indeterminate'),
    ('[0, 0, 1]', 'atypical'),
    ('[0, 0, 0]', 'negative'),
])

In [None]:
# Drawing parameters for the bounding-box overlays.
THICKNESS = 3  # line thickness handed to cv2.rectangle
SCALE = 5      # images and box coordinates are divided by this factor
# Text settings handed to cv2.getTextSize / cv2.putText.
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 1
FONT_THICKNESS = 2
FONT_LINE_TYPE = cv2.LINE_AA

In [None]:
# Draw the first 8 rows of train_df with their bounding boxes and class name.
imgs = []
for _, row in train_df.iloc[:8].iterrows():
    study_id = row['StudyInstanceUID']
    img_path = glob.glob(f'{INPUT_PATH}/train/{study_id}/*/*.dcm')[0]
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/SCALE, fy=1/SCALE)  # shrink by SCALE
    img = np.stack([img, img, img], axis=-1)  # repeat the gray channel three times for coloured overlays

    class_key = str(row[class_names].values.tolist())
    color = label2color[class_key]
    target = label2target[class_key]

    # `label` is a flat, space-separated run of 6-token groups, e.g.
    # 'opacity 1 789.28836 582.43035 1815.94498 2499.73327 opacity 1 ...';
    # the last four tokens of each group are the box corners.
    tokens = row['label'].split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        group = tokens[start:start + 6]
        bbox = [float(token) / SCALE for token in group[2:]]
        if len(group) == 6:
            bboxes.append(bbox)

    text_width, text_height = cv2.getTextSize(target, FONT, FONT_SCALE, FONT_THICKNESS)[0]
    for box in bboxes:
        x_min, y_min, x_max, y_max = (int(value) for value in box)
        img = cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, THICKNESS)
        box_width = x_max - x_min
        # Centre the class name horizontally over the box, 10 px above its top edge.
        text_origin = (x_min - (text_width - box_width)//2, y_min - 10)
        img = cv2.putText(img, target, text_origin,
                          FONT, FONT_SCALE, color, FONT_THICKNESS, FONT_LINE_TYPE)
    img = cv2.resize(img, (500, 500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)


In [None]:
# Draw the first 8 rows with 'Typical Appearance' == 1, with boxes and class name.
imgs = []
typical_rows = train_df[train_df['Typical Appearance'] == 1].iloc[:8]
for _, row in typical_rows.iterrows():
    study_id = row['StudyInstanceUID']
    img_path = glob.glob(f'{INPUT_PATH}/train/{study_id}/*/*.dcm')[0]
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/SCALE, fy=1/SCALE)  # shrink by SCALE
    img = np.stack([img, img, img], axis=-1)  # repeat the gray channel three times

    class_key = str(row[class_names].values.tolist())
    color = label2color[class_key]
    target = label2target[class_key]

    # `label` is a space-separated run of 6-token groups; tokens 2..5 of each
    # group are the box corners.
    tokens = row['label'].split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        group = tokens[start:start + 6]
        bbox = [float(token) / SCALE for token in group[2:]]
        if len(group) == 6:
            bboxes.append(bbox)

    text_width, text_height = cv2.getTextSize(target, FONT, FONT_SCALE, FONT_THICKNESS)[0]
    for box in bboxes:
        x_min, y_min, x_max, y_max = (int(value) for value in box)
        img = cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, THICKNESS)
        box_width = x_max - x_min
        # Centre the class name horizontally over the box, 10 px above its top edge.
        text_origin = (x_min - (text_width - box_width)//2, y_min - 10)
        img = cv2.putText(img, target, text_origin,
                          FONT, FONT_SCALE, color, FONT_THICKNESS, FONT_LINE_TYPE)
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)


In [None]:
# Draw the first 8 rows with 'Indeterminate Appearance' == 1, with boxes and class name.
imgs = []
for _, row in train_df[train_df['Indeterminate Appearance'] == 1].iloc[:8].iterrows():
    study_id = row['StudyInstanceUID']
    img_path = glob.glob(f'{INPUT_PATH}/train/{study_id}/*/*.dcm')[0]
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/SCALE, fy=1/SCALE)  # shrink by SCALE
    img = np.stack([img, img, img], axis=-1)  # repeat the gray channel three times

    claz = row[class_names].values
    color = label2color[str(claz.tolist())]
    target = label2target[str(claz.tolist())]

    # `label` is a space-separated run of 6-token groups; the first two
    # tokens of each group are skipped, the remaining four are the box corners.
    bboxes = []
    bbox = []
    for token_idx, token in enumerate(row['label'].split(' ')):
        if token_idx % 6 in (0, 1):
            continue
        bbox.append(float(token)/SCALE)
        if token_idx % 6 == 5:
            bboxes.append(bbox)
            bbox = []

    # Measure the text for THIS row's class name. Without this the cell
    # silently reused a width left over from a previously executed cell.
    text_width, text_height = cv2.getTextSize(target, FONT, FONT_SCALE, FONT_THICKNESS)[0]
    for box in bboxes:
        img = cv2.rectangle(
            img,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color, THICKNESS
        )
        box_width = int(box[2]) - int(box[0])
        # Centre the class name horizontally over the box, 10 px above its top edge.
        img = cv2.putText(img, target, (int(box[0])-(text_width-box_width)//2, int(box[1])-10),
                        FONT, FONT_SCALE, color, FONT_THICKNESS, FONT_LINE_TYPE)
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)


In [None]:
# Draw the first 8 rows with 'Atypical Appearance' == 1, with boxes and class name.
imgs = []
for _, row in train_df[train_df['Atypical Appearance'] == 1].iloc[:8].iterrows():
    study_id = row['StudyInstanceUID']
    img_path = glob.glob(f'{INPUT_PATH}/train/{study_id}/*/*.dcm')[0]
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/SCALE, fy=1/SCALE)  # shrink by SCALE
    img = np.stack([img, img, img], axis=-1)  # repeat the gray channel three times

    claz = row[class_names].values
    color = label2color[str(claz.tolist())]
    target = label2target[str(claz.tolist())]

    # `label` is a space-separated run of 6-token groups; the first two
    # tokens of each group are skipped, the remaining four are the box corners.
    bboxes = []
    bbox = []
    for token_idx, token in enumerate(row['label'].split(' ')):
        if token_idx % 6 in (0, 1):
            continue
        bbox.append(float(token)/SCALE)
        if token_idx % 6 == 5:
            bboxes.append(bbox)
            bbox = []

    # Measure the text for THIS row's class name. Without this the cell
    # silently reused a width left over from a previously executed cell.
    text_width, text_height = cv2.getTextSize(target, FONT, FONT_SCALE, FONT_THICKNESS)[0]
    for box in bboxes:
        img = cv2.rectangle(
            img,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color, THICKNESS
        )
        box_width = int(box[2]) - int(box[0])
        # Centre the class name horizontally over the box, 10 px above its top edge.
        img = cv2.putText(img, target, (int(box[0])-(text_width-box_width)//2, int(box[1])-10),
                        FONT, FONT_SCALE, color, FONT_THICKNESS, FONT_LINE_TYPE)
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)


In [None]:
# Alternate negative and typical rows (even positions negative, odd positions
# typical) so the two classes can be compared side by side.
imgs = []

# Filter once instead of on every loop iteration.
negative_rows = train_df[train_df['Negative for Pneumonia']==1]
typical_rows = train_df[train_df['Typical Appearance']==1]

for i in range(8):
    if i % 2 == 0:
        row = negative_rows.iloc[i]
    else:
        row = typical_rows.iloc[i]

    study_id = row['StudyInstanceUID']
    # Restrict the match to .dcm files so only DICOM files are handed to dicom2array.
    img_path = glob.glob(f'{INPUT_PATH}/train/{study_id}/*/*.dcm')[0]
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/SCALE, fy=1/SCALE)  # shrink by SCALE
    img = np.stack([img, img, img], axis=-1)  # repeat the gray channel three times

    claz = row[class_names].values
    color = label2color[str(claz.tolist())]
    target = label2target[str(claz.tolist())]

    # `label` is a space-separated run of 6-token groups; the first two
    # tokens of each group are skipped, the remaining four are the box corners.
    # The index gets its own name so it no longer shadows the outer loop's `i`.
    bboxes = []
    bbox = []
    for token_idx, token in enumerate(row['label'].split(' ')):
        if token_idx % 6 in (0, 1):
            continue
        bbox.append(float(token)/SCALE)
        if token_idx % 6 == 5:
            bboxes.append(bbox)
            bbox = []

    # Measure the text for THIS row's class name. Without this the cell
    # silently reused a width left over from a previously executed cell.
    text_width, text_height = cv2.getTextSize(target, FONT, FONT_SCALE, FONT_THICKNESS)[0]
    for box in bboxes:
        img = cv2.rectangle(
            img,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color, THICKNESS
        )
        box_width = int(box[2]) - int(box[0])
        # Centre the class name horizontally over the box, 10 px above its top edge.
        img = cv2.putText(img, target, (int(box[0])-(text_width-box_width)//2, int(box[1])-10),
                        FONT, FONT_SCALE, color, FONT_THICKNESS, FONT_LINE_TYPE)
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)


In [None]:
# Write the sample submission table to submission.csv without the index column.
# NOTE(review): no predictions are filled in by the code shown in this notebook.
sample_submission.to_csv('submission.csv', index = False)