In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below the Kaggle
# input directory, then print the paths one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**在这次比赛中，您将自动定位并分类胸部X光片中的14种胸部异常。您将使用由经验丰富的放射科医生标注的18000张扫描图像组成的数据集：其中15000张带有独立标注的图像用于训练模型，另外3000张作为测试集用于评估。这些标注是通过VinBigData的基于web的平台VinLab收集的。关于构建数据集的详细信息，可以在我们最近的论文“VinDr-CXR：一个带有放射科医生标注的开放胸部X光数据集”中找到。**

# 1.Dicom to Numpy array

**将dicom数据转换成png/jpg看起来很简单，但需要注意：原始dicom像素值并不能通过简单的线性缩放转换为适合人眼观看的png/jpg。事实上，大多数DICOM文件是以指数尺度存储像素值的。因此，为了得到放射科医生在工作站上实际看到的图像，需要对像素值做一些变换，而如何进行这种变换的信息就保存在DICOM元数据中。
下面是我使用的示例代码：**

In [None]:
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from skimage import exposure
import cv2
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


def read_xray(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its image as an 8-bit grayscale array.

    Args:
        path: Path of the DICOM file to read.
        voi_lut: If True, pass the pixel data through pydicom's
            ``apply_voi_lut`` (the VOI LUT, when the device provides one,
            maps raw DICOM values to a "human-friendly" view). If False,
            the raw ``pixel_array`` is used.
        fix_monochrome: If True, invert the image when the file's
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``; without this
            such X-rays may look inverted.

    Returns:
        ``np.uint8`` array of the image, min-max scaled to the range 0..255.
        An image whose pixels all share one value is returned as all zeros.
    """
    # `dcmread` is the current name of the reader; `read_file` is its
    # deprecated alias.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max scale to 0..255.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by zero would produce NaN and an undefined
        # uint8 cast, so return a black image instead.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [None]:
# Load one training scan with read_xray's default settings and display it.
sample_path = '../input/vinbigdata-chest-xray-abnormalities-detection/train/0108949daa13dc94634a7d650a05c0bb.dicom'
img = read_xray(sample_path)
plt.figure(figsize=(12, 12))
plt.imshow(img, cmap='gray')

作为对比：设置 fix_monochrome = False，即不修复单色（MONOCHROME1）反相问题时的显示效果

In [None]:
# Same scan as before, this time with the MONOCHROME1 inversion switched off.
img = read_xray(
    path='../input/vinbigdata-chest-xray-abnormalities-detection/train/0108949daa13dc94634a7d650a05c0bb.dicom',
    fix_monochrome=False,
)
plt.figure(figsize=(12, 12))
plt.imshow(img, cmap='gray')

In [None]:
# Root folder of the competition data; later cells read the training DICOM
# files (`train/*.dicom`) and `train.csv` from here.
dataset_dir = '../input/vinbigdata-chest-xray-abnormalities-detection'

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Convert a DICOM file into an 8-bit grayscale numpy array.

    Performs the same conversion as `read_xray` defined earlier in this
    notebook.

    Args:
        path: Path of the DICOM file to read.
        voi_lut: If True, apply the VOI LUT (if available by DICOM device)
            via ``apply_voi_lut`` to transform raw DICOM data to a
            "human-friendly" view; otherwise use the raw ``pixel_array``.
        fix_monochrome: If True, invert the image when
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``; depending on
            this value the X-ray may otherwise look inverted.

    Returns:
        ``np.uint8`` array min-max scaled to 0..255; all zeros when every
        pixel of the image has the same value.
    """
    # `read_file` is the deprecated alias of `dcmread`.
    dicom = pydicom.dcmread(path)

    data = apply_voi_lut(dicom.pixel_array, dicom) if voi_lut else dicom.pixel_array

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    value_span = np.max(data)
    if value_span == 0:
        # A constant image cannot be rescaled (0 / 0 would give NaN and an
        # undefined uint8 cast); return black.
        return np.zeros(data.shape, dtype=np.uint8)
    return (data / value_span * 255).astype(np.uint8)
        
    
def plot_img(img, size=(7, 7), is_rgb=True, title="", cmap='gray'):
    """Display a single image in a new matplotlib figure.

    Args:
        img: Image array to show.
        size: Figure size handed to ``plt.figure(figsize=...)``.
        is_rgb: Accepted for interface compatibility; not used by this function.
        title: Text placed as the figure's suptitle.
        cmap: Colormap handed to ``imshow``.
    """
    figure = plt.figure(figsize=size)
    axes = figure.gca()
    axes.imshow(img, cmap=cmap)
    figure.suptitle(title)
    plt.show()


def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500)):
    """Display a list of images on a grid of subplots.

    Args:
        imgs: Sequence of image arrays.
        cols: Number of grid columns.
        size: Size of one grid cell; the figure is ``cols*size`` wide and
            ``rows*size`` tall.
        is_rgb: Accepted for interface compatibility; not used by this function.
        title: Text placed as the figure's suptitle.
        cmap: Colormap handed to ``imshow``.
        img_size: If not None, every image is resized with
            ``cv2.resize(img, img_size)`` before being shown.
    """
    # Ceiling division: just enough rows to hold every image. The earlier
    # `len(imgs)//cols + 1` produced an extra empty row whenever the number of
    # images was an exact multiple of `cols`. At least one row is kept so an
    # empty list still yields a valid figure size.
    rows = max(1, (len(imgs) + cols - 1) // cols)
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        ax = fig.add_subplot(rows, cols, i+1)
        ax.imshow(img, cmap=cmap)
    fig.suptitle(title)
    plt.show()
    
# def draw_bboxes(img, boxes, thickness=10, color=(255, 0, 0), img_size=(500,500)):
#     img_copy = img.copy()
#     if len(img_copy.shape) == 2:
#         img_copy = np.stack([img_copy, img_copy, img_copy], axis=-1)
#     for box in boxes:
#         img_copy = cv2.rectangle(
#             img_copy,
#             (int(box[0]), int(box[1])),
#             (int(box[2]), int(box[3])),
#             color, thickness)
#     if img_size is not None:
#         img_copy = cv2.resize(img_copy, img_size)
#     return img_copy

In [None]:
# glob returns paths in arbitrary order; sorting makes the four previewed
# scans the same on every run.
dicom_paths = sorted(glob(f'{dataset_dir}/train/*.dicom'))
imgs = [dicom2array(path) for path in dicom_paths[:4]]
plot_imgs(imgs)

可以试试直方图均衡化处理，对比前后的差异

In [None]:
# Store the equalized images under their own name: `imgs` keeps the original
# scans, so re-running this cell does not equalize already-equalized output.
imgs_equalized = [exposure.equalize_hist(img) for img in imgs]
plot_imgs(imgs_equalized)

# 2.EDA csv

In [None]:
from bokeh.plotting import figure as bokeh_figure
from bokeh.io import output_notebook, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, Panel
from bokeh.models.widgets import Tabs
import pandas as pd
from PIL import Image
from sklearn import preprocessing
import random
from random import randint

下面开始提取重要特征，预处理数据

In [None]:
def get_bbox_area(row):
    """Return the area of the bounding box described by ``row``.

    ``row`` must support lookup of the keys ``'x_min'``, ``'x_max'``,
    ``'y_min'`` and ``'y_max'``; the result is
    ``(x_max - x_min) * (y_max - y_min)``.
    """
    width = row['x_max'] - row['x_min']
    height = row['y_max'] - row['y_min']
    return width * height

# Load the annotation table and add an integer label for each radiologist id.
train_df = pd.read_csv(f'{dataset_dir}/train.csv')
le = preprocessing.LabelEncoder()  # encode rad_id
train_df['rad_label'] = le.fit_transform(train_df['rad_id'])

# Rows that carry an actual finding. `.copy()` makes this an independent frame,
# so adding a column below does not write into a slice of `train_df`
# (the pattern pandas reports as SettingWithCopyWarning).
finding_df = train_df[train_df['class_name'] != 'No finding'].copy()
# get_bbox_area only does column lookups and arithmetic, so handing it the
# whole frame computes every area in one vectorized step instead of row by row.
finding_df['bbox_area'] = get_bbox_area(finding_df)
finding_df.head()

# 2.1 Plot bounding box

In [None]:
imgs = []
img_ids = finding_df['image_id'].values
class_ids = finding_df['class_id'].unique()

# Dedicated seeded generator: the colours and the sampled images are the same
# on every run. Change the seed to look at a different sample.
rng = random.Random(42)

# map label_id to specify color (one random RGB triple per class id)
label2color = {class_id: [rng.randint(0, 255) for _ in range(3)] for class_id in class_ids}
thickness = 3
scale = 5       # images and box coordinates are shrunk by this factor before drawing
n_samples = 8   # number of images to draw


for _ in range(n_samples):
    img_id = rng.choice(img_ids)
    img_path = f'{dataset_dir}/train/{img_id}.dicom'
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/scale, fy=1/scale)
    # Replicate the grayscale image into three channels so coloured boxes can be drawn.
    img = np.stack([img, img, img], axis=-1)

    # Select this image's annotation rows once and reuse them.
    annotations = finding_df.loc[finding_df['image_id'] == img_id]
    boxes = annotations[['x_min', 'y_min', 'x_max', 'y_max']].values / scale
    # Take class_id as a Series so `labels` is always 1-D. The earlier
    # `[['class_id']].values.squeeze()` collapsed to a 0-d array when an image
    # had exactly one box, which made the zip() below raise TypeError.
    labels = annotations['class_id'].values

    for label_id, box in zip(labels, boxes):
        color = label2color[label_id]
        img = cv2.rectangle(
            img,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color, thickness
        )
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

您可以看到：每张图像中都有许多相互重叠的框。请注意，这场比赛的一个关键点是如何利用来自多位放射科医生的标注（ground truth）。我认为，处理好这一点是在比赛中取得好名次的关键。

# 2.2 Plot histogram

**我们将尝试绘制一些直方图。**

In [None]:
def hist_hover(dataframe, column, color=("#94c8d8", "#ea5e51"), bins=30, title="", value_range=None):
    """Plot an interactive Bokeh histogram of one column with hover tooltips.

    Args:
        dataframe: pandas DataFrame holding the data.
        column: Name of the column to histogram. It is also used as the
            x-axis label.
        color: Pair of colours: ``color[0]`` fills the bars, ``color[1]`` is
            the fill used while a bar is hovered.
        bins: Passed to ``np.histogram`` as ``bins``.
        title: Plot title.
        value_range: Passed to ``np.histogram`` as ``range``.

    The plot is shown inline via ``output_notebook()`` and ``show()``;
    nothing is returned.
    """
    # Missing values are left out: with NaN present np.histogram cannot
    # determine a finite range automatically and raises.
    values = dataframe[column].dropna()
    hist, edges = np.histogram(values, bins=bins, range=value_range)
    hist_frame = pd.DataFrame({
        column: hist,
        "left": edges[:-1],
        "right": edges[1:]
    })
    # Bin edges are generally fractional; "%d" truncated them, so narrow bins
    # were labelled e.g. "0 to 0". Two decimals keep the label truthful.
    hist_frame["interval"] = [
        f"{left:,.2f} to {right:,.2f}"
        for left, right in zip(edges[:-1], edges[1:])
    ]
    src = ColumnDataSource(hist_frame)
    # NOTE(review): plot_height / plot_width are kept as in the original;
    # confirm these keyword names against the installed Bokeh version.
    plot = bokeh_figure(
        plot_height=400, plot_width=600,
        title=title, x_axis_label=column,
        y_axis_label="Count"
    )
    plot.quad(
        bottom=0, top=column, left="left", right="right",
        source=src, fill_color=color[0], line_color="#35838d",
        fill_alpha=0.7, hover_fill_alpha=0.7,
        hover_fill_color=color[1]
    )
    # The braces in "@{name}" let the tooltip reference column names that
    # contain spaces or other special characters.
    hover = HoverTool(
        tooltips=[("Interval", "@interval"), ("Count", f"@{{{column}}}")]
    )
    plot.add_tools(hover)
    output_notebook()
    show(plot)
    
    
# Histogram of the class_id column over all rows of train_df.
hist_hover(dataframe=train_df, column='class_id')

**可以看到各个类别的样本数量很不平衡**

In [None]:
# Note that a key part of this competition is working with ground truth from
# multiple radiologists: histogram of the encoded radiologist label.
hist_hover(dataframe=train_df, column='rad_label')

**各位放射科医生的标注数量也很不平衡**

In [None]:
# Histogram of bounding-box area over the rows that have a finding.
hist_hover(dataframe=finding_df, column='bbox_area')

**经过一些EDA步骤，我们认识到数据集在许多方面相当不平衡。也许，我们需要用一些增广的方法来解决这个问题。数据增广还没处理，以后会继续更新**