# Part 1 - Traditional image processing for classification: feature extraction
使用传统图像处理方法构造更多特征

In [None]:
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# 进度条库
from tqdm import tqdm_notebook
from matplotlib.patches import Rectangle
import seaborn as sns
# 医学图像处理库
import pydicom as dcm
%matplotlib inline 
plt.set_cmap(plt.cm.bone)
IS_LOCAL = True
import os
import cv2
# sk的图像处理库
import skimage
from skimage import feature, filters
from tqdm import tqdm

# Root directory of the RSNA pneumonia detection challenge input data
PATH="../input/rsna-pneumonia-detection-challenge"

# List the directory contents as a quick check that the data is reachable
print(os.listdir(PATH))

In [None]:
plt.rcParams['font.sans-serif'] = ['KaiTi'] # default font; some plot titles below contain Chinese text
# Turn off the Unicode minus sign on the axes
plt.rcParams['axes.unicode_minus'] = False 

In [None]:
# Join the training labels (one row per bounding box) with the detailed class
# description of each image.
# NOTE(review): the class-info table may repeat a patient once per box; exact
# duplicate rows are dropped before the join so those patients are not
# multiplied in the merged table. class_info_df itself is left unchanged.
class_info_df = pd.read_csv(PATH+'/stage_2_detailed_class_info.csv')
train_labels_df = pd.read_csv(PATH+'/stage_2_train_labels.csv')
train_class_df = train_labels_df.merge(class_info_df.drop_duplicates(), on='patientId', how='inner')

In [None]:
# Preview the first rows of the merged table
train_class_df.head()

In [None]:
# Look at the distinct image classes
train_class_df['class'].unique()

### 定义一些常用函数

In [None]:

def load_images(data):
    """
    Load the DICOM image of every patient id listed in ``data``.

    Args:
        data: object whose ``'patientId'`` entry iterates over patient id
            strings; each id names a ``<id>.dcm`` file in the
            ``stage_2_train_images`` folder under ``PATH``.

    Returns:
        list: one pixel array per id, in the order of ``data['patientId']``.
    """
    imgs = []
    for patient_id in data['patientId']:
        image_path = os.path.join(PATH, "stage_2_train_images/", patient_id + '.dcm')
        # dcmread is the supported pydicom reader; read_file is a deprecated alias
        imgs.append(dcm.dcmread(image_path).pixel_array)
    return imgs

def imshow_gray(img):
    """
    Open a new 12x7 figure and draw a single image with the gray colormap.

    Returns the object produced by ``plt.imshow``.
    """
    plt.figure(figsize=(12, 7))
    shown = plt.imshow(img, cmap='gray')
    return shown
    
def imshow_with_labels(img, patient_id):
    """
    Draw the labelled bounding boxes of one patient onto ``img`` and show it.

    The boxes are read from the module-level ``train_labels_df``. The
    rectangles are drawn directly on ``img`` (it is modified in place), so
    pass a copy when the original must stay untouched.

    Args:
        img: image array to draw on.
        patient_id: value matched against the ``patientId`` column.

    Returns:
        The object produced by ``plt.imshow``.
    """
    # All label rows that belong to this patient (one row per box)
    rows = train_labels_df[train_labels_df['patientId'] == patient_id]
    # Rows with missing (NaN) coordinates cannot be cast to int, so skip them
    boxes = rows[['x', 'y', 'width', 'height']].dropna()
    for x, y, w, h in boxes.itertuples(index=False, name=None):
        x, y, w, h = int(x), int(y), int(w), int(h)
        cv2.rectangle(img, (x, y), (x + w, y + h), 255, 2)
    plt.figure(figsize=(12, 7))
    return plt.imshow(img, cmap='gray')

## 采样几张图片来进行图像处理

In [None]:
# Randomly sample four pneumonia rows (Target == 1); the draw is not seeded
test_df = train_class_df[train_class_df['Target']==1].sample(4)
# Bounding-box columns of the sampled rows
box = test_df.loc[test_df.index, ['x', 'y', 'width', 'height']]
# Load the images of the first three sampled rows only
test = load_images(test_df[0:3])

In [None]:
# Show the three sampled rows whose images were loaded
test_df[0:3]

In [None]:
# First sampled image
idx = 0
img = test[idx]
# Draw on a copy so the loaded image stays unmodified; column 0 is passed as the patient id
imshow_with_labels(img.copy(), test_df.iloc[idx,0])

In [None]:
# Second sampled image
idx = 1
img = test[idx]
# Draw on a copy so the loaded image stays unmodified; column 0 is passed as the patient id
imshow_with_labels(img.copy(), test_df.iloc[idx,0])

## 图像增强

### 1.直方图均衡化

In [None]:
# Histogram equalization of test[idx]; idx keeps whatever value was assigned last
equ = cv2.equalizeHist(test[idx])
ax = imshow_gray(equ)

In [None]:
# Show the equalized image with the labelled boxes drawn on a copy
imshow_with_labels(equ.copy(), test_df.iloc[idx,0])

直方图均衡化增强了肺部的对比度，图片明显变亮了一些，并进一步突出了不透明区域的存在

### 2.锐化

In [None]:
# Sharpen with a 3x3 high-pass filter (all -1, centre 9)
hpf_kernel = np.full((3, 3), -1)
hpf_kernel[1,1] = 9
im_hp = cv2.filter2D(equ, -1, hpf_kernel)

# Sharpen with an unsharp mask
im_us = skimage.filters.unsharp_mask(equ)

# Compare the two sharpening results side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,9))
ax1.imshow(im_hp, cmap='gray')
ax1.set_title('高通 filter')
ax2.imshow(im_us, cmap='gray')
ax2.set_title('虚光 filter')
fig.suptitle('锐化')

高通滤波器的锐化效果更明显

### 3.阈值化


In [None]:
# Otsu thresholding on a Gaussian-blurred copy of the sharpened image
ret, otsu = cv2.threshold(cv2.GaussianBlur(im_hp,(7,7),0),0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
# Local (adaptive) thresholding with a block size of 5
local = im_hp > skimage.filters.threshold_local(im_hp, 5)
# Mean thresholding
mean = im_hp > skimage.filters.threshold_mean(im_hp)

# Compare the three binarizations side by side
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16,9))
ax1.imshow(otsu, cmap='gray')
ax1.set_title('Otsu thresholding')
ax2.imshow(local, cmap='gray')
ax2.set_title('Local thresholding')
ax3.imshow(mean, cmap='gray')
ax3.set_title('Mean thresholding')
fig.suptitle('Image thresholding')

Otsu的表现最好

### 边缘检测

In [None]:
# Sobel operator on the Otsu-binarized image
sobel = filters.sobel(otsu)
# Canny detector on the binarized image divided by 255
canny = feature.canny(otsu/255)

# Compare the two edge maps side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,9))
ax1.imshow(canny, cmap='gray')
ax1.set_title('Canny edge detection')
ax2.imshow(sobel, cmap='gray')
ax2.set_title('Sobel operator')
fig.suptitle('Edge detection')

sobel分割出的边界比canny的更清晰

### 肺部分割

查找并画出肺部的轮廓

In [None]:
# Find the external contours of the Sobel edge map (scaled by 255 and cast to uint8)
contours, hier = cv2.findContours((sobel * 255).astype('uint8'),cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
print('Contours found ', len(contours))

# Treat the contour with the most points as the lung outline
srt_contours = sorted(contours, key=lambda x: x.shape[0], reverse=True)
select_contour = srt_contours[0]  # probably not the best assumption

# Draw on a separate copy under its own name, so the `test` list of loaded
# images is not overwritten and the earlier cells stay re-runnable
contour_canvas = img.copy()
img_contour = cv2.drawContours(contour_canvas, [select_contour], 0, (255,0,0), thickness=3)

imshow_gray(img_contour)

识别出肺部区域后，我们可以提取该区域轮廓的矩中心（质心）。由于所有 X 光图像的尺寸相同，因此这可能是一个对预测有效的特征

In [None]:
# Centroid of the selected contour from its spatial moments
M = cv2.moments(select_contour)
if M['m00'] != 0:
    cx = int(M['m10'] / M['m00'])
    cy = int(M["m01"] / M["m00"])
else:
    # Zero-area contour: the moment ratio is undefined, so fall back to the
    # mean of the contour points
    cx, cy = (int(v) for v in select_contour.reshape(-1, 2).mean(axis=0))

# Draw on a separate copy under its own name, so the `test` list of loaded
# images is not overwritten
centroid_canvas = img.copy()
cv2.circle(centroid_canvas, (cx, cy), 7, (255, 255, 255), -1)
imshow_gray(centroid_canvas)

但是有些图像中的拍摄对象略有旋转，并且大小不同，因此我们希望所用的矩特征对旋转和缩放保持不变，所以我们选择 **Hu 矩**。我们还会对 Hu 矩取对数，以便于比较；同时去掉第 3 个矩（因为它依赖于其他矩）和第 7 个矩（因为它用于区分镜像，而数据集中没有翻转的图像）

But there are images where the subject is slightly rotated and differently sized, so we want the moment features to be invariant to rotation and scale. So we pick **Hu moments**. We also take the log of the moments to make them easy to compare, and we drop the 3rd moment, as it depends on the other moments, and the 7th moment, as it distinguishes mirror images and there are no flipped images in the dataset.

In [None]:
def get_hu_moments(contour):
    """
    Return the log-scaled Hu moments of ``contour`` without the 3rd and 7th.

    Args:
        contour: contour as accepted by ``cv2.moments``.

    Returns:
        list: 5 values, ``-sign(h) * log10(|h|)`` for each kept Hu moment.
    """
    # Compute the moments of the argument rather than of a global contour
    M = cv2.moments(contour)
    hu = cv2.HuMoments(M).ravel().tolist()
    # Drop the 3rd moment, then the last (7th) one
    del hu[2]
    del hu[-1]
    log_hu = [-np.sign(a)*np.log10(np.abs(a)) for a in hu]
    return log_hu

get_hu_moments(select_contour)

### 特征
1. 不透明区域
2. 肺的周长
3. 不规则程度
4. 直径
5. 原始图像的均值，标准差
6. hu 矩


In [None]:
def area(img):
    """Count the non-zero pixels of a binary image."""
    nonzero_pixels = np.count_nonzero(img)
    return nonzero_pixels

def perimeter(img):
    """Count the non-zero pixels of an edge map."""
    edge_pixels = np.count_nonzero(img)
    return edge_pixels

def irregularity(area, perimeter):
    """
    Return ``4 * pi * area / perimeter ** 2`` (also called compactness).

    Args:
        area: area of the image region.
        perimeter: perimeter of the image region.
    """
    numerator = 4 * np.pi * area
    denominator = perimeter ** 2
    return numerator / denominator

def equiv_diam(area):
    """Return ``sqrt(4 * area / pi)`` for the given image area."""
    scaled = (4 * area) / np.pi
    return np.sqrt(scaled)

def get_hu_moments(contour):
    """Return the log-scaled Hu moments of ``contour`` without the 3rd and 7th (5 values)."""
    moments = cv2.moments(contour)
    hu_all = cv2.HuMoments(moments).ravel().tolist()
    # Keep everything except the 3rd moment and the last (7th) one
    kept = hu_all[:2] + hu_all[3:-1]
    return [-np.sign(value)*np.log10(np.abs(value)) for value in kept]

### 图像预处理和特征提取

In [None]:
def extract_features(img):
    """
    Run the preprocessing pipeline on one image and return its feature tuple.

    Steps: statistics of the raw image, histogram equalization, 3x3 high-pass
    sharpening, Otsu binarization of a Gaussian-blurred copy, Sobel edge
    detection, and selection of the external contour with the most points.

    Args:
        img: image array as accepted by ``cv2.equalizeHist``
            (presumably 8-bit single-channel -- confirm against the data).

    Returns:
        tuple: ``(mean, std_dev, area, perimeter, irregularity,
        equivalent diameter, *hu)`` -- 6 scalar features followed by the
        5 log-scaled Hu moments.
    """
    # Statistics of the untouched image come first
    raw_mean = img.mean()
    raw_std = img.std()

    # Histogram equalization
    img_eq = cv2.equalizeHist(img)

    # High-pass sharpening: every weight is -1 except the centre, which is 9
    kernel = np.full((3, 3), -1)
    kernel[1, 1] = 9
    img_sharp = cv2.filter2D(img_eq, -1, kernel)

    # Otsu thresholding of a blurred copy
    blurred = cv2.GaussianBlur(img_sharp, (7, 7), 0)
    _, mask = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Edge detection
    edge_map = skimage.filters.sobel(mask)

    # Contours of the edge map (scaled by 255, cast to uint8); keep the one with the most points
    edge_u8 = (edge_map * 255).astype('uint8')
    found, _ = cv2.findContours(edge_u8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    by_size = sorted(found, key=lambda c: c.shape[0], reverse=True)
    largest = by_size[0]

    # Feature extraction
    opaque_area = area(mask)                       # non-zero pixels of the binary image
    edge_length = perimeter(edge_map)              # non-zero pixels of the edge map
    compactness = irregularity(opaque_area, edge_length)
    diameter = equiv_diam(opaque_area)
    hu_feats = get_hu_moments(largest)

    # 6 scalar features plus the 5 Hu-moment features
    return (raw_mean, raw_std, opaque_area, edge_length, compactness, diameter, *hu_feats)

In [None]:
# Try the function on the image currently held in `img`
extract_features(img)

# 保存特征

## 加载数据
只选择正常和肺炎图像进行模型构建

In [None]:
# Pneumonia images: ids with at least one label row where Target == 1
positive_mask = train_labels_df['Target'] == 1
pneumonia_ids = train_labels_df.loc[positive_mask, 'patientId'].unique()
pneumonia_labels = [1] * len(pneumonia_ids)

# Normal images: ids whose detailed class is 'Normal'
normal_mask = class_info_df['class'] == 'Normal'
normal_ids = class_info_df.loc[normal_mask, 'patientId'].unique()
normal_labels = [0] * len(normal_ids)

# Pneumonia entries first, then normal ones; ids and targets stay aligned
data = {
    'patientId': np.concatenate((pneumonia_ids, normal_ids)),
    'target': np.concatenate((pneumonia_labels, normal_labels)),
}

print(f'Pneumonia images: {len(pneumonia_ids)}\nNormal images: {len(normal_ids)}')

## 生成特征
对于每个 ID，从图像生成特征并将其存储在数据集中

In [None]:
from tqdm import tqdm

In [None]:
features = []

for path in tqdm(data['patientId']):
    # Load the image that belongs to this patient id
    patientImage = path + '.dcm'
    imagePath = os.path.join(PATH,"stage_2_train_images/", patientImage)
    # dcmread is the supported pydicom reader; read_file is a deprecated alias
    img = dcm.dcmread(imagePath).pixel_array
    # Extract the feature tuple of this image
    feats = extract_features(img)
    # Each image contributes one tuple to the `features` list
    features.append(feats)

data['features'] = features
# data now holds three entries: patientId, target, features

In [None]:
# One row per image: patientId, target and the feature tuple in a single 'features' column
df = pd.DataFrame(data)
# Write the table to img_features.csv in the working directory
df.to_csv('img_features.csv')

生成特征后，可以使用机器学习模型加载和训练它们以执行分类。