# CheXpert触ってみた  
## CheXpertデータセット  
Stanford ML groupが公開した胸部レントゲン画像のデータセット。  
2002年10月から2017年7月までの間にスタンフォード大学附属病院で65,240人の患者に対して撮影された224,316枚の胸部レントゲン写真(正面像・側面像を含む)。  
放射線読影医の読影レポートに対してルールベースのテキスト処理を行い、14種類の診断項目のラベルをつけてある(マルチラベル)。  
オリジナル解像度のデータセット(全体で439GB)に加え、解像度を落としたデータセット(11GB程度)もダウンロード可能。  
CheXpertのサイト:https://stanfordmlgroup.github.io/competitions/chexpert/  
CheXpertの論文:https://arxiv.org/abs/1901.07031
  
### 解像度を落としたデータで中身をざっとみてみる  
解像度を落としたデータを用い、側面像は取り除き、今回は正面像のみで解析を行った

In [None]:
import pandas as pd
import numpy as np
import scipy.misc
import scipy
import os
import glob
import matplotlib.pyplot as plt
import matplotlib as mpl
from multiprocessing import Pool
import multiprocessing as multi

%matplotlib inline

def glance(path):
    """Display the image stored at ``path`` for a quick visual check.

    Args:
        path: Path of an image file that ``matplotlib.pyplot.imread`` can read.
    """
    # scipy.misc.imread no longer exists in current SciPy releases, so the
    # file is read with matplotlib, which this notebook already imports.
    img = plt.imread(path)
    plt.imshow(img)
    # Switch the colormap to gray after drawing, as the original code did.
    plt.gray()
    plt.show()
    
# Full path of the directory that contains the CheXpert-v1.0-small folder.
path = "path/to"
# Raw training label table, read from the current working directory.
labels_train_raw = pd.read_csv("train.csv")

## それぞれのラベルのついた画像を数枚ずつ確認

In [None]:
# One entry per finding: (English label name, caption printed above the
# images, row positions in labels_train_raw whose images are displayed).
_label_samples = [
    ("No Finding", "異常所見なし", (0, 5, 18)),
    ("Enlarged Cardiomediastinum", "縦隔陰影の拡大", (13, 55, 147)),
    ("Cardiomegaly", "心拡大", (132, 133, 134)),
    ("Lung Opacity", "肺野の透過性低下", (103, 104, 108)),
    ("Lung Lesion", "肺野病変", (457, 494, 554)),
    ("Edema", "肺水腫", (295, 314, 387)),
    ("Consolidation", "肺野浸潤影", (557, 1008, 1015)),
    ("Pneumonia", "肺炎", (716, 1037, 1662)),
    ("Atelectasis", "無気肺", (54, 196, 526)),
    ("Pneumothorax", "気胸", (120, 1513, 1928)),
    ("Pleural Effusion", "胸水", (330, 392, 436)),
    ("Pleural Other", "胸膜病変その他", (4891, 6112, 8861)),
    ("Fracture", "骨折", (2404, 2690, 3130)),
    ("Support Devices", "サポートデバイス", (227, 293, 406)),
]

# Print each caption once, then show its three sample images in order.
for _label_name, _caption, _row_positions in _label_samples:
    print(_caption)
    for _row in _row_positions:
        glance(os.path.join(path, labels_train_raw["Path"].iloc[_row]))

## 前処理  
正面像のみを抽出し、画像は256x256pxに整形した  
ラベルの欠測値は0で補完し、uncertain(-1)も0に置き換えた(かなり雑)

In [None]:
# Data preprocessing settings.
# Full path of the directory that contains the CheXpert-v1.0-small folder.
ROOT = "path/to"

# Label columns to extract, in the order used for the model outputs.
labels = [
    'No Finding',
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices',
]

def read_pics_process(path_df):
    """Load one image and its label row; worker function for ``Pool.map``.

    Args:
        path_df: Pair ``(rel_path, df)``. ``rel_path`` is an image path
            relative to ``ROOT`` that must occur in ``df["Path"]``; ``df`` is
            the label table containing the ``labels`` columns.

    Returns:
        ``[img, label]`` where ``img`` is the image resized to 256x256 and
        reshaped to ``(1, 256, 256, 1)``, and ``label`` holds the ``labels``
        entries of the first row of ``df`` whose ``"Path"`` equals
        ``rel_path``.

    Raises:
        ValueError: If the image is not a 2-D (single channel) array, or if
            ``rel_path`` does not occur in ``df["Path"]``.
    """
    # Local import: the file only imports `scipy` and `scipy.misc` at the top.
    from scipy import ndimage

    rel_path, df = path_df[0], path_df[1]

    # scipy.misc.imread / imresize no longer exist in current SciPy releases.
    # The file is read with matplotlib (already imported) and resampled with
    # scipy.ndimage.zoom using linear interpolation (order=1).
    # NOTE(review): pixel values may differ slightly from the old imresize
    # output, and plt.imread is expected to return uint8 for JPEG input --
    # confirm on the real data.
    img = plt.imread(os.path.join(ROOT, rel_path))
    if img.ndim != 2:
        raise ValueError(
            "expected a single-channel image, got shape %r for %r" % (img.shape, rel_path)
        )
    factors = (256.0 / img.shape[0], 256.0 / img.shape[1])
    img = ndimage.zoom(img, factors, order=1)
    img = img.reshape(1, 256, 256, 1)

    # Find the row by position and read it by position, so the lookup does
    # not depend on `df` having a default 0..n-1 index.
    hits = np.flatnonzero((df["Path"] == rel_path).values)
    if hits.size == 0:
        raise ValueError("%r is not in the 'Path' column" % (rel_path,))
    label = df.iloc[hits[0]][labels]

    print("filename:", rel_path)
    return [img, label]

def preprocessing(path):
    """Read a label CSV and keep only the frontal-view rows.

    Rows whose ``"Frontal/Lateral"`` column equals ``"Frontal"`` are kept and
    re-indexed from zero. In the ``labels`` columns, missing values and -1
    entries are both replaced with 0.

    Args:
        path: Path of the CSV file to read.

    Returns:
        Tuple ``(df_frontal, img_path)``: the cleaned frontal-only DataFrame
        and the list of its ``"Path"`` values in the same row order.
    """
    table = pd.read_csv(path)

    # Keep frontal views only and renumber the rows.
    is_frontal = table["Frontal/Lateral"] == "Frontal"
    df_frontal = table[is_frontal].reset_index(drop=True)

    # Fill missing labels with 0, then map -1 to 0 as well.
    df_frontal[labels] = df_frontal[labels].fillna(0).replace(-1, 0)

    # Image paths of the frontal rows, in the same order as df_frontal.
    img_path = df_frontal["Path"].tolist()

    return df_frontal, img_path

def save_img_arr(results, name):
    """Stack the image arrays of ``results`` and save them with ``np.save``.

    Args:
        results: Sequence of items whose element ``0`` is an image array;
            the arrays are concatenated along axis 0.
        name: Target passed to ``np.save``.
    """
    stacked = np.concatenate([entry[0] for entry in results], axis=0)
    np.save(name, stacked)

def save_label_csv(results, name):
    """Write the label entries of ``results`` to a CSV file, one row each.

    Args:
        results: Sequence of items whose element ``1`` is the label data of
            one image (anything ``pd.DataFrame`` accepts).
        name: Target passed to ``DataFrame.to_csv``; the index is not
            written.
    """
    # Each label entry becomes one column; transposing turns them into rows.
    per_image_columns = [pd.DataFrame(entry[1]) for entry in results]
    table = pd.concat(per_image_columns, axis=1).transpose()
    table.to_csv(name, index=False)


In [None]:
# Run the preprocessing.
# Full paths of the label CSV files.
path_train_csv = os.path.join(ROOT, "CheXpert-v1.0-small/train.csv")
path_valid_csv = os.path.join(ROOT, "CheXpert-v1.0-small/valid.csv")

# Cleaned frontal-only label tables and the matching image paths.
df_train_frontal, img_path_train = preprocessing(path_train_csv)
df_valid_frontal, img_path_valid = preprocessing(path_valid_csv)

# Work items: only the first 3000 training images, and all validation images.
n_jobs = multi.cpu_count()
arg_t = [(i, df_train_frontal) for i in img_path_train[:3000]]
arg_v = [(i, df_valid_frontal) for i in img_path_valid]

# One pool serves both runs; the `with` block releases the worker processes
# even if a worker raises (the original never reached p.close() in that case).
with Pool(n_jobs) as p:
    train_data = p.map(read_pics_process, arg_t)
    valid_data = p.map(read_pics_process, arg_v)

# Save the image arrays and the label tables.
save_img_arr(train_data, "Chexpert_small_train_small")
save_img_arr(valid_data, "Chexpert_small_valid")
save_label_csv(train_data, "Chexpert_small_train_label_small.csv")
save_label_csv(valid_data, "Chexpert_small_valid_label.csv")

## とりあえず試しにディープラーニングで分類してみる  
Kerasで、ImageNetで学習済みの重みを載せたXceptionを読み込み、multi-labelの分類をしてみた  

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import backend as K
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.models import Model, model_from_json
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.utils import to_categorical

%matplotlib inline

def build_model():
    """Build and compile the 14-output Xception classifier.

    The base is ``Xception`` with ImageNet weights, no top, and input shape
    ``(256, 256, 3)``. On top of it sit global average pooling, a 1024-unit
    ReLU layer, and 14 two-unit softmax heads named ``output00`` ..
    ``output13``. Each head uses ``binary_crossentropy``; the model is
    compiled with the ``rmsprop`` optimizer and the ``accuracy`` metric.

    Returns:
        The compiled Keras ``Model``.
    """
    base_model = Xception(weights='imagenet', include_top=False, input_shape=(256, 256, 3))

    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu')(x)

    # Plain comprehensions replace the exec()-generated local variables:
    # names assigned by exec() inside a function are not guaranteed to be
    # visible to a later exec() call (they are not on Python 3.13+).
    output_names = ["output" + str(i).zfill(2) for i in range(14)]
    outputs = [Dense(2, activation='softmax', name=head)(x) for head in output_names]

    model = Model(inputs=base_model.input, outputs=outputs)

    # Every base layer stays trainable (the whole network is fine-tuned).
    for layer in base_model.layers:
        layer.trainable = True

    loss_dict = {head: 'binary_crossentropy' for head in output_names}

    model.compile(optimizer='rmsprop',
                  loss=loss_dict,
                  metrics=["accuracy"])

    return model

def preprocess_input(x):
    """Rescale ``x`` so that 0 maps to -1 and 255 maps to 1.

    Args:
        x: Number or array.

    Returns:
        ``((x / 255) - 0.5) * 2``, computed in that order.
    """
    unit_scaled = x / 255.
    centered = unit_scaled - 0.5
    return centered * 2.
  

In [None]:
# Load the saved labels and image arrays and shape them for the model.
labels_train = pd.read_csv("Chexpert_small_train_label_small.csv")
labels_valid = pd.read_csv("Chexpert_small_valid_label.csv")
X_train = preprocess_input(np.load("Chexpert_small_train_small.npy"))
X_valid = preprocess_input(np.load("Chexpert_small_valid.npy"))
# Repeat the single channel three times to match the model's 3-channel input.
X_train = np.tile(X_train, (1, 1, 1, 3))
X_valid = np.tile(X_valid, (1, 1, 1, 3))

# One two-class one-hot target per output head, keyed by the head name
# ("output00" .. "output13"); column i of the label table feeds head i.
# Built directly instead of through exec()-generated variables.
y_dict_train = {
    "output" + str(i).zfill(2): keras.utils.to_categorical(labels_train.iloc[:, i], 2)
    for i in range(14)
}
y_dict_valid = {
    "output" + str(i).zfill(2): keras.utils.to_categorical(labels_valid.iloc[:, i], 2)
    for i in range(14)
}

# Build the Xception-based model and train it.
model = build_model()

# Stop after 10 epochs without val_loss improvement; keep only the best model.
es_cb = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto')
cp_cb = ModelCheckpoint(filepath='chexpert_model.styles.hdf5', monitor='val_loss', verbose=1,
                        save_best_only=True, mode='auto')

batch_size = 32
nb_epoch = 100

history = model.fit(X_train, y_dict_train, batch_size=batch_size, shuffle=True, epochs=nb_epoch,
                    validation_data=(X_valid, y_dict_valid), callbacks=[cp_cb, es_cb])

In [None]:
# Overall training and validation loss per epoch.
for _curve in ("loss", "val_loss"):
    plt.plot(history.history[_curve], "o-", label=_curve)
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(loc='upper right')
plt.show()

![loss](https://raw.githubusercontent.com/sekips/chexpert_try/master/loss_3000.png)

In [None]:
# Loss curves per label: one subplot per output head on a 5x3 grid.
fig = plt.figure(figsize=(16, 20))
axes = []
for i in range(14):
    ax = fig.add_subplot(5, 3, i + 1)
    axes.append(ax)
    # History keys are "<head>_loss" / "val_<head>_loss". They are looked up
    # directly instead of through exec(), so the label text is never spliced
    # into generated source code.
    key = "output" + str(i).zfill(2) + "_loss"
    ax.plot(history.history[key], 'o-', label=labels[i] + "_loss")
    ax.plot(history.history["val_" + key], 'o-', label="val_" + labels[i] + "_loss")
    ax.legend(loc='upper right')

plt.show()

![each_loss](https://raw.githubusercontent.com/sekips/chexpert_try/master/each_loss_3000.png)

In [None]:
# Accuracy curves per label: one subplot per output head on a 5x3 grid.
fig = plt.figure(figsize=(16, 20))
axes = []
for i in range(14):
    ax = fig.add_subplot(5, 3, i + 1)
    axes.append(ax)
    head = "output" + str(i).zfill(2)
    # Depending on the Keras/TensorFlow version, the metric is recorded as
    # "<head>_acc" or "<head>_accuracy"; use whichever key is present.
    suffix = "_acc" if head + "_acc" in history.history else "_accuracy"
    ax.plot(history.history[head + suffix], 'o-', label=labels[i] + "_acc")
    ax.plot(history.history["val_" + head + suffix], 'o-', label="val_" + labels[i] + "_acc")
    ax.set_ylim([0, 1])
    ax.legend(loc='lower right')

plt.show()

![each_acc](https://raw.githubusercontent.com/sekips/chexpert_try/master/each_acc_3000.png)

### 試しに全データで学習
#### Loss
![loss](https://raw.githubusercontent.com/sekips/chexpert_try/master/model_loss_all.png)
#### ラベルごとのloss
![each_loss](https://raw.githubusercontent.com/sekips/chexpert_try/master/each_loss_all.png)
#### ラベルごとのaccuracy  
![each_acc](https://raw.githubusercontent.com/sekips/chexpert_try/master/each_acc_all.png)