In [None]:
from glob import glob 
import numpy as np
import pandas as pd
import keras,cv2,os

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, BatchNormalization, Activation
from keras.layers import Conv2D, MaxPool2D
from keras.optimizers import RMSprop, Adam

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [None]:
# Root folder of the competition data; adapt this path when running locally.
path = "../input/histopathologic-cancer-detection/"
#path = "/histopathologic-cancer-detection/"
train_path = path + 'train/'
test_path = path + 'test/'

# Glob pattern that matches every .tif file in the training folder.
tif_pattern = os.path.join(train_path, '*.tif')
print(tif_pattern)

# One row per training image: its file path plus the id parsed from the file name.
tiff_path = pd.DataFrame({'path': glob(tif_pattern)})
# Derive the id from the file's base name instead of a fixed position in the
# '/'-split path, so the extraction keeps working when `path` has a different
# directory depth (e.g. after adapting it for a local run).
tiff_path['id'] = [os.path.splitext(os.path.basename(p))[0] for p in tiff_path['path']]
print(tiff_path.head())

# Label table; it is joined to the file list through the shared 'id' column.
data = pd.read_csv(path + 'train_labels.csv')

# Attach the label columns to every image path that has a matching id.
df_data = tiff_path.merge(data, on='id')
df_data.head()


In [None]:
# Images flagged as anomalous; drop them from the working frame in one pass.
anomalous_ids = [
    'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2',
    '9369c7278ec8bcc6c880d99194de09fc2bd4efbe',
    '0883a7e019a80bac0d0e61e21320f7fd30b37a46',
]
df_data = df_data[~df_data['id'].isin(anomalous_ids)]


In [None]:
# Preview the first 12 rows of df_data as a 3x4 grid, each titled with its label.
fig = plt.figure(figsize=(5, 4), dpi=150)
# Vertical spacing is a figure-level setting, so it is applied once, not per subplot.
fig.subplots_adjust(hspace=0.5)
for i in range(12):
    ax = fig.add_subplot(3, 4, i + 1, xticks=[], yticks=[])
    img = cv2.imread(df_data['path'].iloc[i])
    # OpenCV decodes images in BGR channel order while matplotlib expects RGB;
    # convert so the preview shows the true colours.
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    # Use positional access for the label too: the frame's index has gaps after
    # rows were filtered out, so label-based `[i]` may not match `.iloc[i]`.
    ax.set_title('Label:' + str(df_data['label'].iloc[i]))
fig.savefig('picture.png', dpi=300)


In [None]:
from sklearn.model_selection import train_test_split as ts
from sklearn.utils import shuffle

# Number of images drawn from EACH class, giving a balanced set of 2 * SAMPLE_SIZE rows.
SAMPLE_SIZE = 60000

# Random, seeded sample of SAMPLE_SIZE rows from class 0 ...
df_0 = df_data[df_data['label'] == 0].sample(SAMPLE_SIZE, random_state=1234)
# ... and the same number of rows from class 1.
df_1 = df_data[df_data['label'] == 1].sample(SAMPLE_SIZE, random_state=1234)

# Stack both classes, renumber the rows (drop=True discards the old index) and
# shuffle. The shuffle is seeded so the splits below are reproducible from run to run.
df_data = shuffle(pd.concat([df_0, df_1], axis=0).reset_index(drop=True),
                  random_state=1234)

# Hold out 10% of the balanced data, stratified by label.
y = df_data['label']
df_train, df_test = ts(df_data, test_size=0.1, random_state=101, stratify=y)

# Split the held-out part again: 90% stays as the test set, 10% becomes the
# validation set. Stratified as well so both parts stay class-balanced.
df_test, df_val = ts(df_test, test_size=0.1, random_state=101,
                     stratify=df_test['label'])


# Working directories for the three splits. NOTE: train_path / test_path are
# rebound here from the raw input folders to the new base_dir folders.
train_path = 'base_dir/train'
valid_path = 'base_dir/valid'
test_path = 'base_dir/test'

# Create one sub-folder per class ('0' and '1') inside every split folder.
# exist_ok=True keeps the cell re-runnable: without it a second run raises
# FileExistsError because the folders already exist.
for fold in [train_path, valid_path, test_path]:
    for i in ['0', '1']:
        os.makedirs(os.path.join(fold, i), exist_ok=True)



In [None]:
# Report the (rows, columns) shape of the training, validation and test frames.
for caption, split_df in (('训练集shape', df_train),
                          ('验证集shape', df_val),
                          ('测试集shape', df_test)):
    print(caption, split_df.shape)

In [None]:
# Show the contents of the base folder and of each split folder.
for folder in ('base_dir', train_path, valid_path, test_path):
    print(os.listdir(folder))

In [None]:
# Index the frame by image id so a label can be looked up with
# df_data.loc[image_id, 'label'].
# Guarded and rebound (not in-place) so re-running this cell is a no-op instead
# of raising KeyError once 'id' has already been moved into the index.
if 'id' in df_data.columns:
    df_data = df_data.set_index('id')

In [None]:
import shutil


def _copy_split(split_df, dest_root):
    """Copy every image listed in ``split_df`` into ``dest_root/<label>/``.

    Parameters
    ----------
    split_df : DataFrame
        Must provide a 'path' column (source file) and a 'label' column.
    dest_root : str
        Split folder that already contains the class sub-folders '0' and '1'.

    The source file comes from the row's own 'path' column and the class folder
    from its own 'label' column, so the copy needs neither a hard-coded input
    directory nor a label lookup in another frame.
    """
    for src, label in zip(split_df['path'], split_df['label']):
        # Label 0 goes to folder '0'; anything else goes to folder '1'.
        class_dir = '0' if label == 0 else '1'
        dst = os.path.join(dest_root, class_dir, os.path.basename(src))
        shutil.copyfile(src, dst)


# Populate the train / validation / test folders from their data frames.
_copy_split(df_train, train_path)
_copy_split(df_val, valid_path)
_copy_split(df_test, test_path)

In [None]:
# Spot-check the copy: file counts for a few class folders, then the test folder's contents.
for class_folder in ('base_dir/train/0', 'base_dir/valid/1', 'base_dir/test/0'):
    print(len(os.listdir(class_folder)))
print(os.listdir('base_dir/test'))

In [None]:
# Colour statistics are computed on a sample of 500 images per class.
n=500
def read_img(n,df):
    """Load the first ``n`` images listed in ``df['path']`` as one RGB uint8 array.

    Parameters
    ----------
    n : int
        Number of images to read (rows 0 .. n-1 of ``df``, by position).
    df : DataFrame
        Must provide a 'path' column with image file paths.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 96, 96, 3) and dtype uint8, channels ordered R, G, B.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read one of the files.
    """
    x=np.zeros([n,96,96,3],dtype=np.uint8)
    for i in range(n):
        img_path=df['path'].iloc[i]
        img=cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError('cv2.imread could not read image: '+str(img_path))
        # OpenCV decodes to BGR; convert so channel 0 is red, 1 green, 2 blue.
        x[i]=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    return x

x_nor=read_img(n,df_0)
x_abnor=read_img(n,df_1)



In [None]:
# Per-channel intensity distributions for the normal and abnormal samples.

# One histogram bin per 8-bit intensity level (0 = darkest, 255 = brightest).
bright_bins = 256
fig, axs = plt.subplots(4, 2, sharey=True, figsize=(8, 8), dpi=150)

# Left column: normal samples; right column: abnormal samples.
for col, images in enumerate((x_nor, x_abnor)):
    # Rows 0-2: the last array axis selects the colour channel; flatten() turns
    # all pixels of that channel into one 1-D vector for the histogram.
    for channel in range(3):
        axs[channel, col].hist(images[:, :, :, channel].flatten(),
                               bins=bright_bins, density=True)
    # Row 3: all channels pooled together.
    axs[3, col].hist(images.flatten(), bins=bright_bins, density=True)

# NOTE(review): the row labels assume channel index 0/1/2 is R/G/B; confirm
# against the channel order in which the images were loaded.
for row, channel_label in enumerate(("Red", "Green", "Blue", "RGB")):
    axs[row, 0].set_ylabel(channel_label, horizontalalignment='left',
                           labelpad=15, fontsize=12)

for col, column_title in enumerate(('normal', 'abnormal')):
    axs[0, col].set_title(column_title)

plt.savefig('channel.png', dpi=300)

将抽样图像的 RGB 三个通道分开统计后可以看出，normal 与 abnormal 图像的亮度分布有所不同。
在红色（R）通道上，normal 图像的亮度较高，主要集中在 230–250 之间；abnormal 图像的亮度则集中在 200 附近。
G、B 通道的情况与此类似。至于部分图片的亮度达到 255，可能是实验人员制作切片时留下的白色光斑所致。

In [None]:
# Distribution of the per-image mean brightness (averaged over height, width
# and channel) for each class, drawn side by side.
fig, axs = plt.subplots(1, 2, figsize=(8, 2), dpi=150)
for ax, (class_title, images) in zip(axs, (('normal', x_nor), ('abnormal', x_abnor))):
    ax.hist(images.mean(axis=(1, 2, 3)), bins=64, density=True)
    ax.set_title(class_title)
plt.savefig('averange_bright.png', dpi=300)

In [None]:
import gc

# The sample arrays are no longer needed: drop both names and run a collection
# pass so the memory can be reclaimed before training.
del x_nor, x_abnor
gc.collect()

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Side length (pixels) the generators resize every image to.
IMAGE_SIZE = 96

num_train_samples = len(df_train)
num_val_samples = len(df_val)
num_test_samples = len(df_test)

train_batch_size = 32
val_batch_size = 32

# Batches per epoch, rounded up so the last partial batch is included.
# Cast to int: np.ceil returns a float, but a step count is a whole number.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

print('train_steps',train_steps)
print('val_steps',val_steps)

# Training pipeline: scale pixels to [0, 1] and augment with random flips.
datagen = ImageDataGenerator(rescale=1/255,
                             horizontal_flip=True,
                             vertical_flip=True)

# Evaluation pipeline: same scaling but NO random augmentation, so validation
# and test metrics are computed on unmodified images and are repeatable.
eval_datagen = ImageDataGenerator(rescale=1/255)

train_gen = datagen.flow_from_directory(directory=train_path,
                                        target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                        batch_size=train_batch_size,
                                        class_mode='binary')

val_gen = eval_datagen.flow_from_directory(valid_path,
                                           target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                           batch_size=val_batch_size,
                                           class_mode='binary')

# batch_size=1 and shuffle=False: one image per step, in directory order.
test_gen = eval_datagen.flow_from_directory(test_path,
                                            batch_size=1,
                                            target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                            class_mode='binary',shuffle=False)

In [None]:
# Hyper-parameters for the CNN.

# Convolution kernel size and max-pooling window.
kernel_size = (3, 3)
pool_size = (2, 2)

# Number of filters in the first, second and third convolution stage.
first_filters, second_filters, third_filters = 32, 64, 128

# Values handed to the Dropout layers: one for the convolution stages and one
# for the fully connected part.
dropout_conv = 0.3
dropout_dense = 0.3



In [None]:
model = Sequential()

# Three convolution stages (first_filters, second_filters, third_filters).
# Each stage is: 2 x (Conv2D with ReLU -> BatchNormalization), MaxPool2D, Dropout.
for stage_idx, n_filters in enumerate((first_filters, second_filters, third_filters)):
    for conv_idx in range(2):
        if stage_idx == 0 and conv_idx == 0:
            # The very first layer declares the input shape (96x96 images with
            # 3 channels) and is the only convolution that keeps its bias term.
            conv_layer = Conv2D(n_filters, kernel_size,
                                input_shape=(96, 96, 3), activation='relu')
        else:
            conv_layer = Conv2D(n_filters, kernel_size,
                                use_bias=False, activation='relu')
        model.add(conv_layer)
        model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=pool_size))
    model.add(Dropout(dropout_conv))

# Classifier head: flatten, one hidden dense layer, then a single sigmoid unit.
model.add(Flatten())
model.add(Dense(256, use_bias=False, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(dropout_dense))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Compile the model: Adam optimizer created with 0.001, binary cross-entropy
# loss, and accuracy reported as the metric.
model.compile(
    optimizer=Adam(0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Early stopping on val_loss with patience=3; restore_best_weights=True asks
# Keras to put the best weights seen back on the model when training stops.
earlystopper = EarlyStopping(monitor='val_loss',
                             patience=3,
                             restore_best_weights=True)

# Reduce the learning rate when val_loss plateaus (patience=2); verbose=1 logs
# each reduction. Other arguments (factor, mode, cooldown, min_lr, ...) are left
# at the library defaults.
reducel = ReduceLROnPlateau(monitor='val_loss', patience=2, verbose=1)

# Train from the generators for at most 13 epochs.
# steps_per_epoch: number of generator batches that make up one epoch.
# validation_steps: number of batches drawn from val_gen for each validation pass.
history = model.fit_generator(
    train_gen,
    steps_per_epoch=train_steps,
    epochs=13,
    validation_data=val_gen,
    validation_steps=val_steps,
    callbacks=[reducel, earlystopper],
)



In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Per-epoch training history recorded by Keras.
acc = history.history['accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
val_accuracy = history.history['val_accuracy']

# Epoch numbers start at 1 on the x axis.
epoch_axis = np.arange(1, len(acc) + 1)

# One figure per metric: (title, y label, (train curve, validation curve),
# legend entries, output file, output dpi).
history_panels = (
    ('accuracy', 'Accuracy', (acc, val_accuracy), ['train_a', 'val_a'], 'acc.png', 100),
    ('loss', 'Loss', (loss, val_loss), ['train_loss', 'val_loss'], 'loss.png', 300),
)
for panel_title, y_label, curves, legend_entries, out_file, out_dpi in history_panels:
    for curve in curves:
        sns.lineplot(x=epoch_axis, y=curve, marker='o')
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend(legend_entries)
    plt.savefig(out_file, dpi=out_dpi)
    plt.show()

In [None]:
from keras.utils import plot_model
# Write a diagram of the model's layers to model.png at 200 dpi.
# NOTE(review): plot_model presumably needs pydot and graphviz installed; confirm in the target environment.
plot_model(model, to_file='model.png',dpi=200)

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

# Predict on the test set.
# Rewind the iterator first so the predictions start at the first file and stay
# aligned with test_gen.classes even if the generator was consumed before.
test_gen.reset()
# len(test_gen) is the number of batches the iterator yields for one full pass,
# so exactly every image found in the test folder is predicted once.
y_pred_keras = model.predict_generator(test_gen,
                                       steps=len(test_gen),
                                       verbose=1)

# ROC curve and AUC from the true classes and the predicted scores; ravel()
# passes the scores to roc_curve as a 1-D vector.
fpr_keras, tpr_keras, thresholds_keras = roc_curve(test_gen.classes, y_pred_keras.ravel())
auc_keras = auc(fpr_keras, tpr_keras)
auc_keras


In [None]:
# ROC curve of the classifier, with the diagonal chance line for reference.
roc_fig = plt.figure(1)
roc_ax = roc_fig.gca()
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.plot(fpr_keras, tpr_keras, label='area = {:.3f}'.format(auc_keras))
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC curve')
roc_ax.legend(loc='best')

roc_fig.savefig('roc.png', dpi=300)
plt.show()

In [None]:
# Turn the predicted scores into hard 0/1 labels at a 0.5 threshold.
# The result goes into a new array instead of overwriting y_pred_keras, so the
# scores stay available (e.g. for re-running the ROC cell) and this cell can be
# re-run safely.
a = (y_pred_keras > 0.5).astype(y_pred_keras.dtype).flatten()
# Number of test images assigned to each predicted class.
pd.Series(a).value_counts()

In [None]:

from sklearn import metrics as ms


# Precision and recall of the thresholded predictions `a`, printed as percentages.
true_classes = test_gen.classes

precision = 100 * ms.precision_score(true_classes, a)
print('precison:%.2f%%' % precision)

recall = 100 * ms.recall_score(true_classes, a)
print('recall:%.2f%%' % recall)


