In [None]:
# Seed NumPy's global random number generator before anything else draws from it.
from numpy.random import seed
seed(101)
# Seed TensorFlow's random number generator with the same value.
# NOTE(review): `set_random_seed` is imported from the top-level `tensorflow`
# package, which presumably implies a TF 1.x runtime -- confirm the kernel version.
from tensorflow import set_random_seed
set_random_seed(101)

import pandas as pd
import numpy as np


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

import os
import cv2
import cv2 as cv
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Side length (height and width) passed as target_size to the image generators.
IMAGE_SIZE = 96
# Number of channels per image.
IMAGE_CHANNELS = 3

SAMPLE_SIZE = 70000 # the number of images we use from each of the two classes

In [None]:
# Show what is available in the input directory.
os.listdir('../input')

In [None]:
# Load the training labels and drop two specific image ids.
# Boolean indexing returns a NEW frame, so the filtered result has to be
# assigned back to df_data; otherwise the rows are silently kept.
df_data = pd.read_csv('../input/train_labels.csv')

# NOTE(review): the reason these two ids are excluded is not recorded in this
# notebook -- presumably unusable images; confirm.
EXCLUDED_IDS = [
    'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2',
    '9369c7278ec8bcc6c880d99194de09fc2bd4efbe',
]
df_data = df_data[~df_data['id'].isin(EXCLUDED_IDS)]
print(df_data.shape)

In [None]:
# Class distribution of the full label file.
df_data['label'].value_counts()

In [None]:
# Balance the data: draw SAMPLE_SIZE rows per label with a fixed seed, stack
# them into a single frame with a fresh 0..n-1 index, then shuffle the rows.
df_0 = df_data.loc[df_data['label'] == 0].sample(n=SAMPLE_SIZE, random_state=101)
df_1 = df_data.loc[df_data['label'] == 1].sample(n=SAMPLE_SIZE, random_state=101)
df_data = shuffle(pd.concat([df_0, df_1], axis=0, ignore_index=True))

# Both classes should now have the same count.
df_data['label'].value_counts()

In [None]:
# Preview the first ten rows of df_data as a 2x5 grid of images titled with their label.
plt.figure(figsize=(12, 6))
for i in range(10):
    name = df_data.iloc[i, 0]
    # The image is loaded with OpenCV and converted BGR -> RGB before plotting.
    im = cv.cvtColor(cv.imread("../input/train/" + name + ".tif"), cv.COLOR_BGR2RGB)
    label_value = df_data.loc[df_data['id'] == name, 'label'].values[0]
    plt.subplot(2, 5, i + 1)
    plt.imshow(im)
    plt.title("label " + str(label_value))

In [None]:
# Compare contrast enhancement on a single grayscale image:
# original vs. global histogram equalization vs. CLAHE.
# Top row: the images. Bottom row: their intensity histograms.
im2 = cv.imread("../input/train/c8402ec2db5ab23baaaaebab8580979e40dc6586.tif",
                cv.IMREAD_GRAYSCALE)

# Global histogram equalization.
im2_eq = cv.equalizeHist(im2)

# Adaptive histogram equalization (CLAHE).
clahe = cv.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
im2_ad = clahe.apply(im2)

plt.figure(figsize=(30, 20))
panels = [('origin image', im2), ('equalizeHist', im2_eq), ('CLAHE', im2_ad)]
for col, (panel_title, panel_img) in enumerate(panels, start=1):
    plt.subplot(2, 3, col)
    plt.imshow(panel_img, cmap='gray')
    plt.title(panel_title, fontsize=30)

    plt.subplot(2, 3, col + 3)
    # The upper bound of calcHist's range is exclusive, so [0, 256] is required
    # to get 256 unit-width bins that include intensity 255.
    hist = cv.calcHist([panel_img], [0], None, [256], [0, 256])
    plt.plot(hist, 'r')
plt.show()

In [None]:
def change_img(im, clip_limit=2.0, tile_grid_size=(8, 8)):
    """Apply CLAHE to the lightness channel of a BGR image.

    The image is converted from BGR to Lab, CLAHE is applied to the L channel
    only (the a and b channels pass through unchanged), and the result is
    converted back to BGR.

    Args:
        im: image array in BGR channel order, e.g. the result of ``cv.imread``.
        clip_limit: value passed as ``clipLimit`` to ``cv.createCLAHE``.
        tile_grid_size: value passed as ``tileGridSize`` to ``cv.createCLAHE``.

    Returns:
        The contrast-enhanced image, in BGR channel order.

    Raises:
        ValueError: if ``im`` is None (``cv.imread`` yields None when it cannot
            read a file), so the failure is reported clearly instead of as an
            OpenCV assertion deep inside ``cvtColor``.
    """
    # Guard first, before any OpenCV call, so a bad input fails fast.
    if im is None:
        raise ValueError("change_img received None; the image could not be read")

    clahe = cv.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    lightness, a, b = cv.split(cv.cvtColor(im, cv.COLOR_BGR2Lab))
    enhanced_lab = cv.merge([clahe.apply(lightness), a, b])
    return cv.cvtColor(enhanced_lab, cv.COLOR_Lab2BGR)

In [None]:
# Hold out 10% of the balanced data for validation, stratified on the label.
y = df_data['label']

df_train, df_val = train_test_split(
    df_data,
    test_size=0.10,
    random_state=101,
    stratify=y,
)

# Shapes of the training and validation frames, one per line.
print(df_train.shape, df_val.shape, sep='\n')

In [None]:
# Class distribution of the training split.
df_train['label'].value_counts()

In [None]:
# Build the directory tree read later by flow_from_directory:
#   base_dir/{train_dir,val_dir}/{a_no_tumor_tissue,b_has_tumor_tissue}
# os.makedirs(..., exist_ok=True) creates missing parents and lets this cell be
# re-run without raising FileExistsError when the folders are already there.
base_dir = 'base_dir'
train_dir = os.path.join(base_dir, 'train_dir')
val_dir = os.path.join(base_dir, 'val_dir')

for split_dir in (train_dir, val_dir):
    no_tumor_tissue = os.path.join(split_dir, 'a_no_tumor_tissue')
    has_tumor_tissue = os.path.join(split_dir, 'b_has_tumor_tissue')
    os.makedirs(no_tumor_tissue, exist_ok=True)
    os.makedirs(has_tumor_tissue, exist_ok=True)

In [None]:
# Show the class folders created under the training directory.
os.listdir('base_dir/train_dir')

In [None]:
# Index df_data by image id so labels can be looked up with df_data.loc[<id>, 'label'].
# Guarded so that re-running the cell does not raise a KeyError once 'id' has
# already been moved into the index.
if 'id' in df_data.columns:
    df_data = df_data.set_index('id')

In [None]:
# Write the train and validation splits to disk, one sub-folder per class,
# with change_img applied to every image.
train_list = list(df_train['id'])
val_list = list(df_val['id'])

# Class sub-folder for each label value.
CLASS_FOLDERS = {0: 'a_no_tumor_tissue', 1: 'b_has_tumor_tissue'}


def _export_split(df_split, split_dir):
    """Write the enhanced image of every row of df_split to split_dir/<class folder>/<id>.tif.

    Labels are read from df_split's own 'label' column, so this does not depend
    on how df_data is indexed. Each source file is read once and the processed
    result is written straight to its destination (no intermediate raw copy).

    Raises:
        KeyError: if a label is not one of the keys of CLASS_FOLDERS.
        ValueError: if a source image cannot be read.
        IOError: if a processed image cannot be written.
    """
    for image, target in zip(df_split['id'], df_split['label']):
        fname = image + '.tif'
        label = CLASS_FOLDERS[int(target)]
        src = os.path.join('../input/train', fname)
        dst = os.path.join(split_dir, label, fname)

        im = cv.imread(src)
        if im is None:
            raise ValueError('could not read image: ' + src)
        if not cv.imwrite(dst, change_img(im)):
            raise IOError('could not write image: ' + dst)


_export_split(df_train, train_dir)
_export_split(df_val, val_dir)
    

In [None]:
# Number of files in each training class folder, one count per line.
for class_path in ('base_dir/train_dir/a_no_tumor_tissue',
                   'base_dir/train_dir/b_has_tumor_tissue'):
    print(len(os.listdir(class_path)))

In [None]:
# Directories consumed by the image generators.
train_path = 'base_dir/train_dir'
valid_path = 'base_dir/val_dir'
test_path = '../input/test'

num_train_samples = len(df_train)
num_val_samples = len(df_val)
train_batch_size = 10
val_batch_size = 10

# Batches needed to see every sample once (the last batch may be partial).
# np.ceil returns a float; step counts are whole numbers, so cast to int.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

In [None]:
# The only transformation is a rescale: every pixel value is multiplied by 1/255.
datagen = ImageDataGenerator(rescale=1.0/255)

# Arguments shared by all three directory iterators.
flow_kwargs = dict(target_size=(IMAGE_SIZE, IMAGE_SIZE), class_mode='categorical')

train_gen = datagen.flow_from_directory(train_path,
                                        batch_size=train_batch_size,
                                        **flow_kwargs)

val_gen = datagen.flow_from_directory(valid_path,
                                      batch_size=val_batch_size,
                                      **flow_kwargs)

# Reads the validation folder again, one image per batch and without shuffling.
test_gen = datagen.flow_from_directory(valid_path,
                                       batch_size=1,
                                       shuffle=False,
                                       **flow_kwargs)

In [None]:
# Architecture hyper-parameters.
kernel_size = (3,3)
pool_size= (2,2)
first_filters = 32
second_filters = 64
third_filters = 128

dropout_conv = 0.3
dropout_dense = 0.3

# Three blocks of three 3x3 convolutions (32 / 64 / 128 filters), each block
# followed by 2x2 max-pooling and dropout, then Flatten -> Dense(256) ->
# dropout -> 2-way softmax.
model = Sequential()

# The input shape is derived from the notebook constants so it stays in sync
# with the target_size used by the image generators.
model.add(Conv2D(first_filters, kernel_size, activation='relu',
                 input_shape=(IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNELS)))
model.add(Conv2D(first_filters, kernel_size, activation='relu'))
model.add(Conv2D(first_filters, kernel_size, activation='relu'))
model.add(MaxPooling2D(pool_size=pool_size))
model.add(Dropout(dropout_conv))

model.add(Conv2D(second_filters, kernel_size, activation='relu'))
model.add(Conv2D(second_filters, kernel_size, activation='relu'))
model.add(Conv2D(second_filters, kernel_size, activation='relu'))
model.add(MaxPooling2D(pool_size=pool_size))
model.add(Dropout(dropout_conv))

model.add(Conv2D(third_filters, kernel_size, activation='relu'))
model.add(Conv2D(third_filters, kernel_size, activation='relu'))
model.add(Conv2D(third_filters, kernel_size, activation='relu'))
model.add(MaxPooling2D(pool_size=pool_size))
model.add(Dropout(dropout_conv))

model.add(Flatten())
model.add(Dense(256, activation="relu"))
model.add(Dropout(dropout_dense))
model.add(Dense(2, activation="softmax"))

model.summary()

In [None]:
# Adam optimizer with learning rate 1e-4; accuracy is tracked during training.
model.compile(
    optimizer=Adam(lr=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
filepath = "model.h5"

# Save to `filepath` only when the monitored validation accuracy improves.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Multiply the learning rate by 0.5 when validation accuracy plateaus
# (patience=2), never going below 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_acc',
    mode='max',
    factor=0.5,
    patience=2,
    min_lr=0.00001,
    verbose=1,
)

callbacks_list = [checkpoint, reduce_lr]

history = model.fit_generator(
    train_gen,
    steps_per_epoch=train_steps,
    epochs=8,
    validation_data=val_gen,
    validation_steps=val_steps,
    callbacks=callbacks_list,
    verbose=1,
)

In [None]:
# Reload the checkpoint written during training and score it with test_gen,
# running one step per row of df_val.
model.load_weights('model.h5')

val_loss, val_acc = model.evaluate_generator(test_gen, steps=len(df_val))

print('val_loss:', val_loss)
print('val_acc:', val_acc)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded in the training history.
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# Each plot gets its own explicitly created figure; no figure is opened after
# the last plot, so no empty figure is rendered at the end of the cell.
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.figure()
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
# Predict with test_gen, running one step per row of df_val.
predictions = model.predict_generator(test_gen, steps=len(df_val), verbose=1)

In [None]:
# Inspect the shape of the prediction array.
predictions.shape

In [None]:
# Wrap the two softmax outputs in a frame with one named column per class.
# NOTE(review): the column order assumes class index 0 = a_no_tumor_tissue and
# 1 = b_has_tumor_tissue -- confirm against test_gen.class_indices.
df_preds = pd.DataFrame(predictions, columns=['no_tumor_tissue', 'has_tumor_tissue'])

df_preds.head()

In [None]:
# Class indices reported by the generator, and the predicted score of the
# 'has_tumor_tissue' column.
y_true = test_gen.classes
y_pred = df_preds['has_tumor_tissue']

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the 'has_tumor_tissue' scores against the generator's class indices.
roc_auc_score(y_true, y_pred)

In [None]:
# Delete the train/validation image tree created earlier in the notebook.
shutil.rmtree('base_dir')

In [None]:
# Folder layout for the test images: test_dir/test_images.
# os.makedirs(..., exist_ok=True) creates both levels and lets the cell be
# re-run without raising FileExistsError.
test_dir = 'test_dir'
test_images = os.path.join(test_dir, 'test_images')
os.makedirs(test_images, exist_ok=True)

In [None]:
# Show the contents of the test directory.
os.listdir('test_dir')

In [None]:
# Write a change_img-processed version of every test image into test_images.
# Each source file is read once and the processed result is written straight
# to its destination (no intermediate raw copy that is then overwritten).
test_list = os.listdir('../input/test')

for fname in test_list:
    src = os.path.join('../input/test', fname)
    dst = os.path.join(test_images, fname)

    im = cv.imread(src)
    if im is None:
        raise ValueError('could not read image: ' + src)
    if not cv.imwrite(dst, change_img(im)):
        raise IOError('could not write image: ' + dst)

In [None]:
# Number of files written to the test image folder.
len(os.listdir('test_dir/test_images'))

In [None]:
# Re-point test_gen at the processed test images: one image per batch, no
# shuffling.
test_path = 'test_dir'
test_gen = datagen.flow_from_directory(
    test_path,
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    class_mode='categorical',
    batch_size=1,
    shuffle=False,
)

In [None]:
# test_gen yields one image per batch, so the number of prediction steps must
# equal the number of files the generator found. Deriving it from the generator
# keeps predictions aligned with test_gen.filenames even if the test folder changes.
num_test_images = len(test_gen.filenames)
model.load_weights('model.h5')

predictions = model.predict_generator(test_gen, steps=num_test_images, verbose=1)

In [None]:
# Wrap the test-set softmax outputs in a frame with one named column per class.
# NOTE(review): the column order assumes the model's output index 0 corresponds
# to 'no tumor' and 1 to 'has tumor' -- confirm against the training generator's
# class_indices.
df_preds = pd.DataFrame(predictions, columns=['no_tumor_tissue', 'has_tumor_tissue'])

df_preds.head()

In [None]:
# Attach the generator's file names to the predictions; test_gen was created
# with shuffle=False, so row order is presumably the same as prediction order.
test_filenames = test_gen.filenames
df_preds['file_names'] = test_filenames
df_preds.head()

In [None]:
def extract_id(x):
    """Return the image id for a generator file name.

    The id is the file's base name without its extension, e.g.
    'test_images/abc123.tif' -> 'abc123'. Using os.path instead of manual
    splitting means the result does not depend on how many directory levels
    precede the file name (including none).

    Args:
        x: relative or absolute path of an image file, as a string.

    Returns:
        The file name with directories and the final extension removed.
    """
    extracted_id = os.path.splitext(os.path.basename(x))[0]

    return extracted_id

# Derive the image id of every prediction row from its file name.
df_preds['id'] = df_preds['file_names'].apply(extract_id)

df_preds.head()

In [None]:
# Columns used for the submission: the 'has_tumor_tissue' score and the image id.
y_pred = df_preds['has_tumor_tissue']
image_id = df_preds['id']

In [None]:
# Build the submission table (index: id, single column: label) and write it to CSV.
submission_columns = {'id': image_id, 'label': y_pred}
submission = pd.DataFrame(submission_columns).set_index('id')

submission.to_csv('patch_preds.csv', columns=['label'])

In [None]:
# Preview the first rows of the submission table.
submission.head()

In [None]:
# Delete the processed test image tree created earlier in the notebook.
shutil.rmtree('test_dir')