In [None]:
%%html
<marquee style='width: 100%; color: red;'><H1>prostate-cancer-grade-assessment</H1></marquee>

# Sommaire
1. Objectifs
2. Comprendre la base de données
   * Comprendre la base de données
3. Préparation de la base de données
 * Visualisation de données
 * Fixer quelques problèmes dans la base de données
    * Images sans masque
    * ISUP = 2 Gleason score = 4 + 3 
    * remplacer "négatif" par "0+0"
    * Quelques problèmes dans la base de données

# 1. Objectifs
Détecter et classer la gravité du cancer de la prostate sur des images d'échantillons de tissus prostatiques.

En pratique, les échantillons de tissus sont examinés et notés par les pathologistes selon le système de notation dit de Gleason, qui est ensuite converti en grade ISUP.

<img src="https://storage.googleapis.com/kaggle-media/competitions/PANDA/Screen%20Shot%202020-04-08%20at%202.03.53%20PM.png" height="100px">

# 2.Comprendre la base de données


train.csv et test.csv:

* image_id: Code d'identification de l'image.

* data_provider: Le nom de l'institution qui a fourni les données. L'Institut **Karolinska** et le Centre médical universitaire **Radboud** 



*   uniquement dans train.csv

* isup_grade: La gravité du cancer sur une échelle de 0 à 5.

* gleason_score: Un système alternatif d'évaluation de la gravité du cancer avec plus de niveaux que l'échelle ISUP. 

* train_images:
  * 10616 images de type .tiff
    * Karolinska = 5456 images
    * Radboud = 5160 images
* test_images:
  * 3 images de type .tiff

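train_label_masks: Segmentation masks showing which parts of the image led to the ISUP grade. Not all training images have label masks, and there may be false positives or false negatives in the label masks for a variety of reasons. These masks are provided to assist with the development of strategies for selecting the most useful subsamples of the images. The mask values depend on the data provider:

* Radboud: 0 = background (non tissue) or unknown, 1 = stroma (connective tissue, non-epithelium tissue), 2 = healthy (benign) epithelium, 3 = cancerous epithelium (Gleason 3), 4 = cancerous epithelium (Gleason 4), 5 = cancerous epithelium (Gleason 5).
* Karolinska: 0 = background (non tissue) or unknown, 1 = benign tissue (stroma and epithelium combined), 2 = cancerous tissue (stroma and epithelium combined).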

# 3.Préparation de la base de données

## Visualisation de données

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import openslide
import os
import cv2
from PIL import Image
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import VGG16,preprocess_input
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Input, Flatten,BatchNormalization,Activation
from keras.layers import GlobalMaxPooling2D,GlobalAveragePooling2D
from keras.models import Model
from keras.optimizers import Adam, SGD, RMSprop
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras.callbacks.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import cohen_kappa_score
import tensorflow as tf
from keras.callbacks import LearningRateScheduler
from keras.metrics import *

# Root of the competition dataset; every other path below is derived from it.
PATH = "../input/prostate-cancer-grade-assessment/"

# Image / mask directories. `image_path` keeps its trailing slash because
# predict_isup_grade builds file names by plain string concatenation.
train_img_path = os.path.join(PATH, 'train_images')
image_path = train_img_path + "/"
masks = os.path.join(PATH, 'train_label_masks')

# Parse each CSV exactly once (train.csv used to be read three times).
train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))
# Independent copy kept under its historical name for backward compatibility.
train_read_img = train_df.copy()

# File listings used later to find which slides have a label mask.
images_train_list = os.listdir(train_img_path)
masks_list = os.listdir(masks)
sns.set_style("darkgrid")

In [None]:
DEVICE = "TPU"

In [None]:
# Pick the tf.distribute strategy that matches DEVICE.
# `strategy` used to be assigned only when DEVICE != "TPU", so with
# DEVICE = "TPU" the `strategy.num_replicas_in_sync` line raised NameError.
if DEVICE == "TPU":
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # Newer TensorFlow exposes TPUStrategy directly; older 2.x releases
        # only have the experimental alias.
        tpu_strategy_cls = getattr(tf.distribute, "TPUStrategy", None)
        if tpu_strategy_cls is None:
            tpu_strategy_cls = tf.distribute.experimental.TPUStrategy
        strategy = tpu_strategy_cls(tpu)
        print("Running on TPU")
    except (ValueError, RuntimeError) as err:
        # No TPU is reachable from this session: fall back instead of crashing.
        print("Could not initialise a TPU ({}); falling back to GPU/CPU".format(err))
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# NOTE(review): the models further down are built outside `strategy.scope()`,
# so `strategy` currently only scales the learning rate in `lrfn`.
print("REPLICAS: ", strategy.num_replicas_in_sync)
AUTO = tf.data.experimental.AUTOTUNE


In [None]:
print(train_df)

In [None]:
print(test_df)

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5))
sns.countplot(ax=ax1, x="data_provider", data=train_df)
ax1.set_title("distribution de data_provider  dans  training data")
sns.countplot(ax=ax2, x="data_provider", data=test_df)
ax2.set_title("distribution de data_provider dans test data")
plt.show()

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5))
sns.countplot(ax=ax1, x="isup_grade", data=train_df)
ax1.set_title("ISUP Grade distribution dans  Data Provider")
sns.countplot(ax=ax2, x="gleason_score", data=train_df)
ax2.set_title("Gleason_Score distribution dans  Data Provider")
plt.show()

In [None]:
from tqdm import tqdm

# Collect the level-0 (width, height) of every training slide.
img_dim = []

# Iterate over the ids themselves: the previous version used the index label
# returned by iterrows() as a *position* for .iloc, which only works while
# train_df still has its default RangeIndex.
for image_id in tqdm(train_df['image_id'], total=len(train_df)):
    slide = openslide.OpenSlide(os.path.join(train_img_path, image_id + '.tiff'))
    try:
        img_dim.append(slide.dimensions)
    finally:
        # Always release the file handle, even if reading the header fails.
        slide.close()

width = [dimensions[0] for dimensions in img_dim]
height = [dimensions[1] for dimensions in img_dim]

train_df['width'] = width
train_df['height'] = height

In [None]:
fig = plt.figure(figsize=(20,5))
ax = sns.scatterplot(x='width', y='height', data=train_df, hue='data_provider', alpha=0.70)
ax.tick_params(labelsize=10)

plt.title('Dimensions des images')
plt.show()

In [None]:
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(20, 5)

# Columns are passed by keyword: recent seaborn releases no longer accept the
# x / y data as positional arguments.
sns.stripplot(x='width', y='data_provider', data=train_df, ax=ax[0], jitter=True)
sns.stripplot(x='height', y='data_provider', data=train_df, ax=ax[1], jitter=True)

for axis in ax:
    axis.tick_params(labelsize=10)
    axis.tick_params(labelrotation=90)
plt.show()

In [None]:
data_file_masks = pd.Series(masks_list).to_frame()
data_file_masks.columns = ['mask_file_name']
data_file_masks.head()

In [None]:
data_file_masks['image_id'] =data_file_masks.mask_file_name.apply(lambda x: x.split('_')[0])
data_file_masks.head()

In [None]:
# Left join: keep exactly the rows of train.csv and attach the mask file name
# when one exists. An outer join would also add a row (with no label) for any
# mask file whose id is not a training slide.
train_df = pd.merge(train_df, data_file_masks, on='image_id', how='left')
train_df.head()

## Fixer quelques problèmes dans la base de données

# Images sans masque
Il y a des images sans masque dans la base de données.

In [None]:
del data_file_masks
print(f"Il y a {len(train_df[train_df.mask_file_name.isna()])} images sans masque.")

In [None]:
print(f"Train data avant la réduction: {len(train_df)}")
# .copy() makes the reduced frame independent of train_df, so the column
# assignments done on it later do not trigger SettingWithCopyWarning.
df_train_reduction = train_df[train_df.mask_file_name.notna()].copy()
print(f"Train data après la réduction: {len(df_train_reduction)}")

In [None]:
fig,ax=plt.subplots(1,2,figsize=(20,5))
train_df['data_provider'].value_counts().plot.pie(autopct='%1.1f%%',ax=ax[0])
ax[0].set_ylabel('')
df_train_reduction['data_provider'].value_counts().plot.pie(autopct='%1.1f%%',ax=ax[1])
ax[1].set_ylabel('')
plt.show()

* Le jeu de test ne contient que 3 images ; je vais donc créer un autre fichier new_test.csv avec les 100 images que j'ai supprimées (images sans masque).

In [None]:
"""
images_without_masks=train_df[train_df.mask_file_name.isna()]
without_masks=images_without_masks.groupby('image_id').data_provider.unique().to_frame()
without_masks.to_csv("new_test.csv",index=False)
without_masks
"""

inspiré de : [Links](https://medium.com/@kvnamipara/a-better-visualisation-of-pie-charts-by-matplotlib-935b7667d77f)

* ISUP grade = 0 : Gleason score 0+0 or negative.
* ISUP grade = 1 : Gleason score 3+3.
* ISUP grade = 2 : Gleason score 3+4.
* ISUP grade = 3 : Gleason score 4+3.
* ISUP grade = 4 : Gleason score 4+4 (majority), 3+5 or 5+3.
* ISUP grade = 5 : Gleason score 4+5 (majority), 5+4 or 5+5.

In [None]:
df_train_reduction.groupby('isup_grade').gleason_score.unique().to_frame()

# ISUP = 2 Gleason score = 4 + 3 
**La combinaison ISUP = 2 / Gleason score = 4+3 n'existe pas dans le système de notation de Gleason ; il n'y a qu'une seule image de ce type et elle semble être une erreur, je vais donc la supprimer.**

In [None]:
df_train_reduction[(df_train_reduction.isup_grade == 2) & (df_train_reduction.gleason_score == '4+3')].reset_index()

In [None]:
# Remove the single slide labelled ISUP 2 with Gleason 4+3, then renumber the
# rows. `drop=True` avoids adding a stray 'index' column, and building a new
# frame (instead of an in-place reset) keeps this cell safe to re-run.
df_train_reduction = (
    df_train_reduction[df_train_reduction.image_id != 'b0a92a74cb53899311acc30b7405e101']
    .reset_index(drop=True)
)

In [None]:
df_train_reduction[(df_train_reduction.isup_grade == 2) & (df_train_reduction.gleason_score == '4+3')].reset_index()

In [None]:
df_train_reduction.groupby('isup_grade').gleason_score.unique().to_frame()

In [None]:
temp = df_train_reduction.groupby('isup_grade').count()['image_id'].reset_index().sort_values(by='image_id',ascending=False)
temp.style.background_gradient(cmap='Purples')

In [None]:
temp = df_train_reduction.groupby('gleason_score').count()['image_id'].reset_index().sort_values(by='image_id',ascending=False)
temp.style.background_gradient(cmap='Reds')

# remplacer "negative" par "0+0"

In [None]:
df_train_reduction[(df_train_reduction.isup_grade == 0) & (df_train_reduction.gleason_score =='negative')].reset_index()

In [None]:
sns.set_style("darkgrid")
fig= plt.subplots(figsize=(20,5))
sns.countplot(x='gleason_score', hue="data_provider", data=df_train_reduction)
plt.show()

*   Nous pouvons voir que Radboud n'a pas de valeurs "0+0" alors que Karolinska n'a pas de valeurs "negative".
*   Conclusion : "negative" correspond à la façon dont Radboud représente "0+0" (c'est-à-dire l'absence de cancer) ; il est donc plus logique de remplacer "negative" par "0+0".

In [None]:
# Build a new frame rather than assigning into a column of what may be a
# slice of train_df (the original assignment triggers SettingWithCopyWarning).
df_train_reduction = df_train_reduction.assign(
    gleason_score=df_train_reduction["gleason_score"].replace("negative", "0+0")
)

In [None]:
df_train_reduction.groupby('isup_grade').gleason_score.unique().to_frame()

In [None]:
temp = df_train_reduction.groupby('gleason_score').count()['image_id'].reset_index().sort_values(by='image_id',ascending=False)
temp.style.background_gradient(cmap='Reds')

# Affichage de quelques images

In [None]:
def show_images(df, read_region=(1780,1950)):
    """Display a 256x256 level-0 patch of up to nine slides in a 3x3 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``image_id``, ``data_provider``,
        ``isup_grade`` and ``gleason_score``. Only the first nine rows are
        drawn, because the grid has nine panels.
    read_region : tuple of int
        Location handed to ``OpenSlide.read_region`` as the corner of the patch.
    """
    n_rows, n_cols = 3, 3
    f, ax = plt.subplots(n_rows, n_cols, figsize=(20,20))
    # Switch every panel off first so unused ones stay blank.
    for panel in ax.ravel():
        panel.axis('off')

    # head() guards against IndexError when more than nine rows are passed.
    for i, (_, row) in enumerate(df.head(n_rows * n_cols).iterrows()):
        # Columns are read by name: positional access breaks as soon as the
        # frame gains or reorders columns (and is deprecated for Series).
        slide_path = os.path.join(PATH, "train_images", str(row['image_id']) + '.tiff')
        slide = openslide.OpenSlide(slide_path)
        try:
            patch = slide.read_region(read_region, 0, (256, 256))
        finally:
            slide.close()
        panel = ax[i // n_cols, i % n_cols]
        panel.imshow(patch)
        panel.set_title('ID: {}\nSource: {} ISUP: {} Gleason: {}'.format(
                row['image_id'], row['data_provider'], row['isup_grade'], row['gleason_score']))

    plt.show()
    
images = [
    '07a7ef0ba3bb0d6564a73f4f3e1c2293',
    '037504061b9fba71ef6e24c48c6df44d',
    '035b1edd3d1aeeffc77ce5d248a01a53',
    '059cbf902c5e42972587c8d17d49efed',
    '06a0cbd8fd6320ef1aa6f19342af2e68',
    '06eda4a6faca84e84a781fee2d5f47e1',
    '0a4b7a7499ed55c71033cefb0765e93d',
    '0838c82917cd9af681df249264d2769c',
    '046b35ae95374bfb48cdca8d7c83233f'
]
data_sample = train_df.loc[train_df.image_id.isin(images)]
show_images(data_sample)

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5))
sns.countplot(ax=ax1, x="isup_grade", data=df_train_reduction)
ax1.set_title("ISUP Grade Count by Data Provider")
sns.countplot(ax=ax2, x="gleason_score", data=df_train_reduction)
ax2.set_title("Gleason_Score Count by Data Provider")
plt.show()

# Affichage de quelques masques pour localiser le cancer et comprendre chaque grade de la maladie

In [None]:
def show_masks(slides): 
    """Plot the lowest-resolution label mask of each slide in a 5x3 grid.

    Parameters
    ----------
    slides : iterable of str
        Image ids (at most 15, one per panel). Each id must have a
        ``<id>_mask.tiff`` file in ``mask_dir`` and a row in
        ``data_sample_mask``; both are notebook-level names that have to be
        defined before this function is called.
    """
    f, ax = plt.subplots(5,3, figsize=(18,22))
    # The colour map does not depend on the slide, so build it once.
    cmap = matplotlib.colors.ListedColormap(['black', 'gray', 'green', 'yellow', 'orange', 'red'])
    for i, slide in enumerate(slides):
        panel = ax[i//3, i%3]
        mask = openslide.OpenSlide(os.path.join(mask_dir, f'{slide}_mask.tiff'))
        try:
            mask_data = mask.read_region((0,0), mask.level_count - 1, mask.level_dimensions[-1])
        finally:
            # Release the handle even when the read fails.
            mask.close()
        # Only the first channel of the mask image is displayed.
        panel.imshow(np.asarray(mask_data)[:,:,0], cmap=cmap, interpolation='nearest', vmin=0, vmax=5)
        panel.axis('off')
        data_provider = data_sample_mask.loc[slide, 'data_provider']
        isup_grade = data_sample_mask.loc[slide, 'isup_grade']
        gleason_score = data_sample_mask.loc[slide, 'gleason_score']
        panel.set_title(f"ID: {slide}\nSource: {data_provider} ISUP: {isup_grade} Gleason: {gleason_score}")

    # Lay the grid out once, after every panel has been drawn.
    f.tight_layout()
    plt.show()

In [None]:
images_mask  = [
    '07a7ef0ba3bb0d6564a73f4f3e1c2293',
    '037504061b9fba71ef6e24c48c6df44d',
    '035b1edd3d1aeeffc77ce5d248a01a53',
    '059cbf902c5e42972587c8d17d49efed',
    '06a0cbd8fd6320ef1aa6f19342af2e68',
    '06eda4a6faca84e84a781fee2d5f47e1',
    '0a4b7a7499ed55c71033cefb0765e93d',
    '0838c82917cd9af681df249264d2769c',
    '028098c36eb49a8c6aa6e76e365dd055',
    '0280f8b612771801229e2dde52371141',
    '028dc05d52d1dd336952a437f2852a0a',
    '02a2dcd6ad8bc1d9ad7fdc04ffb6dff3',
    '049031b0ea0dede1ca1e5ca470c1332d',
    '05f4e9415af9fdabc19109c980daf5ad',
    '07fd8d4f02f9b95d86da4bc89563e077'
]

mask_dir = os.path.join(PATH,"train_label_masks")
data_sample_mask = df_train_reduction.set_index('image_id')
show_masks(images_mask)

# Affichage de quelques images et leurs masques

In [None]:
def mask_img(image,max_size=(600,400)):
    """Show a slide thumbnail next to its label mask.

    Parameters
    ----------
    image : str
        Image id; ``<id>.tiff`` is read from ``train_img_path`` and
        ``<id>_mask.tiff`` from ``mask_dir``. The titles are filled from the
        notebook-level frame ``data_sample_mask``.
    max_size : tuple of int
        Size passed to ``get_thumbnail``. It used to be ignored in favour of a
        hard-coded (600, 400), which is still the default.
    """
    # Each slide is closed as soon as its pixels are read; previously neither
    # handle was ever released.
    slide = openslide.OpenSlide(os.path.join(train_img_path, f'{image}.tiff'))
    try:
        img = slide.get_thumbnail(size=max_size)
    finally:
        slide.close()

    mask = openslide.OpenSlide(os.path.join(mask_dir, f'{image}_mask.tiff'))
    try:
        mask_data = mask.read_region((0,0), mask.level_count - 1, mask.level_dimensions[-1])
    finally:
        mask.close()

    f,ax =  plt.subplots(1,2 ,figsize=(18,22))
    cmap = matplotlib.colors.ListedColormap(['black', 'gray', 'green', 'yellow', 'orange', 'red'])

    ax[0].imshow(img)
    # Only the first channel of the mask image is displayed.
    ax[1].imshow(np.asarray(mask_data)[:,:,0], cmap=cmap, interpolation='nearest', vmin=0, vmax=5) 

    image_id = image
    data_provider = data_sample_mask.loc[image, 'data_provider']
    isup_grade = data_sample_mask.loc[image, 'isup_grade']
    gleason_score = data_sample_mask.loc[image, 'gleason_score']
    ax[0].set_title(f"ID: {image_id}\nSource: {data_provider} ISUP: {isup_grade} Gleason: {gleason_score} IMAGE")
    ax[1].set_title(f"ID: {image_id}\nSource: {data_provider} ISUP: {isup_grade} Gleason: {gleason_score} IMAGE_MASK")

In [None]:
images1= [
    '08ab45297bfe652cc0397f4b37719ba1',
    '090a77c517a7a2caa23e443a77a78bc7',
    '07fd8d4f02f9b95d86da4bc89563e077'
]

for image in images1:
    mask_img(image)


panda-resized-train-data-512x512 , code source : [Links](https://www.kaggle.com/xhlulu/panda-resize-and-save-train-data)

In [None]:
# From here on, train_df refers to the cleaned frame.
train_df = df_train_reduction
# Filled with [model name, History] pairs by the training cells.
Accuracies_list = []

# Resized 512x512 PNG version of every slide, and its ISUP grade.
data_dir = '../input/panda-resized-train-data-512x512/train_images/train_images/'
ids = train_df['image_id']
grades = train_df['isup_grade']
n_samples = train_df.shape[0]
data = [data_dir + ids.iloc[pos] + '.png' for pos in range(n_samples)]
labels = [grades.iloc[pos] for pos in range(n_samples)]

df = pd.DataFrame(data)
df.columns = ['images']
df['isup_grade'] = labels

In [None]:
df.head()

In [None]:
print(len(df))

In [None]:
print(labels)

### diviser notre data set

In [None]:
# Stratify on the grade so the 10 % validation split keeps the same (imbalanced)
# ISUP distribution as the training split.
X_train, X_val, y_train, y_val = train_test_split(
    df['images'], df['isup_grade'],
    test_size=0.1, random_state=42, stratify=df['isup_grade'])

In [None]:
# flow_from_dataframe with class_mode='categorical' expects string labels,
# so the grades are stored as str in both frames.
train = X_train.to_frame()
train.columns = ['images']
train['isup_grade'] = y_train
train['isup_grade'] = train['isup_grade'].astype(str)

validation = X_val.to_frame()
validation.columns = ['images']
validation['isup_grade'] = y_val
validation['isup_grade'] = validation['isup_grade'].astype(str)

In [None]:
print("train size ",len(train))
print("validation size ",len(validation))

In [None]:
print(train)

In [None]:
print(validation)

### Après la division de notre jeu de données

In [None]:
sns.set(style="darkgrid")
# The second bar is the validation split (the `validation` frame), not a test set.
a = ['TRAIN DATA ', 'VALIDATION DATA ']
b = [len(train), len(validation)]
ax = sns.barplot(x=a, y=b)

In [None]:
# The grades are strings at this point, so seaborn would order the bars by
# first appearance and the two panels could disagree; fix the order explicitly.
grade_order = [str(grade) for grade in range(6)]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5))
sns.countplot(ax=ax1, x="isup_grade", data=train, order=grade_order)
ax1.set_title("distribution de Grade ISUP dans le TRAIN DATA après le divisiment")
sns.countplot(ax=ax2, x="isup_grade", data=validation, order=grade_order)
# This panel shows the validation split, not a test set.
ax2.set_title("distribution de Grade ISUP dans le VALIDATION DATA après le divisiment")
plt.show()

### data  augmentation

In [None]:
# Augmentation is applied to the training images only.
# featurewise_center / featurewise_std_normalization were dropped: they need
# train_datagen.fit(...) on sample data, which this notebook never calls, so
# they had no effect.
# NOTE(review): a shift range of 1.0 is read by Keras as 1 pixel, not 100 % of
# the image; confirm whether a fraction such as 0.1 was intended.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=45,
    zoom_range=[0.8, 1.2],
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[0.9, 1.1],
    width_shift_range=1.0,
    height_shift_range=1.0)

# Validation images are only rescaled. The previous chained assignment
# `val_datagen=train_datagen = ImageDataGenerator(rescale=1./255)` replaced the
# augmenting generator above, so training ran without any augmentation.
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train,
    x_col='images',
    y_col='isup_grade',
    target_size=(224, 224),
    batch_size=32,
    seed=2020,
    shuffle = True,
    class_mode='categorical')

# shuffle=False keeps validation batches in dataframe order, so predictions
# can be matched back to rows.
validation_generator = val_datagen.flow_from_dataframe(
    dataframe=validation,
    x_col='images',
    y_col='isup_grade',
    target_size=(224, 224),
    batch_size=32,
    seed=2020,
    shuffle=False,
    class_mode='categorical')

#### METRIC

In [None]:
# Metrics recorded during training; the names are the keys that show_history
# and the model-comparison cells look up in History.history.
METRICS = [
      TruePositives(name='tp'),
      FalsePositives(name='fp'),
      TrueNegatives(name='tn'),
      FalseNegatives(name='fn'), 
      # The models end in a 6-way softmax trained on one-hot labels, so
      # "accuracy" must compare the arg-max class. BinaryAccuracy scored each
      # of the six outputs separately and inflated the figure.
      CategoricalAccuracy(name='accuracy'),
      Precision(name='precision'),
      Recall(name='recall'),
      AUC(name='auc'),
]

In [None]:
"""
%load_ext tensorboard
logdir = "logs/scalars/"
"""

In [None]:
"""
import keras
def lr_schedule(epoch):
  
  learning_rate = 0.2
  if epoch > 10:
    learning_rate = 0.02
  if epoch > 20:
    learning_rate = 0.001
  if epoch > 50:
    learning_rate = 0.0005

  tf.summary.scalar('learning rate', data=learning_rate, step=epoch)
  return learning_rate


lr_callback = keras.callbacks.LearningRateScheduler(lr_schedule)
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
"""

In [None]:
"""
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=1, verbose=1, factor=0.5),
             EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]
"""


#earlyStopping = EarlyStopping(monitor='val_loss', patience=2, verbose=1, mode='auto')
#mcp_save = ModelCheckpoint(filepath='best_model.h5', save_best_only=True, monitor='val_loss', mode='auto')
#reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1, epsilon=1e-4, mode='auto')


earlyStopping =tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=1, mode='auto')
mcp_save  = tf.keras.callbacks.ModelCheckpoint(filepath='best_model.h5',save_best_only=True,monitor='val_loss', mode='auto')
#reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau( monitor='val_loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=0.0001,epsilon=1e-4)



def lrfn(epoch):
    """Return the learning rate for ``epoch``.

    The schedule ramps linearly from a start value to a peak (scaled by the
    number of replicas of the notebook-level ``strategy``), optionally holds
    the peak, then decays exponentially towards a floor.
    """
    start_lr = 0.000005
    peak_lr = 0.000020 * strategy.num_replicas_in_sync
    floor_lr = 0.000001
    warmup_epochs = 5
    plateau_epochs = 0
    decay_rate = .8

    # Linear warm-up.
    if epoch < warmup_epochs:
        return (peak_lr - start_lr) / warmup_epochs * epoch + start_lr
    # Plateau at the peak (empty while plateau_epochs is 0).
    if epoch < warmup_epochs + plateau_epochs:
        return peak_lr
    # Exponential decay towards the floor.
    return (peak_lr - floor_lr) * decay_rate**(epoch - warmup_epochs - plateau_epochs) + floor_lr

In [None]:
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=1)

In [None]:
def vgg16_model( num_classes=None):
    """Build a VGG16 backbone (local no-top weights) with a small softmax head.

    ``num_classes`` is the width of the final softmax layer; the input is
    fixed to 224x224 RGB images.
    """
    backbone = VGG16(weights='../input/keras-pretrained-models/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5', include_top=False, input_shape=(224, 224, 3))
    # Classification head, applied in order on the convolutional features.
    head_layers = [
        Dropout(0.3),
        Flatten(),
        Dense(32, activation = 'relu'),
        Dropout(0.2),
        Dense(num_classes,activation='softmax'),
    ]
    tensor = backbone.output
    for layer in head_layers:
        tensor = layer(tensor)
    return Model(backbone.input, tensor)
vgg16_conv=vgg16_model(6)

In [None]:
vgg16_conv.summary()

In [None]:
"""
def kappa_score(y_true, y_pred):
    
    y_true=tf.math.argmax(y_true)
    y_pred=tf.math.argmax(y_pred)
    return tf.compat.v1.py_func(cohen_kappa_score ,(y_true, y_pred),tf.double)
"""

lr= 0.0005, momentum=0.9,decay=1e-4 ,inspiré de : [Links](https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/)

In [None]:
#opt =SGD(lr= 0.0005, momentum=0.9,decay=1e-4)
# The head is a 6-way softmax over mutually exclusive ISUP grades, so the loss
# is categorical (not binary) cross-entropy.
# METRICS replaces ['accuracy'] so that the training history contains the
# 'auc', 'tp', 'fp', ... keys that show_history and the comparison cells read.
vgg16_conv.compile(optimizer='adam',
    loss = 'categorical_crossentropy',
    metrics=METRICS)

In [None]:
nb_epochs =10
batch_size=32
nb_train_steps = train.shape[0]//batch_size
nb_val_steps=validation.shape[0]//batch_size
print("Number of training and validation steps: {} and {}".format(nb_train_steps,nb_val_steps))

In [None]:
"""
def data_augment(image, label):
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    image = tf.image.random_hue(image, 0.01)
    image = tf.image.random_saturation(image, 0.7, 1.3)
    image = tf.image.random_contrast(image, 0.8, 1.2)
    image = tf.image.random_brightness(image, 0.1)
    return image, label   

def get_training_dataset():
    dataset = load_dataset(TRAINING_FILENAMES, labeled=True)
    dataset = dataset.map(data_augment, num_parallel_calls=AUTO)
    dataset = dataset.repeat() 
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO) =
    return dataset

def get_validation_dataset(ordered=False):
    dataset = load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.cache()
    dataset = dataset.prefetch(AUTO)
    return dataset
"""

In [None]:
vgg16_history=vgg16_conv.fit_generator(train_generator,
                                       steps_per_epoch=nb_train_steps,
                                       epochs=nb_epochs,
                                       validation_data=validation_generator,
                                       validation_steps=nb_val_steps,
                                       callbacks=[earlyStopping,mcp_save,lr_schedule])

In [None]:
vgg16_conv.save('prostate_cancer_vgg16_model.h5')
vgg16_weights =vgg16_conv.save_weights('vgg16_weights.h5')
Accuracies_list.append(['vgg16', vgg16_history])

In [None]:
def show_history(history):
    """Plot train/validation curves for loss, AUC and accuracy side by side.

    Parameters
    ----------
    history : object with ``epoch`` (list of epoch indices) and ``history``
        (dict mapping metric name to a per-epoch list), such as the value
        returned by the ``fit_generator`` calls in this notebook.

    A metric that is absent from ``history.history`` is skipped, leaving its
    panel empty, instead of raising KeyError. This matters for a model
    compiled without the AUC metric.
    """
    # (panel title, key in history.history, word used in the legend)
    panels = [
        ('loss', 'loss', 'loss'),
        ('AUC', 'auc', 'AUC'),
        ('Accuracy', 'accuracy', 'accuracy'),
    ]
    fig, ax = plt.subplots(1, 3, figsize=(20,5))
    for axis, (title, key, word) in zip(ax, panels):
        axis.set_title(title)
        drawn = False
        for prefix, split in (('', 'Train'), ('val_', 'Validation')):
            values = history.history.get(prefix + key)
            if values is None:
                continue
            axis.plot(history.epoch, values, label="{} {}".format(split, word))
            drawn = True
        # A legend on an empty panel only produces a warning.
        if drawn:
            axis.legend()

In [None]:
show_history(vgg16_history)

In [None]:
from keras.applications.resnet50 import ResNet50
def ResNet50_model(num_classes = None):
    """Build an ImageNet-initialised ResNet50 backbone with a small softmax head.

    ``num_classes`` is the width of the final softmax layer; the input is
    fixed to 224x224 RGB images.
    """
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    # Head: flatten -> 16-unit ReLU -> dropout -> softmax.
    head_layers = (
        Flatten(),
        Dense(16, activation = 'relu'),
        Dropout(0.2),
        Dense(num_classes,activation='softmax'),
    )
    tensor = backbone.output
    for layer in head_layers:
        tensor = layer(tensor)
    return Model(backbone.input, tensor)
ResNet50_conv = ResNet50_model(6)

In [None]:
ResNet50_conv.summary()

In [None]:
# `opt` is never assigned in this notebook (its definition is commented out),
# so the previous call raised NameError; use the same 'adam' optimizer as the
# VGG16 model. A 6-way softmax needs categorical, not binary, cross-entropy.
ResNet50_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=METRICS)

In [None]:
# Dedicated checkpoint file for this model. The shared `mcp_save` callback
# writes to 'best_model.h5', which is later loaded into the VGG16 model, so it
# must not be overwritten with ResNet50 weights.
resnet50_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model_ResNet50.h5', save_best_only=True, monitor='val_loss', mode='auto')

RN_50_history=ResNet50_conv.fit_generator(train_generator,
                                          steps_per_epoch=nb_train_steps,
                                          epochs=nb_epochs,
                                          validation_data=validation_generator,
                                          validation_steps=nb_val_steps,
                                          callbacks=[earlyStopping, resnet50_checkpoint, lr_schedule])

In [None]:
ResNet50_conv.save('prostate_cancer_ResNet50_conv.h5')
Accuracies_list.append(['ResNet50', RN_50_history])

In [None]:
# show_history is already defined earlier in the notebook (after the VGG16
# training cell); the identical copy that stood here was removed so that a
# single definition is kept.

In [None]:
show_history(RN_50_history)

In [None]:
from keras.applications.vgg19 import VGG19
def vgg19_model(num_classes = None):
    """Build an ImageNet-initialised VGG19 backbone with a small softmax head.

    ``num_classes`` is the width of the final softmax layer; the input is
    fixed to 224x224 RGB images.
    """
    backbone = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    # Head: dropout -> flatten -> 32-unit ReLU -> dropout -> softmax.
    regularised = Dropout(0.3)(backbone.output)
    flat = Flatten()(regularised)
    hidden = Dense(32, activation = 'relu')(flat)
    hidden = Dropout(0.2)(hidden)
    probabilities = Dense(num_classes,activation='softmax')(hidden)
    return Model(backbone.input, probabilities)
vgg19_conv = vgg19_model(6)

In [None]:
vgg19_conv.summary()

In [None]:
# `opt` is never assigned in this notebook (its definition is commented out),
# so the previous call raised NameError; use the same 'adam' optimizer as the
# VGG16 model. A 6-way softmax needs categorical, not binary, cross-entropy.
vgg19_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=METRICS)

In [None]:
# Dedicated checkpoint file for this model. The shared `mcp_save` callback
# writes to 'best_model.h5', which is later loaded into the VGG16 model, so it
# must not be overwritten with VGG19 weights.
vgg19_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model_vgg19.h5', save_best_only=True, monitor='val_loss', mode='auto')

vgg19_history=vgg19_conv.fit_generator(train_generator,
                                       steps_per_epoch=nb_train_steps,
                                       epochs=nb_epochs,
                                       validation_data=validation_generator,
                                       validation_steps=nb_val_steps,
                                       callbacks=[earlyStopping, vgg19_checkpoint, lr_schedule])

In [None]:
vgg19_conv.save('prostate_cancer_vgg19_conv.h5')
Accuracies_list.append(['vgg19', vgg19_history])

In [None]:
# show_history is already defined earlier in the notebook (after the VGG16
# training cell); the identical copy that stood here was removed so that a
# single definition is kept.

In [None]:
show_history(vgg19_history)

In [None]:
from keras.applications.inception_v3 import InceptionV3
def InceptionV3_model(num_classes = None):
    """Build an InceptionV3 backbone (local no-top weights) with a small softmax head.

    ``num_classes`` is the width of the final softmax layer; the input is
    fixed to 224x224 RGB images.
    """
    weights_file = '../input/keras-pretrained-models/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'
    backbone = InceptionV3(weights= weights_file, include_top=False, input_shape=(224, 224, 3))
    # Head: dropout -> flatten -> 32-unit ReLU -> dropout -> softmax.
    tensor = backbone.output
    tensor = Dropout(0.3)(tensor)
    tensor = Flatten()(tensor)
    tensor = Dense(32, activation = 'relu')(tensor)
    tensor = Dropout(0.2)(tensor)
    tensor = Dense(num_classes,activation='softmax')(tensor)
    return Model(backbone.input, tensor)
InceptionV3_conv = InceptionV3_model(6)

In [None]:
InceptionV3_conv.summary()

In [None]:
# `opt` is never assigned in this notebook (its definition is commented out),
# so the previous call raised NameError; use the same 'adam' optimizer as the
# VGG16 model. A 6-way softmax needs categorical, not binary, cross-entropy.
InceptionV3_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=METRICS)

In [None]:
# Dedicated checkpoint file for this model. The shared `mcp_save` callback
# writes to 'best_model.h5', which is later loaded into the VGG16 model, so it
# must not be overwritten with InceptionV3 weights.
inceptionv3_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model_InceptionV3.h5', save_best_only=True, monitor='val_loss', mode='auto')

InceptionV3_history=InceptionV3_conv.fit_generator( train_generator,
                                           steps_per_epoch=nb_train_steps,
                                           epochs=nb_epochs,
                                           validation_data=validation_generator,
                                           validation_steps=nb_val_steps,
                                           callbacks=[earlyStopping, inceptionv3_checkpoint, lr_schedule])

In [None]:
# Save under the model's own name: the previous file name
# 'prostate_cancer_vgg19_conv.h5' overwrote the saved VGG19 model.
InceptionV3_conv.save('prostate_cancer_InceptionV3_conv.h5')
Accuracies_list.append(['InceptionV3',InceptionV3_history])

In [None]:
# show_history is already defined earlier in the notebook (after the VGG16
# training cell); the identical copy that stood here was removed so that a
# single definition is kept.

In [None]:
show_history(InceptionV3_history)

In [None]:
# Unpack the [name, History] pairs directly; coercing them into an object
# ndarray is unnecessary and overwrote Accuracies_list.
model_names = [name for name, _ in Accuracies_list]
histories = [history for _, history in Accuracies_list]

# (metric key, colour palette, panel title), row by row of the 2x2 grid.
panels = [
    ('auc', 'Spectral', 'Model Training AUC scores'),
    ('val_auc', 'gist_yarg', 'Model Validation AUC scores'),
    ('accuracy', 'rocket', 'Model Training Accuracies'),
    ('val_accuracy', 'ocean_r', 'Model Validation Accuracies'),
]

fig, ax = plt.subplots(2, 2, figsize=(20, 20))
for axis, (metric, palette, title) in zip(ax.ravel(), panels):
    # Last-epoch value per model. A model trained without this metric gets
    # NaN (an empty bar) instead of crashing with
    # "'NoneType' object is not subscriptable".
    final_values = [h.history.get(metric, [np.nan])[-1] for h in histories]
    sns.barplot(x=model_names, y=final_values, ax=axis, palette=palette)
    axis.set_title(title)
fig.suptitle('Model Comparisions')
plt.show()

In [None]:
# (column label, key in History.history, colour palette, panel title)
confusion_panels = [
    ('True Positives', 'tp', 'BrBG', 'True Positives of Models'),
    ('False Positives', 'fp', 'icefire_r', 'False Positives of Models'),
    ('True Negatives', 'tn', 'PuBu_r', 'True Negatives of Models'),
    ('False Negatives', 'fn', 'YlOrBr', 'False Negatives of Models'),
]

# Last-epoch count per model. A model trained without these metrics gets NaN
# instead of raising "'NoneType' object is not subscriptable".
metric_columns = {'Model Names': model_names}
for column, key, _, _ in confusion_panels:
    metric_columns[column] = [h.history.get(key, [np.nan])[-1] for h in histories]
metric_dataframe = pd.DataFrame(metric_columns)

fig, ax = plt.subplots(2, 2, figsize=(20, 20))
for axis, (column, _, palette, title) in zip(ax.ravel(), confusion_panels):
    sns.barplot(x='Model Names', y=column, data=metric_dataframe, ax=axis, palette=palette)
    axis.set_title(title)
fig.suptitle('Confusion Matrix comparision of Models', size=16)
plt.show()

In [None]:
vgg16_conv.load_weights("best_model.h5")


In [None]:
import skimage.io
def predict_isup_grade(df, path, model=None):
    """Predict an ISUP grade for every slide listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``image_id`` column. The frame is not modified. The
        previous version added an ``image_path`` column and overwrote any
        existing ``isup_grade`` column (i.e. the true labels) in place.
    path : str
        Directory prefix, including the trailing separator, that holds the
        ``<image_id>.tiff`` files.
    model : optional
        Object with a ``predict`` method; defaults to the notebook-level
        ``vgg16_conv`` model.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_id`` and ``isup_grade`` (arg-max of the prediction),
        indexed like ``df``.
    """
    if model is None:
        model = vgg16_conv

    predictions = []
    for image_id in df["image_id"]:
        slide_file = path + image_id + ".tiff"
        print(slide_file)
        img = skimage.io.imread(str(slide_file))
        # One resize is enough (the call used to be duplicated).
        img = cv2.resize(img, (224,224))
        # Same 1/255 scaling as the training generators.
        img = img.astype(np.float32)/255.
        img = np.reshape(img,(1,224,224,3))
        predictions.append(np.argmax(model.predict(img)))

    # Build the result directly; this also removes the
    # `df.drop('image_path', 1)` call, whose positional axis argument is
    # rejected by pandas 2.
    return pd.DataFrame(
        {"image_id": df["image_id"], "isup_grade": predictions},
        index=df.index)


In [None]:
training_df_val = pd.read_csv("../input/prostate-cancer-grade-assessment/train.csv")[:20]
predict_isup_grade(training_df_val, image_path)

In [None]:
"""
test_path = "../input/prostate-cancer-grade-assessment/test_images/"
test_df = pd.read_csv("../input/prostate-cancer-grade-assessment/test.csv")[:20]
predict_isup_grade(test_df, test_path, passes=5)
predict_isup_grade.head()
"""

## prochain travail
1. Créer un nouveau jeu de données à partir des images supprimées pour servir de test, car seul le fichier test.csv est fourni : les images de test n'existent pas.
1. Essayer de nouvelles techniques ; il est possible que j'utilise PyTorch.