In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image

# Analyze the data structure

In [None]:
# Root folder under which the dataset folders are looked up.
PATH = '/kaggle/input'
# os.listdir returns entries in arbitrary order, so sort before taking the
# first one; otherwise the selected dataset folder can differ between runs
# when more than one entry is present.
data_folder = sorted(os.listdir(PATH))[0]
main_path = os.path.join(PATH, data_folder)
print(f'the data is in: {main_path}')

In [None]:
# List the top-level entries of the dataset folder selected above.
dataset_entries = os.listdir(main_path)
print(f'The data structure is: {dataset_entries}')

# Business understanding

Histopathologic Cancer Detection

The images are histopathology slides seen under a light microscope: each one is a sample of tissue extracted from an organ (biopsy) and stained with hematoxylin and eosin (H&E). This staining shows the cytoplasm, the nucleus and the plasma membrane, i.e. it only shows the morphology of the cell. For the tissue to be considered normal, the image should show:
1. Polarity: the cells must follow an order in their orientation.
2. Homogeneous cell differentiation: the cells must follow a specialized order, for example ranging from large to small cells.
3. No dysmorphism: cells of the same type have the same size.
4. No neoplasia: the cells do not exceed the basement membrane (a discontinuity in the basement membrane indicates cancer).

# Data Understanding

In [None]:
# See the shape of all images:


def get_size(folder, max_images=100):
    """Print width/height statistics for a sample of images in each sub-folder.

    Every sub-directory of ``folder`` is scanned; plain files (such as
    'train_labels.csv' and 'sample_submission.csv') are skipped. For each
    sub-directory the first ``max_images`` entries returned by ``os.listdir``
    are opened with PIL and their sizes are summarised.

    Args:
        folder: path of the directory that contains the image sub-folders.
        max_images: maximum number of images inspected per sub-folder.

    Returns:
        None. The statistics are printed.
    """
    for folder_path in os.listdir(folder):
        subfolder = os.path.join(folder, folder_path)
        # Only directories hold images; this also skips the label CSV files.
        if not os.path.isdir(subfolder):
            continue

        # Fresh list per sub-folder so one folder's sizes never leak into
        # the statistics of the next one.
        sizes = []
        for filename in os.listdir(subfolder)[:max_images]:
            # The context manager closes the file handle right after reading
            # the size; PIL stores the size as (width, height).
            with Image.open(os.path.join(subfolder, filename)) as img:
                sizes.append(img.size)

        print(f'{folder_path}')
        if not sizes:
            # Nothing to summarise; max()/min() would fail on empty input.
            print('no images found')
            continue

        # Split the (width, height) pairs into two separate sequences.
        widths = [size[0] for size in sizes]
        heights = [size[1] for size in sizes]
        print(f'the max width is: {max(widths)}, and the min width is: {min(widths)}')
        print(f'the max height is: {max(heights)}, and the min height is: {min(heights)}')
        print(f'the mean width is: {np.mean(widths)}, and the mean height is: {np.mean(heights)}')

# Print the image-size statistics for the sub-folders of the dataset folder
get_size(main_path)

In [None]:
# Locations of the image folders and of the CSV files listing the image ids.

# Training data: label table and image folder
train_path_labels = os.path.join(main_path, 'train_labels.csv')
images_folder_train = os.path.join(main_path, 'train/')

# Test data: sample-submission table and image folder
test_path_labels = os.path.join(main_path, 'sample_submission.csv')
images_folder_test = os.path.join(main_path, 'test/')

## See Images

In [None]:
# Peek at the training folder: number of files and the first few names
train_files = os.listdir(images_folder_train)
print(f'the number of images are {len(train_files)}')
train_files[:10]

In [None]:
import matplotlib.pyplot as plt
def plot_images(axis=(2,2), train = True):
    """Plot a grid of randomly chosen images with their label as title.

    Args:
        axis: (rows, columns) of the grid.
        train: if True, images come from ``images_folder_train`` and ids and
            labels from ``train_path_labels``; otherwise images come from
            ``images_folder_test``, ids from ``test_path_labels``, and every
            title is 'No label'.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if train:
        img_path = images_folder_train
        csv_path = train_path_labels
    else:
        img_path = images_folder_test
        csv_path = test_path_labels

    # Read the CSV once and take both columns from the same frame.
    table = pd.read_csv(csv_path)
    df = table['id']
    labels = table['label']

    n_rows, n_cols = axis[0], axis[1]
    # squeeze=False keeps axarr two-dimensional even for a single row or
    # column, so axarr[i, j] is valid for every grid shape.
    f, axarr = plt.subplots(n_rows, n_cols, figsize=(30/n_cols, 10), squeeze=False)

    for i in range(n_rows):
        for j in range(n_cols):
            # Choose a random image
            index_img = np.random.randint(len(df))
            image_name = df.iloc[index_img]
            image_path = os.path.join(img_path, image_name)

            image_label = labels.iloc[index_img]
            if not train:
                image_label = 'No label'
            elif image_label == 0:
                image_label = 'No cancer'
            elif image_label == 1:
                image_label = 'Cancer'

            # The ids in the CSV carry no extension; the files are .tif.
            # The context manager closes the file once the pixels are copied.
            with Image.open(image_path+'.tif') as handle:
                img = np.asarray(handle.convert('RGB'))

            axarr[i,j].imshow(img)
            axarr[i,j].title.set_text(image_label)
    plt.show()

In [None]:
# 3x3 grid of random training images, titled with their label
plot_images(axis=(3,3), train = True)

In [None]:
# 3x3 grid of random test images (titles read 'No label')
plot_images(axis=(3,3), train=False)

## See Labels

In [None]:
# Train images and labels: table read from train_labels.csv
# (the 'id' and 'label' columns are used throughout the notebook)
train_map = pd.read_csv(train_path_labels)
train_map.head()

In [None]:
# Test images: table read from sample_submission.csv
# (the 'id' and 'label' columns are used throughout the notebook)
test_map = pd.read_csv(test_path_labels)
test_map.head()

In [None]:
# Class balance: number of training ids per label, drawn as a bar chart
ids_per_label = train_map.groupby(['label'])['id'].count()
ids_per_label.plot(kind='bar', stacked=True)

# Data Preparation

In [None]:
# Cast the label column of both tables to strings
for label_table in (train_map, test_map):
    label_table['label'] = label_table['label'].astype(str)

In [None]:
# Append the '.tif' extension so the ids match the image file names.
# The endswith guard makes the cell idempotent: re-running it no longer
# turns 'name.tif' into 'name.tif.tif'.
for id_table in (train_map, test_map):
    id_table['id'] = id_table['id'].apply(
        lambda name: name if name.endswith('.tif') else name + '.tif')

train_map

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows as the test set
train, test = train_test_split(train_map, test_size=0.2, random_state=1)

# Take the validation set from the remaining training rows only.
# Splitting train_map again here would put the held-out test rows back
# into train/validation and inflate the evaluation metrics.
train, validation = train_test_split(train, test_size=0.1, random_state=1)

# Sanity check: the three subsets must not share any row of train_map
assert train.index.intersection(test.index).empty
assert train.index.intersection(validation.index).empty
assert validation.index.intersection(test.index).empty

In [None]:
# First rows of the training subset
train.head()

In [None]:
# First rows of the held-out test subset
test.head()

In [None]:
# First rows of the validation subset
validation.head()

# Data Generator and Data Augmentation

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# The whole dataset cannot be loaded into memory at once, so images are
# streamed through ImageDataGenerator instances.

# Random augmentation applied to training images only
train_augmentation = dict(
    rotation_range=30,        # rotation range of 30
    width_shift_range=.20,    # horizontal shift range
    height_shift_range=.20,   # vertical shift range
    horizontal_flip=True,     # horizontal flips
    zoom_range=0.4,           # zoom range
)

# Every generator rescales pixel values by 1/255
image_gen_train = ImageDataGenerator(rescale=1./255, **train_augmentation)
image_gen_test = ImageDataGenerator(rescale=1./255)
image_gen_valid = ImageDataGenerator(rescale=1./255)

In [None]:
width = 96  # images are square: width == height
batch_size = 32  # e.g. 16, 32, 64...

# Options shared by the three generators. All of them read from the training
# image folder because the test and validation frames were split off the
# labelled training table.
flow_options = dict(
    directory=images_folder_train,
    x_col='id',
    y_col='label',
    batch_size=batch_size,
    seed=1,
    class_mode="binary",
    target_size=(width, width),
)

# Custom data generators (only the dataframe and the shuffling differ)
train_datagen = image_gen_train.flow_from_dataframe(
    dataframe=train, shuffle=True, **flow_options)

# shuffle=False for the test generator
test_datagen = image_gen_test.flow_from_dataframe(
    dataframe=test, shuffle=False, **flow_options)

valid_datagen = image_gen_valid.flow_from_dataframe(
    dataframe=validation, shuffle=True, **flow_options)

In [None]:
import matplotlib.pyplot as plt
def plot_images_datagen(axis=(2,2), images=None):
    """Show the first rows*columns images of a batch in a grid.

    Args:
        axis: (rows, columns) of the grid.
        images: indexable collection of images accepted by ``imshow``; it
            must hold at least rows*columns items.

    Raises:
        ValueError: if ``images`` is None or has too few items to fill the
            grid. This is checked before any figure is created.
    """
    n_rows, n_cols = axis[0], axis[1]
    needed = n_rows * n_cols
    available = 0 if images is None else len(images)
    if available < needed:
        raise ValueError(
            f'need {needed} images to fill a {n_rows}x{n_cols} grid, got {available}')

    # squeeze=False keeps axarr two-dimensional even for one row or column.
    f, axarr = plt.subplots(n_rows, n_cols, figsize=(30/n_cols, 10), squeeze=False)
    # axarr.flat walks the grid row by row, matching the image order.
    for index, ax in enumerate(axarr.flat):
        ax.imshow(images[index])
    plt.show()

In [None]:
# Preview the training augmentation: run validation images through the
# augmenting generator and plot the first nine images of one batch.
example = image_gen_train.flow_from_dataframe(
    dataframe=validation,
    directory=images_folder_train,
    x_col='id',
    y_col='label',
    batch_size=batch_size,  # e.g. 16, 32, 64...
    seed=1,
    shuffle=True,
    class_mode="binary",
    target_size=(width, width),
)

# Each batch is an (images, labels) pair; only the images are needed here
images, _ = next(example)
example_images = images[:9]
plot_images_datagen(axis=(3,3), images=example_images)

# Modeling

# Create Model
* <a href="https://arxiv.org/abs/1512.03385">Resnet 50 </a>

In [None]:
# Check whether TensorFlow can see a GPU
import tensorflow as tf

gpu_devices = tf.config.list_physical_devices('GPU')
gpu = len(gpu_devices) > 0
print("GPU is", "available" if gpu else "NOT AVAILABLE")

In [None]:
from tensorflow.keras import applications

# See model: build ResNet50 without loading weights (weights=None)
# and print its summary
applications.resnet50.ResNet50(weights= None).summary()

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

import tensorflow as tf
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

outs = 1


class MyModel(tf.keras.Model):
    """ResNet50 backbone followed by global average pooling and a sigmoid head.

    Args:
        n_outputs: number of units of the final sigmoid Dense layer.
        pretrained: if True the backbone is created with ImageNet weights,
            otherwise with no preloaded weights (``weights=None``).
        freeze: if True the backbone is marked non-trainable, so only the
            layers added on top of it are trained.
        size: side length of the square input images.
        depth: number of channels of the input images.
    """

    def __init__(self, n_outputs=outs, pretrained=False, freeze=False, size = width, depth = 3):

        super(MyModel, self).__init__()

        self.model_weights = 'imagenet' if pretrained else None

        # ResNet50 without its classification top. The input shape comes
        # from the constructor arguments rather than from notebook globals.
        self.resnet = applications.resnet50.ResNet50(
            include_top=False,
            weights=self.model_weights,
            input_shape=(size, size, depth))

        # Input and output tensors of the backbone
        self.res_out = self.resnet.output
        self.res_in = self.resnet.input

        self.GlobPoll = GlobalAveragePooling2D()

        # Sigmoid output. Softmax must not be used here: over a single unit
        # it always evaluates to 1.0 regardless of the input.
        self.out = Dense(n_outputs, activation='sigmoid')

        if freeze:
            # Train only the layers added on top of the backbone
            self.resnet.trainable = False

    def call(self, inputs):
        """Run the backbone, pool its feature maps and apply the sigmoid head."""
        features = self.resnet(inputs)
        pooled = self.GlobPoll(features)
        return self.out(pooled)


In [None]:
# Fix the TensorFlow and NumPy seeds before the model is created
tf.random.set_seed(1234)
np.random.seed(1)

# Instantiate the subclassed model with its default arguments
model = MyModel()

# Optional steps, kept disabled:
#   build the model and print its summary
#model.build(input_shape=(None,width, width, 3))
#model.summary()
#   restore previously saved weights
#model.load_weights('/kaggle/working/Models/ModelResnet50/Resnet50_tf_batch32_NoPretrained')

# Train Model

### Custom loss for unbalanced classes

In [None]:
# Class weights: total / (2 * class count), so the rarer class gets the
# larger weight. Labels are strings at this point ('0' / '1').
n_samples = train.shape[0]
n_positive = np.count_nonzero(train['label'] == '1')
n_negative = np.count_nonzero(train['label'] == '0')

positive_weights = {'label': n_samples / (2 * n_positive)}
negative_weights = {'label': n_samples / (2 * n_negative)}

print(positive_weights)
print('----------------------')
print(negative_weights)


In [None]:
# custon Binary Crossentropy
import tensorflow.keras.backend as K

def loss_fn(y_true,y_pred):
    """Class-weighted binary cross-entropy computed over the whole batch.

    Each sample contributes
    ``-(w_pos * y * log(p) + w_neg * (1 - y) * log(1 - p))`` where the
    weights are ``positive_weights['label']`` and ``negative_weights['label']``.

    Args:
        y_true: ground-truth labels (0 or 1), one per prediction.
        y_pred: predicted probabilities.

    Returns:
        The loss averaged over the last axis of ``y_pred``.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Give the labels the dtype and shape of the predictions so the product
    # below is element-wise and cannot broadcast by accident.
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), tf.shape(y_pred))

    # Keep probabilities away from 0 and 1 so log() never yields -inf/NaN.
    eps = K.epsilon()
    y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)

    # Use every sample of the batch; indexing [0] would keep only the first.
    positive_term = positive_weights['label'] * y_true * tf.math.log(y_pred)
    negative_term = negative_weights['label'] * (1. - y_true) * tf.math.log(1. - y_pred)
    return -tf.reduce_mean(positive_term + negative_term, axis=-1)

In [None]:
# Compile with the custom weighted loss.
# 'categorical_accuracy' is left out on purpose: it compares argmax over the
# output axis, and with a single output unit argmax is always 0, so the
# metric would always report 1.0.
model.compile(optimizer = tf.keras.optimizers.Adam(3e-5), loss = loss_fn, metrics = ['accuracy'])

# Alternative without class weighting:
#model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# EarlyStopping: stop when val_loss has not improved by at least 1e-3 for
# 5 epochs and restore the best weights seen.
# NOTE(review): with epochs=1 below this callback can never trigger.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, 
        verbose=1, mode='auto', restore_best_weights=True)

# Training and validation run over whole batches only
STEP_SIZE_TRAIN = train_datagen.n//train_datagen.batch_size
STEP_SIZE_VALID = valid_datagen.n//valid_datagen.batch_size
# Round up for the test set so the last, partial batch is included and
# every test sample gets a prediction.
STEP_SIZE_TEST = int(np.ceil(test_datagen.n / test_datagen.batch_size))



# https://www.tensorflow.org/versions/r2.1/api_docs/python/tf/keras/Model#fit
model.fit(x = train_datagen,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_datagen,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=1,
                    callbacks=[monitor]
)

# Evaluation

## Test Model

In [None]:
# Predict on the held-out test generator.
# Model.predict accepts generators directly; predict_generator is deprecated.
#test_datagen.reset()
pred = model.predict(test_datagen,
                     steps=STEP_SIZE_TEST,
                     verbose=1)

In [None]:
# Raw model outputs (sigmoid activations) for the test generator
print('the predictions are: ')
pred

In [None]:
print('the predictions are: ')
# Transform predictions to 0 or 1: np.rint rounds each output to the
# nearest integer
round_pred = np.rint(pred)
round_pred

In [None]:
# Ground-truth labels stored on the test generator (created with
# shuffle=False)
y_true = test_datagen.labels
print(f'the actual values are: {y_true[:5]}...')


In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

def get_metrics(y_true=y_true, round_pred=round_pred):
    """Print accuracy, precision, recall and F1 of the rounded predictions.

    Args:
        y_true: ground-truth labels; truncated to the number of predictions.
        round_pred: predictions already rounded to 0 or 1.
    """
    # Keep only as many labels as there are predictions
    n_predictions = round_pred.shape[0]
    y_true = y_true[:n_predictions]

    reports = (
        ('Accuracy: %f', accuracy_score),     # (tp + tn) / (p + n)
        ('Precision: %f', precision_score),   # tp / (tp + fp)
        ('Recall: %f', recall_score),         # tp / (tp + fn)
        ('F1 score: %f', f1_score),           # 2 tp / (2 tp + fp + fn)
    )
    for template, scorer in reports:
        print(template % scorer(y_true, round_pred))

In [None]:
# Report the classification metrics on the held-out test subset
get_metrics(y_true, round_pred)

In [None]:
# Persist the weights of the subclassed model
weights_path = '/kaggle/working/Models/ModelResnet50/Resnet50_tf_batch32_NoPretrained'
model.save_weights(weights_path)