# Patch Classifier
This notebook was created to run on Kaggle using a TPU.

The model uses an EfficientNet-B0 pre-trained on ImageNet as its starting point.

Datasets used:
- CBIS-DDSM patches 224x224 in TFRecords format: https://www.kaggle.com/dsv/4400757 (DOI: 10.34740/kaggle/dsv/4400757)
- CBIS-DDSM patches 448x448 in TFRecords format: https://www.kaggle.com/dsv/4465927 (DOI: 10.34740/kaggle/dsv/4465927)
	
To use a different resolution, change the `PATCH_SIZE` and `IMAGE_SIZE` variables so that they match the patch resolution of the dataset being used.

In [None]:
# IMPORTS
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
import zipfile
import numpy as np
import tensorflow as tf
import math
import cv2

from tensorflow.keras.applications import EfficientNetB0
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from glob import glob

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import gc

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
print("Rodei imports")

# Obtain TPU

In [None]:
# Choose the distribution strategy: a TPUStrategy when a TPU cluster can be
# resolved, otherwise TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver signals "no TPU available" with ValueError.
    tpu = None

if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Get GCS bucket

In [None]:
# Make the attached Kaggle dataset readable by TensorFlow through its GCS bucket.
# The order matters: credentials are registered with TensorFlow before the bucket
# is listed.
# Step 1: Get the credential from the Cloud SDK
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
print("Obtendo credentials")
# Step 2: Set the credentials
user_secrets.set_tensorflow_credential(user_credential)
print("Definindo credentials")

# Step 3: Use a familiar call to get the GCS path of the dataset
from kaggle_datasets import KaggleDatasets
# Local mount points of the two supported datasets, kept for reference:
#   /kaggle/input/cbisddsmv2patchess10448tfrecords/data_cbis_patch_tfrecords_448
#   /kaggle/input/cbisddsmpatchesv2tfrecordspngv2/data_patches_s10_v2_tfrecords
# NOTE(review): get_gcs_path() is called without a dataset name; presumably this
# relies on exactly one dataset being attached to the notebook - confirm.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
print("GCS_DS_PATH:")
print(GCS_DS_PATH)
# NOTE(review): despite its name, VALID_FILENAMES holds whatever the glob matches
# under the bucket root, not the validation files.
VALID_FILENAMES = tf.io.gfile.glob(GCS_DS_PATH+'/**')
print(VALID_FILENAMES)


In [None]:
# Sanity check: count the entries found under the 448x448 test folder of the bucket.
print("DEBUG: {}".format(GCS_DS_PATH))
# Local mount point of the same folder, for reference:
#   ../input/cbisddsmv2patchess10448tfrecords/data_cbis_patch_tfrecords_448/test
files = tf.io.gfile.glob(
    '/'.join([GCS_DS_PATH, 'data_cbis_patch_tfrecords_448', 'test', '**'])
)
print(len(files))

# Get datasets

In [None]:
# Global batch size: 32 samples per replica of the distribution strategy.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

# Resolution-dependent settings. Keep exactly one pair active; the patch size used
# when decoding images must match the selected resolution as well.
#   448x448 patches: DATASET_DIR = 'data_cbis_patch_tfrecords_448', IMAGE_SIZE = [448, 448]
#   224x224 patches: DATASET_DIR = 'data_patches_s10_v2_tfrecords', IMAGE_SIZE = [224, 224]
DATASET_DIR = 'data_cbis_patch_tfrecords_448'
IMAGE_SIZE = [448, 448]

# TFRecord shards of each split. The number after the dash in each file name is
# the number of records stored in that shard.
DATASET_ROOT = GCS_DS_PATH + '/' + DATASET_DIR
TRAINING_FILENAMES = tf.io.gfile.glob(DATASET_ROOT + '/train/train_??-32??.tfrec')
VALIDATION_FILENAMES = tf.io.gfile.glob(DATASET_ROOT + '/validation/validation_??-35?.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(DATASET_ROOT + '/test/test_??-880.tfrec')

# Lets tf.data pick parallelism and prefetch buffer sizes automatically.
AUTO = tf.data.experimental.AUTOTUNE
print("BATCH_SIZE, AUTO and IMAGE_SIZE defined")

In [None]:
# Helper methods
#Ler datasets em tfrecord
def decode_image(image_data):
    """Decode a PNG-encoded patch into a float32 tensor of shape (H, W, 1).

    The target size (H, W) is read from the notebook-level ``IMAGE_SIZE`` so that
    switching resolution only requires changing the configuration cell, instead
    of also editing a second hard-coded constant here.

    Args:
        image_data: scalar string tensor holding the PNG bytes of one patch.

    Returns:
        A float32 tensor of shape ``[IMAGE_SIZE[0], IMAGE_SIZE[1], 1]``. Patches
        whose decoded size differs from the target are resized (and their
        original shape is printed so mismatches do not go unnoticed).
    """
    height, width = IMAGE_SIZE
    # The PNG is decoded as a single 16-bit grayscale channel.
    image = tf.image.decode_png(image_data, channels=1, dtype=tf.uint16)
    sh = tf.shape(image)
    r = sh[0]
    c = sh[1]
    image = tf.cast(image, tf.float32)
    if(r != height or c != width):
        tf.print(sh)
        image = tf.image.resize(image, (height, width))
    # Pin the static shape so that downstream batching sees fixed dimensions.
    image = tf.reshape(image, [height, width, 1])
    return image

def read_labeled_tfrecord(example):
    """Parse one serialized example into an ``(image, label)`` pair.

    Args:
        example: scalar string tensor with a serialized record that carries an
            ``image`` bytes feature and a ``label`` int64 feature.

    Returns:
        A tuple ``(image, label)`` where ``image`` is the output of
        ``decode_image`` and ``label`` is the record's label cast to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # raw PNG bytes
        "label": tf.io.FixedLenFeature([], tf.int64),   # single scalar value
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['label'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized example into an ``(image, idnum)`` pair.

    The record layout is the same as for the labeled reader; the difference is
    that the ``label`` feature is returned as-is (int64), under the name
    ``idnum``, instead of being cast to int32.

    Args:
        example: scalar string tensor with a serialized record.

    Returns:
        A tuple ``(image, idnum)``: the decoded image and the raw ``label`` value.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # raw PNG bytes
        "label": tf.io.FixedLenFeature([], tf.int64),   # single scalar value
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['label']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed records from a list of TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labeled: when True records are parsed with ``read_labeled_tfrecord``,
            otherwise with ``read_unlabeled_tfrecord``.
        ordered: when False, deterministic ordering is switched off so records
            are consumed as soon as they stream in (faster, order not preserved).

    Returns:
        A ``tf.data.Dataset`` of ``(image, label)`` pairs when ``labeled`` is
        True, or ``(image, id)`` pairs otherwise.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False

    parse_fn = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord

    # Several files are read in parallel and their records interleaved.
    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    return records.with_options(options).map(parse_fn, num_parallel_calls=AUTO)
def data_augment(image, label):
    """Apply random horizontal and vertical flips to an image.

    Args:
        image: image tensor to augment.
        label: label associated with the image; passed through untouched.

    Returns:
        The ``(augmented_image, label)`` pair.
    """
    # Left-right flip first, then up-down, each decided at random.
    flipped = tf.image.random_flip_up_down(tf.image.random_flip_left_right(image))
    return flipped, label

def subtract_mean(image):
    """Shift pixel intensities by the constant ``TRAINING_MEAN``.

    Args:
        image: numeric value or array/tensor of pixel intensities.

    Returns:
        ``image - 21678``, with the same shape as the input.
    """
    # NOTE(review): presumably the mean intensity of the training set - confirm.
    # A variance (69225734.9022153) was recorded next to it in the original code
    # but never applied, so it is only kept here as a reference value.
    TRAINING_MEAN = 21678
    return image - TRAINING_MEAN

def one_to_three_channels(image):
    """Repeat the image three times along axis 2.

    Args:
        image: image tensor; assumed to be (H, W, 1) as produced by
            ``decode_image`` - TODO confirm for other callers.

    Returns:
        The tensor with its axis-2 entries repeated three times.
    """
    return tf.repeat(image, repeats=3, axis=2)

def normalize_for_model(image):
    """Divide pixel intensities by 257.

    257 equals 65535 / 255, so a full 16-bit range would map onto [0.0, 255.0].

    Args:
        image: numeric value or array/tensor of pixel intensities.

    Returns:
        ``image / 257.0``.
    """
    scale = 257.0
    rescaled = image / scale
    return rescaled

def adjust_for_model_input(image, label):
    """Prepare a decoded patch for the network.

    Steps, in order: subtract the mean, rescale, then repeat the channel three
    times.

    Args:
        image: decoded image tensor.
        label: label associated with the image; passed through untouched.

    Returns:
        The ``(prepared_image, label)`` pair.
    """
    prepared = one_to_three_channels(normalize_for_model(subtract_mean(image)))
    return prepared, label

def get_training_dataset(shuffle_buffer=1024):
    """Build the batched, augmented and shuffled training pipeline.

    Args:
        shuffle_buffer: number of decoded samples held in the shuffle buffer.
            Larger values shuffle better but use more host memory, since each
            buffered element is a full decoded image.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of exactly
        ``BATCH_SIZE`` samples (the last partial batch is dropped).
    """
    print("CREATING TRAINING DATASET")
    dataset = load_dataset(TRAINING_FILENAMES, labeled=True)
    # Shuffle inside the pipeline: Keras ignores fit(shuffle=...) for tf.data
    # inputs, so without this step the samples are never shuffled. By default
    # the dataset is reshuffled on every iteration (i.e. every epoch).
    dataset = dataset.shuffle(shuffle_buffer)
    print("Data augmentation")
    dataset = dataset.map(data_augment, num_parallel_calls=AUTO)
    dataset = dataset.map(adjust_for_model_input, num_parallel_calls=AUTO)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    dataset = dataset.prefetch(AUTO) # prepare the next batch while the current one trains
    return dataset

def get_validation_dataset(ordered=False):
    """Build the batched validation pipeline (no augmentation).

    Args:
        ordered: forwarded to ``load_dataset``; True keeps deterministic order.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of exactly
        ``BATCH_SIZE`` samples (the last partial batch is dropped).
    """
    return (
        load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
        .map(adjust_for_model_input, num_parallel_calls=AUTO)
        .batch(BATCH_SIZE, drop_remainder=True)
        .prefetch(AUTO)  # autotuned prefetch buffer
    )

def get_test_dataset(ordered=False):
    """Build the batched test pipeline (labels included, no augmentation).

    Args:
        ordered: forwarded to ``load_dataset``; True keeps deterministic order.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of exactly
        ``BATCH_SIZE`` samples (the last partial batch is dropped).
    """
    return (
        load_dataset(TEST_FILENAMES, labeled=True, ordered=ordered)
        .map(adjust_for_model_input, num_parallel_calls=AUTO)
        .batch(BATCH_SIZE, drop_remainder=True)
        .prefetch(AUTO)  # autotuned prefetch buffer
    )
print("Helper methods to load datasets ready")

In [None]:
# Read the datasets: build the training and validation input pipelines.
ds_train = get_training_dataset()
ds_valid = get_validation_dataset()
print("Datasets de treino e validação lidos")

In [None]:
# Test Datasets data
import re
def count_data_items(filenames):
    """Sum the record counts encoded in TFRecord file names.

    Each file name is expected to end in ``-<count>.<extension>``, e.g.
    ``train_00-3200.tfrec`` holds 3200 records. Only the final path component is
    inspected, so dashes and dots in bucket or directory names cannot be
    mistaken for the count.

    Args:
        filenames: iterable of file paths (local or ``gs://``).

    Returns:
        The total number of records as a NumPy integer (0 for an empty input).

    Raises:
        ValueError: if a file name does not contain a ``-<count>.`` part.
    """
    count_pattern = re.compile(r"-([0-9]+)\.")
    counts = []
    for filename in filenames:
        basename = filename.rsplit('/', 1)[-1]
        match = count_pattern.search(basename)
        if match is None:
            raise ValueError(
                "Cannot read the item count from TFRecord file name: {!r}".format(filename)
            )
        counts.append(int(match.group(1)))
    # An explicit integer dtype keeps the result an integer even for an empty list.
    return np.sum(counts, dtype=np.int64)

# Pull a few training batches to check their shapes, then report the split sizes
# derived from the TFRecord file names.
print("Training data shapes:")
cont = 0
for image, label in ds_train.take(3):
    print(image.numpy().shape, label.numpy().shape)
    cont += 1  # count the batches actually inspected (was never incremented)
print(cont)
print("###################################")
NUM_TRAINING_IMAGES = count_data_items(TRAINING_FILENAMES)
NUM_VALIDATION_IMAGES = count_data_items(VALIDATION_FILENAMES)
NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)
print('Dataset: {} training images, {} validation images, {} unlabeled test images'.format(NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES))

# Train model

In [None]:
# Model definition
def get_model():
    """Create the patch classifier: EfficientNet-B0 backbone plus a 5-way head.

    The backbone is loaded with ImageNet weights, without its original top, and
    is left fully trainable. On top of it come global average pooling, dropout
    (rate 0.2) and a 5-unit softmax layer.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    # Initializer configuration for the dense classification layer.
    head_initializer = {
        'class_name': 'VarianceScaling',
        'config': {
            'scale': 1. / 3.,
            'mode': 'fan_out',
            'distribution': 'uniform',
        },
    }

    backbone = EfficientNetB0(weights='imagenet', include_top=False)
    backbone.trainable = True

    stack = [
        backbone,
        layers.GlobalAveragePooling2D(),
        layers.Dropout(0.2),
        layers.Dense(units=5, activation='softmax', kernel_initializer=head_initializer),
    ]
    return tf.keras.Sequential(stack)
print("Model ready to be used")

In [None]:
# My Learning Rate 1
import math
from matplotlib import pyplot as plt
def cosine_and_warmup_lr(epoch, 
                        start_lr = 0.000002,
                        base = 0.0001,
                        warmup_epochs = 4,
                        period = 3,
                        delta = 0.0002):
    """Learning-rate schedule: linear warm-up followed by a cosine oscillation.

    Args:
        epoch: zero-based epoch index, as passed by ``LearningRateScheduler``.
        start_lr: unused; kept only so existing callers keep working.
        base: learning rate reached at the end of the warm-up.
        warmup_epochs: number of warm-up epochs. During warm-up the rate is
            ``base * epoch / warmup_epochs``, with epoch 0 treated like epoch 1
            so the very first epoch does not train with a zero rate.
        period: number of epochs for half a cosine cycle (peak to trough).
        delta: peak-to-trough amplitude of the cosine phase, which oscillates
            between ``base - delta / 2`` and ``base + delta / 2``. With the
            defaults the trough is exactly 0.

    Returns:
        The learning rate for ``epoch`` as a Python float. Every branch returns
        a plain float because ``LearningRateScheduler`` documents a float return
        value; the cosine branch used to return a ``tf.Tensor``.
    """
    if epoch < warmup_epochs:
        return float(base * max(epoch, 1) / warmup_epochs)
    phase = (epoch - warmup_epochs) * math.pi / period
    return float(0.5 * delta * (1 + math.cos(phase)) + base - delta / 2)
    
# Callback that applies the schedule at the start of every epoch, followed by a
# quick preview of the schedule for epochs 1 to 29.
lr_callback = tf.keras.callbacks.LearningRateScheduler(cosine_and_warmup_lr, verbose=True)
rng = list(range(1, 30))
y = list(map(cosine_and_warmup_lr, rng))
plt.plot(rng, y)
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))

In [None]:

# The model, its optimizer and its metrics are created inside the strategy scope
# so that their variables are placed according to the distribution strategy.
with strategy.scope():
    model = get_model()
    metrics = ['sparse_categorical_accuracy']
    # Initial rate only; the per-epoch value is set by the LR scheduler callback.
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',  # labels are integer class ids
        metrics=metrics,
    )
model.summary()

In [None]:
# Training
# The learning rate of each epoch comes from lr_callback.
# Batching is defined by the tf.data pipeline (ds_train is already batched), so
# `batch_size` is not passed to fit(). Keras also ignores `shuffle` for tf.data
# inputs, so it is not passed either: any shuffling has to happen inside the
# input pipeline itself.
epochs = 30
batch_size = BATCH_SIZE  # kept for reference; the datasets are batched with BATCH_SIZE
history = model.fit(
    ds_train,
    validation_data=ds_valid,
    epochs=epochs,
    verbose=1,
    callbacks=[lr_callback],
)

In [None]:
# List the metric series recorded by fit(); these keys index history.history.
print("Verificando acurácia durante treinamento")
print(history.history.keys())

In [None]:
# Collect the per-epoch series recorded during training.
print("Calculating the accuracy")
acc = history.history['sparse_categorical_accuracy']
val_acc = history.history['val_sparse_categorical_accuracy']
print("Calculating the loss")
loss = history.history['loss']
val_loss = history.history['val_loss']
print("Calculating learning rate")
lr_gr = history.history['lr']
epochs_range = range(epochs)

# One stacked panel per quantity: accuracy, loss and learning rate.
print("The results are being visualized")
plt.figure(figsize=(20, 60))
panels = [
    ('Acurácia de treino e validação', 'lower right',
     [(acc, 'Treino'), (val_acc, 'Validação')]),
    ('Perda de treino e validação', 'upper right',
     [(loss, 'Treino'), (val_loss, 'Validação')]),
    ('Taxa de Aprendizagem', 'upper right',
     [(lr_gr, 'lr')]),
]
for panel_index, (panel_title, legend_loc, curves) in enumerate(panels, start=1):
    plt.subplot(3, 1, panel_index)
    for series, series_label in curves:
        plt.plot(epochs_range, series, label=series_label)
    plt.legend(loc=legend_loc)
    plt.title(panel_title)
plt.show()

# Test model

In [None]:
# Cleaning RAM: drop the training/validation pipelines before running the tests.
import gc
print("Limpando RAM antes de rodar testes")
# Rebinding to None first keeps this cell re-runnable: a bare `del` would raise
# NameError once the names are gone.
ds_train = ds_valid = None
del ds_train, ds_valid
gc.collect()

In [None]:
from sklearn.metrics import confusion_matrix
import gc
from collections import Counter

def data_visualization(matrix):
    """Print per-class totals, hits and error rate from a confusion matrix.

    Rows are taken as the true classes and columns as the predicted ones, so the
    row sum is the number of samples of a class and the diagonal entry is the
    number of correct predictions for it.

    Args:
        matrix: square confusion matrix (array-like) of any size.

    Returns:
        None. For every class four lines are printed: class index, number of
        samples, number of hits, and the error rate as a fraction in [0, 1].
        A class without samples reports ``nan`` instead of dividing by zero.
    """
    matrix = np.asarray(matrix)
    total_per_class = matrix.sum(axis=1)
    for i in range(matrix.shape[0]):
        total = total_per_class[i]
        correct = matrix[i, i]
        # The error rate is undefined for a class that never occurs.
        error = (total - correct) / total if total else float('nan')
        print("Classe: ",i)
        print("Elementos dessa classe: ", total)
        print("Acertos: ", correct)
        print("% erro: ", error)
        
# Evaluate the trained model on the test split, then recompute the accuracy
# batch by batch while accumulating a confusion matrix.
import seaborn as sns
import matplotlib.pyplot as plt

dataset = get_test_dataset()
score = model.evaluate(dataset)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

hits = 0                 # correctly classified samples so far
n_samples = 0            # samples seen so far
NUM_BATCH = 0            # batches actually consumed from the dataset
n_correct_batches = 0    # batches without a single mistake
cf_matrix = np.zeros([5,5])
# Iterate over the dataset itself instead of a hard-coded number of batches, so
# the loop stays correct whatever the size of the test split is.
for images, labels in dataset:
    probabilities = model.predict(images)
    predictions = np.argmax(probabilities, axis=-1)
    labels_np = labels.numpy().astype(np.int64)
    batch_len = len(labels_np)
    batch_hits = int(np.sum(predictions == labels_np))
    if batch_hits == batch_len:
        n_correct_batches = n_correct_batches + 1
    else:
        # Report the accuracy of every batch that contains at least one mistake.
        print(batch_hits/batch_len)
    hits = hits + batch_hits
    n_samples = n_samples + batch_len
    NUM_BATCH = NUM_BATCH + 1
    cf_batch_matrix = confusion_matrix(labels_np, predictions, labels=[0,1,2,3,4])
    cf_matrix = cf_matrix + cf_batch_matrix

print("Correct batches")
print(n_correct_batches)
print("Accuracy")
print(hits/n_samples if n_samples else float('nan'))
print(cf_matrix)
data_visualization(cf_matrix)

# Confusion-matrix heatmap. The font scale is set before the figure is created
# so that it applies to this plot.
sns.set(font_scale=1.6)
plt.figure(figsize=(15, 15))
ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues', fmt='.4g')

ax.set_title('Matriz de Confusão de Classificador de Patch\n\n')
ax.set_xlabel('\nClasses Preditas')
ax.set_ylabel('Classes Reais ')

# Tick labels: one per class, listed in class-index order (0 to 4).
class_names = ['Fundo','Calc. Ben.', 'Calc. Mal.','Massa Ben.','Massa Mal.']
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)
plt.yticks(rotation=0)

# Display the confusion matrix.
plt.show()

In [None]:
# Save model: write the trained model to a single .h5 file in the working directory.
model.save("patch_classifier_tpu_448.h5")

In [None]:
# Reload the saved .h5 file as `test_model`. Loading happens inside the strategy
# scope, like the creation of the original model.
from tensorflow.keras.models import load_model
with strategy.scope():
    test_model = load_model("patch_classifier_tpu_448.h5")

In [None]:
# Evaluate the reloaded model on the test split, then recompute the accuracy
# batch by batch while accumulating a confusion matrix.
import seaborn as sns
import matplotlib.pyplot as plt

# Build a fresh test pipeline instead of relying on a `dataset` variable left
# over from a previously executed cell.
dataset = get_test_dataset()
score = test_model.evaluate(dataset)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

hits = 0                 # correctly classified samples so far
n_samples = 0            # samples seen so far
NUM_BATCH = 0            # batches actually consumed from the dataset
n_correct_batches = 0    # batches without a single mistake
cf_matrix = np.zeros([5,5])
# Iterate over the dataset itself instead of a hard-coded number of batches, so
# the loop stays correct whatever the size of the test split is.
for images, labels in dataset:
    probabilities = test_model.predict(images)
    predictions = np.argmax(probabilities, axis=-1)
    labels_np = labels.numpy().astype(np.int64)
    batch_len = len(labels_np)
    batch_hits = int(np.sum(predictions == labels_np))
    if batch_hits == batch_len:
        n_correct_batches = n_correct_batches + 1
    else:
        # Report the accuracy of every batch that contains at least one mistake.
        print(batch_hits/batch_len)
    hits = hits + batch_hits
    n_samples = n_samples + batch_len
    NUM_BATCH = NUM_BATCH + 1
    cf_batch_matrix = confusion_matrix(labels_np, predictions, labels=[0,1,2,3,4])
    cf_matrix = cf_matrix + cf_batch_matrix

print("Correct batches")
print(n_correct_batches)
print("Accuracy")
print(hits/n_samples if n_samples else float('nan'))
print(cf_matrix)
data_visualization(cf_matrix)

# Confusion-matrix heatmap. The font scale is set before the figure is created
# so that it applies to this plot.
sns.set(font_scale=1.6)
plt.figure(figsize=(15, 15))
ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues', fmt='.4g')

ax.set_title('Matriz de Confusão de Classificador de Patch\n\n')
ax.set_xlabel('\nClasses Preditas')
ax.set_ylabel('Classes Reais ')

# Tick labels: one per class, listed in class-index order (0 to 4).
class_names = ['Fundo','Calc. Ben.', 'Calc. Mal.','Massa Ben.','Massa Mal.']
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)
plt.yticks(rotation=0)

# Display the confusion matrix.
plt.show()