# Preprocessing

In [1]:
import tensorflow as tf
import math, re, os, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from kaggle_datasets import KaggleDatasets
from tensorflow import keras
from functools import partial
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from scipy import stats


from kaggle_datasets import KaggleDatasets
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

print("Tensorflow version " + tf.__version__)

# Use the attached TPU when there is one; otherwise fall back to TensorFlow's
# default distribution strategy so the notebook still runs.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report the reason
    # instead of silently hiding why the TPU could not be used.
    print('TPU unavailable, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

# Sentinel passed as num_parallel_reads / num_parallel_calls / prefetch size
# in the tf.data pipelines below.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# GCS location of the competition dataset, resolved through KaggleDatasets.
GCS_PATH =  KaggleDatasets().get_gcs_path('cassava-leaf-disease-classification')
# Global batch size: 16 examples per replica of the active strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# decode_image reshapes every decoded image to [*IMAGE_SIZE, 3].
IMAGE_SIZE = [512, 512]
CLASSES = ['0', '1', '2', '3', '4']
# NOTE(review): EPOCHS, SKIP_VALIDATION and TTA_NUM are not referenced in the
# visible cells — presumably leftovers from the training notebook; confirm.
EPOCHS = 25


SEED = 752
SKIP_VALIDATION = False
TTA_NUM = 5

# Seed the Python, NumPy and TensorFlow global random generators.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)


Tensorflow version 2.2.0
Device: grpc://10.0.0.2:8470
Number of replicas: 8


In [2]:
def decode_image(image):
    """Decode JPEG bytes into a float32 RGB tensor of shape ``[*IMAGE_SIZE, 3]``.

    The decoded pixel values are divided by 255 before the reshape.
    """
    pixels = tf.image.decode_jpeg(image, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])

def read_tfrecord(example, labeled):
    """Parse one serialized TFRecord example into an ``(image, second)`` pair.

    Args:
        example: serialized example handed over by ``TFRecordDataset``.
        labeled: when true the record is read with an int64 ``target`` feature
            and the pair is ``(image, int32 label)``; otherwise it is read with
            a string ``image_name`` feature and the pair is
            ``(image, image_name)``.
    """
    if labeled:
        extra_key, extra_dtype = "target", tf.int64
    else:
        extra_key, extra_dtype = "image_name", tf.string
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),
        extra_key: tf.io.FixedLenFeature([], extra_dtype),
    }

    parsed = tf.io.parse_single_example(example, feature_spec)
    image = decode_image(parsed['image'])
    if not labeled:
        return image, parsed['image_name']
    return image, tf.cast(parsed['target'], tf.int32)

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed examples from TFRecord files.

    Args:
        filenames: TFRecord paths to read (reads are interleaved across files).
        labeled: forwarded to ``read_tfrecord`` to pick the record schema.
        ordered: when false, deterministic ordering is switched off so
            elements are used as soon as they stream in (faster); when true
            the options are left at their defaults.
    """
    read_options = tf.data.Options()
    if not ordered:
        # Trade ordering for throughput.
        read_options.experimental_deterministic = False

    parse_example = partial(read_tfrecord, labeled=labeled)
    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
        .with_options(read_options)
        .map(parse_example, num_parallel_calls=AUTOTUNE)
    )



In [3]:
# Split the training TFRecord *files* (not individual images) into a training
# and a validation set: 35% of the files are held out. The split uses its own
# fixed random_state (5), independent of SEED.
TRAINING_FILENAMES, VALID_FILENAMES = train_test_split(
    tf.io.gfile.glob(GCS_PATH + '/train_tfrecords/ld_train*.tfrec'),
    test_size=0.35, random_state=5
)

# Unlabeled test shards of the same dataset.
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/test_tfrecords/ld_test*.tfrec')

In [4]:
def random_blockout(img, sl=0.1, sh=0.2, rl=0.4):
    """Zero out one random rectangle of ``img`` with probability 0.75.

    The decision to erase is drawn with ``tf.random.uniform`` and routed
    through ``tf.cond`` so that it is re-evaluated for every element. A
    Python-level ``random.random()`` would run only once, when
    ``tf.data.Dataset.map`` traces this function, freezing the decision for
    the whole dataset.

    Args:
        img: image tensor whose first two dimensions are
            ``IMAGE_SIZE[0] x IMAGE_SIZE[1]`` with 3 channels.
        sl, sh, rl: control the patch size. Each side length is drawn
            uniformly from ``[sqrt(area * sl * rl), sqrt(area * sh / rl))``,
            with the upper bound capped at the image side.

    Returns:
        A tensor with the same dtype as ``img``: either the image with one
        rectangular patch set to zero, or the image unchanged.
    """
    def _erase():
        # decode_image lays images out as [IMAGE_SIZE[0], IMAGE_SIZE[1], 3],
        # i.e. rows first; keep the mask in the same orientation so that
        # non-square sizes also line up.
        h, w, c = IMAGE_SIZE[0], IMAGE_SIZE[1], 3
        origin_area = tf.cast(h * w, tf.float32)

        e_size_l = tf.cast(tf.round(tf.sqrt(origin_area * sl * rl)), tf.int32)
        e_size_h = tf.cast(tf.round(tf.sqrt(origin_area * sh / rl)), tf.int32)

        e_height_h = tf.minimum(e_size_h, h)
        e_width_h = tf.minimum(e_size_h, w)

        erase_height = tf.random.uniform(shape=[], minval=e_size_l, maxval=e_height_h, dtype=tf.int32)
        erase_width = tf.random.uniform(shape=[], minval=e_size_l, maxval=e_width_h, dtype=tf.int32)

        # Random placement: split the remaining rows/columns around the patch.
        pad_h = h - erase_height
        pad_top = tf.random.uniform(shape=[], minval=0, maxval=pad_h, dtype=tf.int32)
        pad_bottom = pad_h - pad_top

        pad_w = w - erase_width
        pad_left = tf.random.uniform(shape=[], minval=0, maxval=pad_w, dtype=tf.int32)
        pad_right = pad_w - pad_left

        # Mask is 0 inside the patch and 1 everywhere else.
        erase_area = tf.zeros(shape=[erase_height, erase_width, c], dtype=tf.float32)
        erase_mask = tf.pad(
            erase_area,
            [[pad_top, pad_bottom], [pad_left, pad_right], [0, 0]],
            constant_values=1.0,
        )
        erased_img = tf.multiply(tf.cast(img, tf.float32), erase_mask)
        return tf.cast(erased_img, img.dtype)

    apply_erase = tf.random.uniform(shape=[], minval=0.0, maxval=1.0) >= 0.25
    return tf.cond(apply_erase, _erase, lambda: tf.identity(img))




def data_augment(image, label):
    """Apply a random horizontal flip followed by ``random_blockout``.

    The label is passed through untouched.
    """
    # The training pipeline ends with dataset.prefetch(AUTOTUNE), so this
    # augmentation runs on the host CPU while the accelerator is busy
    # computing gradients — it is essentially free.
    flipped = tf.image.random_flip_left_right(image)
    return random_blockout(flipped), label
def get_mat(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
    """Return the 3x3 transform matrix applied to pixel indices.

    ``rotation`` and ``shear`` are given in degrees. The result is the product
    ``(rotation @ shear) @ (zoom @ shift)``.

    NOTE(review): every argument is concatenated with shape-[1] constants, so
    each is presumably a shape-[1] float32 tensor — confirm against callers.
    """
    def _as_3x3(entries):
        # Nine shape-[1] tensors in row-major order -> one [3, 3] matrix.
        return tf.reshape(tf.concat(entries, axis=0), [3, 3])

    one = tf.constant([1], dtype='float32')
    zero = tf.constant([0], dtype='float32')

    # Degrees -> radians
    rotation_rad = math.pi * rotation / 180.
    shear_rad = math.pi * shear / 180.

    # Rotation
    cos_r = tf.math.cos(rotation_rad)
    sin_r = tf.math.sin(rotation_rad)
    rotation_matrix = _as_3x3([cos_r, sin_r, zero,
                               -sin_r, cos_r, zero,
                               zero, zero, one])

    # Shear
    cos_s = tf.math.cos(shear_rad)
    sin_s = tf.math.sin(shear_rad)
    shear_matrix = _as_3x3([one, sin_s, zero,
                            zero, cos_s, zero,
                            zero, zero, one])

    # Zoom
    zoom_matrix = _as_3x3([one / height_zoom, zero, zero,
                           zero, one / width_zoom, zero,
                           zero, zero, one])

    # Shift
    shift_matrix = _as_3x3([one, zero, height_shift,
                            zero, one, width_shift,
                            zero, zero, one])

    rotate_then_shear = K.dot(rotation_matrix, shear_matrix)
    zoom_then_shift = K.dot(zoom_matrix, shift_matrix)
    return K.dot(rotate_then_shear, zoom_then_shift)
def transform(image,label):
    """Randomly rotate, shear, zoom and shift a single image.

    Args:
        image: one image of size [dim, dim, 3] (not a batch [b, dim, dim, 3]);
            dim is taken from IMAGE_SIZE[0].
        label: passed through unchanged.

    Returns:
        (transformed image reshaped to [DIM, DIM, 3], label)
    """
    # input image - is one image of size [dim,dim,3] not a batch of [b,dim,dim,3]
    # output - image randomly rotated, sheared, zoomed, and shifted
    DIM = IMAGE_SIZE[0]
    XDIM = DIM%2 # 1 for odd DIM, 0 for even (original note: fix for size 331)
    
    # Random parameters, each a shape-[1] float32 tensor drawn from a normal
    # distribution: rotation/shear scaled by 15/5 (converted from degrees in
    # get_mat), zoom centred on 1.0, shifts scaled by 16.
    rot = 15. * tf.random.normal([1],dtype='float32')
    shr = 5. * tf.random.normal([1],dtype='float32') 
    h_zoom = 1.0 + tf.random.normal([1],dtype='float32')/10.
    w_zoom = 1.0 + tf.random.normal([1],dtype='float32')/10.
    h_shift = 16. * tf.random.normal([1],dtype='float32') 
    w_shift = 16. * tf.random.normal([1],dtype='float32') 
  
    # GET TRANSFORMATION MATRIX
    m = get_mat(rot,shr,h_zoom,w_zoom,h_shift,w_shift) 

    # LIST DESTINATION PIXEL INDICES
    # Coordinates are centred on the image; z is the constant 1 row that turns
    # (x, y) into homogeneous coordinates for the 3x3 matrix.
    x = tf.repeat( tf.range(DIM//2,-DIM//2,-1), DIM )
    y = tf.tile( tf.range(-DIM//2,DIM//2),[DIM] )
    z = tf.ones([DIM*DIM],dtype='int32')
    idx = tf.stack( [x,y,z] )
    
    # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
    # The transformed coordinates are cast to int32 and clipped so that they
    # stay inside the image.
    idx2 = K.dot(m,tf.cast(idx,dtype='float32'))
    idx2 = K.cast(idx2,dtype='int32')
    idx2 = K.clip(idx2,-DIM//2+XDIM+1,DIM//2)
    
    # FIND ORIGIN PIXEL VALUES
    # Convert centred coordinates back to array (row, column) indices and
    # gather the source pixels.
    idx3 = tf.stack( [DIM//2-idx2[0,], DIM//2-1+idx2[1,]] )
    d = tf.gather_nd(image,tf.transpose(idx3))
        
    return tf.reshape(d,[DIM,DIM,3]),label


In [5]:
def get_training_dataset(TRAINING_FILENAMES):
    """Build the training pipeline.

    Labeled (unordered) examples are augmented with ``data_augment`` and
    ``transform``, repeated indefinitely, shuffled with a 2048-element
    buffer, batched to ``BATCH_SIZE`` and prefetched.
    """
    return (
        load_dataset(TRAINING_FILENAMES, labeled=True)
        .map(data_augment, num_parallel_calls=AUTOTUNE)
        .map(transform, num_parallel_calls=AUTOTUNE)
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )
def get_validation_dataset(VALID_FILENAMES,ordered=False):
    """Build the validation pipeline: labeled examples, batched, cached, prefetched.

    ``ordered`` is forwarded to ``load_dataset``.
    """
    return (
        load_dataset(VALID_FILENAMES, labeled=True, ordered=ordered)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTOTUNE)
    )
def get_test_dataset(TEST_FILENAMES,ordered=False):
    """Build the test pipeline: unlabeled ``(image, image_name)`` examples, batched and prefetched.

    ``ordered`` is forwarded to ``load_dataset``.
    """
    return (
        load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )
def count_data_items(filenames):
    """Sum the record counts encoded in TFRecord file names.

    Each name must contain ``-<count>.`` (for example a name ending in
    ``-1338.tfrec`` contributes 1338).

    Args:
        filenames: iterable of file names / paths.

    Returns:
        The total as a NumPy int64; ``0`` for an empty iterable.

    Raises:
        ValueError: if a file name does not contain a ``-<digits>.`` count.
    """
    # Compile once instead of once per file name; require at least one digit
    # so an empty match can never reach int().
    count_pattern = re.compile(r"-([0-9]+)\.")

    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(
                "Cannot read the item count from file name {!r}".format(filename)
            )
        counts.append(int(match.group(1)))

    # An explicit integer dtype keeps the empty-list result an integer
    # (np.sum([]) would otherwise return the float 0.0).
    return np.sum(counts, dtype=np.int64)
# Image counts are read from the shard file names, not by scanning the records.
NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES = (
    count_data_items(shard_names)
    for shard_names in (TRAINING_FILENAMES, VALID_FILENAMES, TEST_FILENAMES)
)

print(
    'Dataset: {} training images, {} validation images, {} (unlabeled) test images'.format(
        NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES
    )
)


Dataset: 13380 training images, 8017 validation images, 1 (unlabeled) test images


# Load Dataset

In [6]:
import tensorflow_datasets as tfds

# ordered=True keeps the validation examples in a deterministic order, so the
# labels extracted here line up with the predictions computed from images_map.
val_dataset = get_validation_dataset(VALID_FILENAMES, ordered=True)

# Two views over the same batched dataset: labels only, and images only.
label_map = val_dataset.map(lambda batch_images, batch_labels: batch_labels)
images_map = val_dataset.map(lambda batch_images, batch_labels: batch_images)

In [7]:
# Flatten the batched labels into one column and save them next to the
# per-model prediction files.
true_labels = pd.DataFrame(
    {'true_labels': np.array(list(label_map.unbatch()))}
)
true_labels.to_csv('true_labels.csv', index=False)

# Load All Models

In [8]:
import sys

# Put the attached input directories on the import path (the next cell imports
# `efficientnet.tfkeras` from there).
for package_path in ('../input/efficientnet/', '../input/kerasapplications'):
    sys.path.append(package_path)

In [9]:
import efficientnet.tfkeras

with strategy.scope():
    # Load the saved EfficientNet model, predict on the validation images and
    # store the arg-max class of every image as a one-column CSV.
    efficientnet_model = tf.keras.models.load_model(
        '../input/cassava-leaf-disease-training/effcient_net.h5'
    )
    efficient_net_predictions = efficientnet_model.predict(images_map)

    efficient_net_predictions_df = pd.DataFrame(
        {'efficient_net_predictions': np.argmax(efficient_net_predictions, axis=-1)}
    )
    efficient_net_predictions_df.to_csv('efficient_net_predictions.csv', index=False)

In [10]:
with strategy.scope():
    # Load the saved DenseNet model, predict on the validation images and
    # store the arg-max class of every image as a one-column CSV.
    densenet_model = tf.keras.models.load_model(
        '../input/cassava-leaf-disease-training/dense_net.h5'
    )
    dense_net_predictions = densenet_model.predict(images_map)

    dense_net_predictions_df = pd.DataFrame(
        {'dense_net_predictions': np.argmax(dense_net_predictions, axis=-1)}
    )
    dense_net_predictions_df.to_csv('dense_net_predictions.csv', index=False)

In [11]:
with strategy.scope():
    # One saved model per cross-validation fold.
    resnet50_models = [
        tf.keras.models.load_model(f'../input/cassava-leaf-disease-resnet50/resnet50/fold-{i}.h5')
        for i in range(5)
    ]

    # Hard vote of every fold: one row of arg-max class ids per model. Only
    # the class ids are kept, not the full probability arrays.
    resnet_50_fold_votes = np.array([
        np.argmax(resnet50_model.predict(images_map), axis=-1)
        for resnet50_model in resnet50_models
    ])

    # Majority vote across folds. Older SciPy returns the mode with a leading
    # length-1 axis while newer releases drop it, so flatten the result
    # instead of indexing `[0][0]` (which yields a scalar on newer SciPy).
    resnet_50_models_predictions = np.ravel(stats.mode(resnet_50_fold_votes, axis=0)[0])

    resnet_50_models_predictions_df = pd.DataFrame(resnet_50_models_predictions,columns=['resnet_50_models_predictions'])
    resnet_50_models_predictions_df.to_csv('resnet_50_models_predictions.csv',index=False)

In [12]:
with strategy.scope():
    # One saved model per cross-validation fold.
    resnet101_models = [
        tf.keras.models.load_model(f'../input/cassava-leaf-disease-resnet101/resnet101/fold-{i}.h5')
        for i in range(5)
    ]

    # Hard vote of every fold: one row of arg-max class ids per model. Only
    # the class ids are kept, not the full probability arrays.
    resnet101_fold_votes = np.array([
        np.argmax(resnet101_model.predict(images_map), axis=-1)
        for resnet101_model in resnet101_models
    ])

    # Majority vote across folds. Older SciPy returns the mode with a leading
    # length-1 axis while newer releases drop it, so flatten the result
    # instead of indexing `[0][0]` (which yields a scalar on newer SciPy).
    resnet101_models_predictions = np.ravel(stats.mode(resnet101_fold_votes, axis=0)[0])

    resnet101_models_predictions_df = pd.DataFrame(resnet101_models_predictions,columns=['resnet101_models_predictions'])
    resnet101_models_predictions_df.to_csv('resnet101_models_predictions.csv',index=False)

In [13]:
with strategy.scope():
    # One saved model per cross-validation fold.
    resnext101_models = [
        tf.keras.models.load_model(f'../input/cassava-leaf-disease-resnext101/resnext101/fold-{i}.h5')
        for i in range(5)
    ]

    # Hard vote of every fold: one row of arg-max class ids per model. Only
    # the class ids are kept, not the full probability arrays.
    resnext101_fold_votes = np.array([
        np.argmax(resnext101_model.predict(images_map), axis=-1)
        for resnext101_model in resnext101_models
    ])

    # Majority vote across folds. Older SciPy returns the mode with a leading
    # length-1 axis while newer releases drop it, so flatten the result
    # instead of indexing `[0][0]` (which yields a scalar on newer SciPy).
    resnext101_models_predictions = np.ravel(stats.mode(resnext101_fold_votes, axis=0)[0])

    resnext101_models_predictions_df = pd.DataFrame(resnext101_models_predictions,columns=['resnext101_models_predictions'])
    resnext101_models_predictions_df.to_csv('resnext101_models_predictions.csv',index=False)

# Prediction

In [14]:
# test_ds = get_test_dataset(TEST_FILENAMES)
# test_images_ds = test_ds.map(lambda image, idnum: image)

In [15]:
# print('Calculating predictions...')
# # ensemble_predictions = 
# predictions = np.argmax(ensemble_predictions, axis=-1)

# Submission

In [16]:
# print('Generating submission file...')
# test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
# test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U') # all in one batch
# np.savetxt('submission.csv', np.rec.fromarrays([test_ids, predictions]), fmt=['%s', '%d'], delimiter=',', header='image_id,label', comments='')


In [17]:
# !head submission.csv