## Motivation

If you look at the results of a big machine learning competition, you will most likely find that the top results are achieved by an ensemble of models rather than a single model. For instance, the top-scoring single-model architecture at ILSVRC2015 is in 13th place. Places 1–12 are taken by various ensembles.

I haven’t seen a tutorial or documentation on how to use multiple neural networks in an ensemble, so I decided to make a practical guide on this topic.

![](https://miro.medium.com/max/1400/1*fy-6esoTWsTutld4fdSyCQ.png)

In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import os
import seaborn as sns
import cv2
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten,  Dropout, BatchNormalization, LeakyReLU,Input
from keras.optimizers import SGD, Adam
from keras.utils import np_utils
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from kaggle_datasets import KaggleDatasets


# TPU Configuration


In [None]:


# TPU or GPU detection
# Probe for a TPU cluster. When the resolver raises ValueError, `tpu` is set
# to None and TensorFlow's default distribution strategy is used instead.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU resolved: use whatever strategy TensorFlow currently provides.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    
def seed_everything(seed=0):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy and TensorFlow with ``seed`` and
    sets the ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` environment
    variables.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import so the function is self-contained.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only affects processes spawned afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Single seed shared by seed_everything() and the train/validation split.
seed = 2048
seed_everything(seed)
# Report how many replicas the selected strategy keeps in sync.
print("REPLICAS: ", strategy.num_replicas_in_sync)

# Data access
# GCS path of the Kaggle dataset; format_path() builds image paths on top of it.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
# Let tf.data pick parallelism and prefetch buffer sizes automatically.
AUTO = tf.data.experimental.AUTOTUNE


In [None]:
# Clear TPU memory by re-initialising the TPU system. Skipped when no TPU was
# detected, because `tpu` is None in that case.
if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)

In [None]:
# Configuration
EPOCHS = 40  # number of epochs passed to every fit() call
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # global batch: 16 per replica
IMG_SIZE = 700  # images are resized to IMG_SIZE x IMG_SIZE


## Load data and perform train/validation split

In [None]:

def format_path(st):
    """Return the GCS path of the JPEG whose image id is ``st``."""
    return ''.join((GCS_DS_PATH, '/images/', st, '.jpg'))

# Competition tables: training labels, test ids and the sample submission.
train = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/train.csv')
test = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/test.csv')
sub = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/sample_submission.csv')

# Image ids become GCS paths; the label matrix is every column from
# 'healthy' onwards.
train_paths = train['image_id'].apply(format_path).values
test_paths = test['image_id'].apply(format_path).values
train_labels = train.loc[:, 'healthy':].values

# Optionally hold out 25% of the training rows for validation.
SPLIT_VALIDATION = True
if SPLIT_VALIDATION:
    train_paths, valid_paths, train_labels, valid_labels = train_test_split(
        train_paths,
        train_labels,
        test_size=0.25,
        random_state=seed,
    )

def decode_image(filename, label=None, IMG_SIZE=(IMG_SIZE, IMG_SIZE)):
    """Read a JPEG file and return it as a resized float32 image.

    The file is decoded to 3 channels, cast to float32 and divided by 255,
    then resized to ``IMG_SIZE``.

    Args:
        filename: Path of the JPEG file to read.
        label: Optional label, passed through unchanged.
        IMG_SIZE: Target size handed to ``tf.image.resize``.

    Returns:
        The image alone when ``label`` is None, otherwise ``(image, label)``.
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    image = tf.image.resize(scaled, IMG_SIZE)

    if label is not None:
        return image, label
    return image

def data_augment(image, label=None):
    """Apply a random left/right flip followed by a random up/down flip.

    Args:
        image: Image tensor to augment.
        label: Optional label, passed through unchanged.

    Returns:
        The flipped image alone when ``label`` is None, otherwise
        ``(image, label)``.
    """
    flipped = tf.image.random_flip_left_right(image)
    flipped = tf.image.random_flip_up_down(flipped)

    if label is not None:
        return flipped, label
    return flipped



In [None]:
# Training pipeline, built step by step:
# decode -> cache decoded images -> random flips -> repeat -> shuffle ->
# batch -> prefetch.
train_dataset = tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
train_dataset = train_dataset.map(decode_image, num_parallel_calls=AUTO).cache()
train_dataset = train_dataset.map(data_augment, num_parallel_calls=AUTO)
train_dataset = train_dataset.repeat().shuffle(512)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)
# Same training pipeline as train_dataset, but with a fixed batch size of 64
# instead of BATCH_SIZE.
train_dataset_1 = tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
train_dataset_1 = train_dataset_1.map(decode_image, num_parallel_calls=AUTO).cache()
train_dataset_1 = train_dataset_1.map(data_augment, num_parallel_calls=AUTO)
train_dataset_1 = train_dataset_1.repeat().shuffle(512)
train_dataset_1 = train_dataset_1.batch(64).prefetch(AUTO)
# Validation pipeline: decode, batch, cache the batches, prefetch.
# valid_paths / valid_labels only exist when SPLIT_VALIDATION is set, so the
# dataset is built only then; otherwise it stays None, which is also what the
# fit() calls pass as validation_data in that case.
valid_dataset = None
if SPLIT_VALIDATION:
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((valid_paths, valid_labels))
        .map(decode_image, num_parallel_calls=AUTO)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )

# Test pipeline: decode and batch only.
# No random flips are applied at inference time, so every model predicts on
# the same unmodified images and repeated predict() calls see identical input.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(test_paths)
    .map(decode_image, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

    

In [None]:


# Learning-rate schedule parameters read by lrfn().
# NOTE(review): LR_MAX exceeds LR_START / LR_MIN only with more than 2
# replicas; on a single replica the "ramp-up" actually decreases the rate —
# confirm this is intended for non-TPU runs.
LR_START = 0.0001  # rate at epoch 0
LR_MAX = 0.00005 * strategy.num_replicas_in_sync  # peak rate, scaled by replica count
LR_MIN = 0.0001  # value the decay phase converges towards
LR_RAMPUP_EPOCHS = 4  # epochs of linear ramp from LR_START to LR_MAX
LR_SUSTAIN_EPOCHS = 6  # epochs held at LR_MAX after the ramp
LR_EXP_DECAY = .8  # per-epoch decay factor once the sustain phase ends

def lrfn(epoch):
    """Return the learning rate for the given epoch index.

    The schedule has three phases: a linear ramp from ``LR_START`` to
    ``LR_MAX`` over ``LR_RAMPUP_EPOCHS``, a plateau at ``LR_MAX`` for
    ``LR_SUSTAIN_EPOCHS``, then an exponential decay towards ``LR_MIN`` with
    factor ``LR_EXP_DECAY`` per epoch.
    """
    # Phase 1: linear ramp.
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START

    # Phase 2: plateau.
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX

    # Phase 3: exponential decay towards LR_MIN.
    return (LR_MAX - LR_MIN) * LR_EXP_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN
    
# Keras callback that sets the learning rate from lrfn (verbose logging on).
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)

# Preview the schedule over the configured number of epochs.
rng = list(range(EPOCHS))
y = list(map(lrfn, rng))
plt.plot(rng, y)
print(f"Learning rate schedule: {y[0]:.3g} to {max(y):.3g} to {y[-1]:.3g}")



## EfficientNet

EfficientNet focuses not only on improving the accuracy of models, but also on their efficiency.

### What does scaling mean in the context of CNNs?

There are three scaling dimensions of a CNN: depth, width, and resolution. Depth simply means how deep the network is, which is equivalent to the number of layers in it. Width simply means how wide the network is. One measure of width, for example, is the number of channels in a Conv layer, whereas resolution is simply the image resolution that is being passed to a CNN. The figure below (from the paper itself) will give you a clear idea of what scaling means across different dimensions. We will discuss these in detail as well.

![](https://miro.medium.com/max/1400/1*xQCVt1tFWe7XNWVEmC6hGQ.png)

EfficientNet Architecture

Scaling doesn’t change the layer operations, hence it is better to first have a good baseline network and then scale it along different dimensions using the proposed compound scaling. The authors obtained their base network by doing a Neural Architecture Search (NAS) that optimizes for both accuracy and FLOPS. The architecture is similar to M-NASNet, as it was found using a similar search space. The network layers/blocks are as shown below:
![](https://miro.medium.com/max/1400/1*OpvSpqMP61IO_9cp4mAXnA.png)

In [None]:
import tensorflow as tf

from keras.models import Model
from tensorflow import keras
!pip install -q efficientnet
import efficientnet.tfkeras as efn



# Build and compile the EfficientNetB7 classifier inside the distribution
# strategy's scope: ImageNet-pretrained backbone without its top, followed by
# global average pooling and a 4-way softmax layer.
with strategy.scope():
    efficient_net = efn.EfficientNetB7(
        include_top=False,
        weights='imagenet',
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    pool_layer = tf.keras.layers.GlobalAveragePooling2D()
    head_layer = tf.keras.layers.Dense(4, activation='softmax')
    x = head_layer(pool_layer(efficient_net.output))
    model_effnet = keras.Model(inputs=efficient_net.input, outputs=x)
    model_effnet.compile(
        optimizer='adam',
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

In [None]:
# The training dataset repeats indefinitely, so the length of an epoch is
# fixed explicitly: one pass over the training rows.
STEPS_PER_EPOCH = len(train_labels) // BATCH_SIZE

history = model_effnet.fit(
    train_dataset,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=(valid_dataset if SPLIT_VALIDATION else None),
    callbacks=[lr_callback],
)


In [None]:
# Plot the training loss per epoch, plus the validation loss when recorded.
# The x-axis follows the length of the recorded history instead of a fixed 40,
# so the plot stays valid when EPOCHS changes.
loss_curve = history.history['loss']
epoch_axis = range(len(loss_curve))
legend_labels = ['loss train']
sns.lineplot(x=epoch_axis, y=loss_curve)
# val_loss is only recorded when fit() was given validation data.
if 'val_loss' in history.history:
    sns.lineplot(x=epoch_axis, y=history.history['val_loss'])
    legend_labels.append('loss validation')
plt.legend(legend_labels)
plt.title('Loss evolution')
plt.show()

In [None]:
# Plot the training accuracy per epoch, plus the validation accuracy when
# recorded. The x-axis follows the length of the recorded history instead of
# a fixed 40, so the plot stays valid when EPOCHS changes.
accuracy_curve = history.history['accuracy']
epoch_axis = range(len(accuracy_curve))
legend_labels = ['accuracy train']
sns.lineplot(x=epoch_axis, y=accuracy_curve)
# val_accuracy is only recorded when fit() was given validation data.
if 'val_accuracy' in history.history:
    sns.lineplot(x=epoch_axis, y=history.history['val_accuracy'])
    legend_labels.append('accuracy validation')
plt.legend(legend_labels)
plt.title('accuracy evolution')
plt.show()

In [None]:

# Display the per-epoch metric lists recorded by the most recent fit() call.
history.history

In [None]:
# Predict the softmax outputs for the test set and save them as a submission.
predict = model_effnet.predict(test_dataset)

# Build the frame straight from the model output. An array allocated with
# np.ndarray(shape=...) and never filled would hold uninitialised memory, so
# the submission would not contain the predictions at all.
prediction = pd.DataFrame(
    predict,
    columns=['healthy', 'multiple_diseases', 'rust', 'scab'],
)
df = pd.concat([test.image_id, prediction], axis=1)

df.to_csv('effi_submission.csv', index=False)



# Xception

Xception is an extension of the Inception architecture which replaces the standard Inception modules with depthwise separable convolutions.


The original depthwise separable convolution is the depthwise convolution followed by a pointwise convolution.

- Depthwise convolution is the channel-wise n×n spatial convolution. Suppose that, in the figure below, we have 5 channels; then we will have 5 n×n spatial convolutions.
- Pointwise convolution is the 1×1 convolution used to change the dimension.

Compared with conventional convolution, we do not need to perform convolution across all channels. That means the number of connections is smaller and the model is lighter.

![](https://miro.medium.com/max/1400/1*VvBTMkVRus6bWOqrK1SlLQ.png)


The modified depthwise separable convolution is the pointwise convolution followed by a depthwise convolution. This modification is motivated by the inception module in Inception-v3 that 1×1 convolution is done first before any n×n spatial convolutions. Thus, it is a bit different from the original one. (n=3 here since 3×3 spatial convolutions are used in Inception-v3.)

Two minor differences:

- The order of operations: As mentioned, the original depthwise separable convolutions as usually implemented (e.g. in TensorFlow) first perform the channel-wise spatial convolution and then the 1×1 convolution, whereas the modified depthwise separable convolution performs the 1×1 convolution first and then the channel-wise spatial convolution. This is claimed to be unimportant because, when it is used in a stacked setting, only small differences appear at the beginning and at the end of all the chained inception modules.

- The presence/absence of non-linearity: In the original Inception module, there is a non-linearity after the first operation. In Xception, the modified depthwise separable convolution has NO intermediate ReLU non-linearity.
    
    
![](https://miro.medium.com/max/1400/1*hOcAEj9QzqgBXcwUzmEvSg.png)

In [None]:
# Clear TPU memory by re-initialising the TPU system. Skipped when no TPU was
# detected, because `tpu` is None in that case.
if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50V2,ResNet101V2,ResNet152V2,DenseNet201,Xception
from keras.models import Model
from tensorflow import keras
# Build and compile the Xception classifier inside the distribution
# strategy's scope: ImageNet-pretrained backbone without its top, followed by
# global average pooling and a 4-way softmax layer.
# NOTE: despite its name, `Dense_net` holds the Xception backbone.
with strategy.scope():
    Dense_net = Xception(
        include_top=False,
        weights='imagenet',
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    pool_layer = tf.keras.layers.GlobalAveragePooling2D()
    head_layer = tf.keras.layers.Dense(4, activation='softmax')
    x = head_layer(pool_layer(Dense_net.output))
    model_xception = keras.Model(inputs=Dense_net.input, outputs=x)
    model_xception.compile(
        optimizer='adam',
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

In [None]:
# The training dataset repeats indefinitely, so the length of an epoch is
# fixed explicitly: one pass over the training rows.
STEPS_PER_EPOCH = len(train_labels) // BATCH_SIZE

history = model_xception.fit(
    train_dataset,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=(valid_dataset if SPLIT_VALIDATION else None),
    callbacks=[lr_callback],
)

In [None]:
# Predict the softmax outputs for the test set and save them as a submission.
predict = model_xception.predict(test_dataset)

# Build the frame straight from the model output. An array allocated with
# np.ndarray(shape=...) and never filled would hold uninitialised memory, so
# the submission would not contain the predictions at all.
prediction = pd.DataFrame(
    predict,
    columns=['healthy', 'multiple_diseases', 'rust', 'scab'],
)
df_dense = pd.concat([test.image_id, prediction], axis=1)

df_dense.to_csv('dense_submission.csv', index=False)



# ResNet152
ResNet introduces skip connections (or shortcut connections) that pass the input from a previous layer to a later layer without any modification. Skip connections make it possible to train deeper networks, and ResNet went on to win ILSVRC 2015 in image classification, detection, and localization, as well as MS COCO 2015 detection and segmentation. It was published as a 2016 CVPR paper and has more than 19000 citations.

Skip / Shortcut Connection in Residual Network (ResNet)

To solve the problem of vanishing/exploding gradients, a skip / shortcut connection is added, which adds the input x to the output after a few weight layers, as shown below:
![](https://miro.medium.com/max/894/1*rbhjv7ZdAgXM2MlBUL5Mmw.png)

Hence, the output is H(x) = F(x) + x. The weight layers in effect learn a residual mapping: F(x) = H(x) - x.

Even if there is vanishing gradient for the weight layers, we always still have the identity x to transfer back to earlier layers.
Resnet Architecture:
![](https://miro.medium.com/max/2000/1*6hF97Upuqg_LdsqWY6n_wg.png)

In [None]:
# Clear TPU memory by re-initialising the TPU system. Skipped when no TPU was
# detected, because `tpu` is None in that case.
if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50V2,ResNet101V2,ResNet152V2,DenseNet201
from keras.models import Model
from tensorflow import keras
# Build and compile the ResNet152V2 classifier inside the distribution
# strategy's scope: ImageNet-pretrained backbone without its top, followed by
# global average pooling and a 4-way softmax layer.
with strategy.scope():
    Res_net = ResNet152V2(
        include_top=False,
        weights='imagenet',
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    pool_layer = tf.keras.layers.GlobalAveragePooling2D()
    head_layer = tf.keras.layers.Dense(4, activation='softmax')
    x = head_layer(pool_layer(Res_net.output))
    model_resnet = keras.Model(inputs=Res_net.input, outputs=x)
    model_resnet.compile(
        optimizer='adam',
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

In [None]:
# The training dataset repeats indefinitely, so the length of an epoch is
# fixed explicitly: one pass over the training rows.
STEPS_PER_EPOCH = len(train_labels) // BATCH_SIZE

history = model_resnet.fit(
    train_dataset,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=(valid_dataset if SPLIT_VALIDATION else None),
    callbacks=[lr_callback],
)

In [None]:
# Predict the softmax outputs for the test set and save them as a submission.
predict = model_resnet.predict(test_dataset)

# Build the frame straight from the model output. An array allocated with
# np.ndarray(shape=...) and never filled would hold uninitialised memory, so
# the submission would not contain the predictions at all.
prediction = pd.DataFrame(
    predict,
    columns=['healthy', 'multiple_diseases', 'rust', 'scab'],
)
df_res = pd.concat([test.image_id, prediction], axis=1)

df_res.to_csv('res_submission.csv', index=False)


## Ensemble Process

In [None]:
def voting(a, b, c):
    """Combine three model outputs for one cell into a single value.

    Returns the value shared by at least two of the inputs (majority vote).
    When all three differ — the usual case for probabilities — the mean of
    the three is returned (soft voting) rather than an implicit ``None``.

    Args:
        a: Output of the first model.
        b: Output of the second model.
        c: Output of the third model.

    Returns:
        The majority value, or the arithmetic mean when there is no majority.
    """
    if a == b:
        return a
    if b == c:
        return c
    if a == c:
        return a
    # No two models agree: fall back to averaging.
    return (a + b + c) / 3


In [None]:
# Combine the three per-model submissions with voting(), one label column at
# a time, and write the ensembled result.
image_id = df['image_id']
label_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
row_count = len(df['healthy'])

voted = {
    col: [
        voting(df[col][i], df_dense[col][i], df_res[col][i])
        for i in range(row_count)
    ]
    for col in label_columns
}
healthy = voted['healthy']
multiple_diseases = voted['multiple_diseases']
rust = voted['rust']
scab = voted['scab']

finalsubmission = pd.DataFrame(columns=['image_id', 'healthy', 'multiple_diseases', 'rust', 'scab'])

finalsubmission['image_id'] = image_id
for col in label_columns:
    finalsubmission[col] = voted[col]
finalsubmission.to_csv('submission.csv', index=False)


# VKO Ensemble Process

In [None]:
# TODO: unfinished placeholder cell. Two assignments below have no right-hand
# side, and train_test_split is referenced but never called, so this cell
# cannot run as written.
# NOTE(review): presumably X_train_cv / X_test are meant to hold the features
# for the stacked classifier in the next cell — confirm and fill in.
X_train_cv =

X_train_cv, X_validation_cv, y_train_cv, y_validation_cv = train_test_split

X_test = 


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Meta-classifier for the stacked ensemble, tuned with a grid search.
model_stacked = XGBClassifier(n_jobs=-1)

# Every grid entry maps a parameter name to a list of candidate values.
params = {
    'max_depth': [5, 10, 20],
    'booster': ['gblinear', 'gbtree', 'dart'],
}

cv = GridSearchCV(
    estimator=model_stacked,
    param_grid=params,
    n_jobs=-1,
    scoring='accuracy',
    cv=5,  # stratified
)

# fit() requires the training features and targets; these are the names the
# train/validation split cell unpacks into.
cv.fit(X_train_cv, y_train_cv)