
<h1 style='background-color:#C19A6B; font-family:"Times New Roman", serif; font-size:200%; text-align:center; border-radius: 15px 50px;' > Herbarium 2021 - Half-Earth Challenge - FGVC8 </h1>


#### Identify plant species of the Americas, Oceania and the Pacific from herbarium specimens



<img src="https://biokic.asu.edu/sites/default/files/styles/panopoly_image_full/public/img_2061-fred_irish_collections_1.jpg?itok=PdE3WGEJ" width="1000px">


## Data Overview
The training and test set contain images of herbarium specimens from nearly 65,000 species of vascular plants. Each image contains exactly one specimen. The text labels on the specimen images have been blurred to remove category information in the image.

The data has been approximately split 80%/20% for training/test. Each category has at least 1 instance in both the training and test datasets. Note that the test set distribution is slightly different from the training set distribution. The training set contains species with hundreds of examples, but the test set has the number of examples per species capped at a maximum of 10.


## Dataset Details
Each image has different image dimensions, with a maximum of 1000 pixels in the larger dimension. These have been resized from the original image resolution. All images are in JPEG format.

## Dataset Format
This dataset uses the COCO dataset format with additional annotation fields. In addition to the species category labels, we also provide region and supercategory information.

The training set metadata (train/metadata.json) and test set metadata (test/metadata.json) are JSON files in the COCO-style format described above. Naturally, the test set metadata file omits the "annotations", "categories", and "regions" elements.

#### Dataset Link 

##### [Here](https://www.kaggle.com/c/herbarium-2021-fgvc8/overview)


## What is the VGG16 model?
VGG16 (also called OxfordNet) is a convolutional neural network architecture named after the Visual Geometry Group from Oxford, who developed it. ... By only keeping the convolutional modules, our model can be adapted to arbitrary input sizes. The model loads a set of weights pre-trained on ImageNet.

<img src="https://storage.googleapis.com/lds-media/images/vgg16-architecture.original.jpg" width="1000px">


In [None]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import json
import PIL
import skimage.io
import seaborn as sn
from collections import Counter
import tensorflow as tf
from sklearn.model_selection import train_test_split
import keras.backend as K
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.utils import to_categorical, Sequence
from keras.models import Sequential,Model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Activation,BatchNormalization
from keras.optimizers import RMSprop,Adam
from keras.applications import ResNet50
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings("ignore")

In [None]:
path = '/kaggle/input/herbarium-2021-fgvc8/'
os.listdir(path)

In [None]:
samp_subm = pd.read_csv(path+'sample_submission.csv')

In [None]:
# Read the train and test metadata JSON files into plain Python objects.
with open(path+'train/'+'metadata.json') as train_fh, open(path+'test/'+'metadata.json') as test_fh:
    train_data = json.load(train_fh)
    test_data = json.load(test_fh)

In [None]:
def plot_examples():
    """Show the first 16 training images in a 4x4 grid, titled by plant family.

    Reads the notebook-level ``path`` and ``train_data``. The family in each
    title is resolved through the image's own annotation
    (image id -> category id -> family) instead of taking the i-th entry of
    the category list, so the title describes the picture it sits above.

    Raises:
        FileNotFoundError: if one of the image files cannot be read.
    """
    sample_images = train_data['images'][:16]

    # NOTE(review): assumes COCO-style keys ('id' on images/categories,
    # 'image_id' on annotations) -- confirm against metadata.json.
    wanted_ids = {image_info['id'] for image_info in sample_images}
    category_of_image = {
        ann['image_id']: ann['category_id']
        for ann in train_data['annotations']
        if ann['image_id'] in wanted_ids
    }
    family_of_category = {cat['id']: cat['family'] for cat in train_data['categories']}

    fig, axs = plt.subplots(4, 4, figsize=(20, 20))
    fig.subplots_adjust(hspace = .1, wspace=.1)

    axs = axs.ravel()
    for ax, image_info in zip(axs, sample_images):
        file_path = path+'train/'+image_info['file_name']
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError('Could not read image: ' + file_path)
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(family_of_category[category_of_image[image_info['id']]])
        ax.set_xticklabels([])
        ax.set_yticklabels([])
    plt.show()

In [None]:
print('Number of train images:', len(train_data['images']))
print('Number of test images:', len(test_data['images']))

In [None]:
train_data['annotations'][0]

In [None]:
train_data['categories'][0]

In [None]:
train_data['images'][0]

In [None]:
plot_examples()

In [None]:
# Build the training table (file name + species id) by joining every image to
# its annotation on the image id, instead of assuming that the 'images' and
# 'annotations' lists happen to be stored in the same order.
# NOTE(review): assumes COCO-style keys ('id' on images, 'image_id' on
# annotations) -- confirm against metadata.json.
df_image = pd.json_normalize(train_data['images'])
df_annot = pd.json_normalize(train_data['annotations'])
df_train_data = (
    df_image[['id', 'file_name']]
    .merge(df_annot[['image_id', 'category_id']],
           left_on='id', right_on='image_id', how='inner')
    [['file_name', 'category_id']]
    .reset_index(drop=True)
)

In [None]:
# Hold out 30% of the labelled images for validation. A fixed seed keeps the
# split (and therefore every reported metric) reproducible between runs.
df_train_data, df_val_data = train_test_split(df_train_data, test_size=0.3, random_state=42)
# Renumber rows 0..n-1 so positional and label-based lookups agree.
df_train_data = df_train_data.reset_index(drop=True)
df_val_data = df_val_data.reset_index(drop=True)

In [None]:
# Test frame: one row per test image. The category column is filled with a
# constant 0 so the frame has the same two columns as the training one.
df_image = pd.json_normalize(test_data['images'])
df_test_data = df_image[['file_name']].copy()
df_test_data['category_id'] = 0

In [None]:
print('Number of train samples:', len(df_train_data))
print('Number of val samples:', len(df_val_data))
print('Number of test samples:', len(df_test_data))

In [None]:
print('Number of categories:', len(df_train_data['category_id'].unique()))

In [None]:
df_train_data['category_id'].value_counts()[0:10]

In [None]:
q_size = 64
img_channel = 3
num_classes = 64500
batch_size = 32
epochs = 5

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that reads images from disk one batch at a time.

    Each batch is a pair ``(X, y)``: ``X`` holds the images resized to
    ``img_size`` x ``img_size`` and divided by 255, ``y`` holds one-hot rows of
    width ``num_classes``. The last batch may be shorter than ``batch_size``;
    it is never padded with blank samples.

    Args:
        path: Directory prefix that is prepended to every file name.
        list_IDs: Image file names (any iterable, e.g. a pandas Series).
        labels: Integer class ids, aligned one-to-one with ``list_IDs``.
        batch_size: Maximum number of samples per batch.
        img_size: Side length of the square output image.
        img_channel: Number of channels of the output array.
        num_classes: Width of the one-hot label rows.

    Raises:
        ValueError: if ``list_IDs`` and ``labels`` differ in length, or a
            label lies outside ``[0, num_classes)``.
        FileNotFoundError: if an image cannot be read.
    """

    def __init__(self, path, list_IDs, labels, batch_size, img_size, img_channel, num_classes):
        super().__init__()
        self.path = path
        # Plain lists are indexed by position, whatever index a pandas Series
        # argument happened to carry.
        self.list_IDs = list(list_IDs)
        self.labels = list(labels)
        if len(self.list_IDs) != len(self.labels):
            raise ValueError('list_IDs and labels must have the same length')
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_channel = img_channel
        self.num_classes = num_classes
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return (len(self.list_IDs) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` pair."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Labels are selected with the same dataset-wide indexes as the files,
        # so every image stays paired with its own label.
        labels_temp = [self.labels[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp, labels_temp)
        return X, y

    def __data_generation(self, list_IDs_temp, labels_temp):
        """Load the given files and build the image and one-hot label arrays."""
        n_samples = len(list_IDs_temp)
        # Sized by the actual sample count so a short last batch is not padded
        # with all-zero images and all-zero label rows.
        X = np.zeros((n_samples, self.img_size, self.img_size, self.img_channel))
        y = np.zeros((n_samples, self.num_classes), dtype=int)
        for i, (ID, label) in enumerate(zip(list_IDs_temp, labels_temp)):
            img = cv2.imread(self.path+ID)
            if img is None:
                # cv2.imread signals failure by returning None rather than raising.
                raise FileNotFoundError('Could not read image: ' + self.path + ID)
            img = cv2.resize(img, (self.img_size, self.img_size))
            X[i, ] = img/255
            label = int(label)
            if not 0 <= label < self.num_classes:
                raise ValueError('label %d is outside [0, %d)' % (label, self.num_classes))
            y[i, label] = 1
        return X, y

In [None]:
# Keep only the leading rows of every frame so the notebook runs quickly.
number_samples = 10000
df_train_data = df_train_data.iloc[:number_samples]
df_val_data = df_val_data.iloc[:number_samples]
df_test_data = df_test_data.iloc[:number_samples]

In [None]:
df_train_data

In [None]:
# Settings shared by the three batch generators.
generator_settings = dict(batch_size=batch_size, img_size=q_size,
                          img_channel=img_channel, num_classes=num_classes)

# Train and validation images both live under train/; test images under test/.
train_generator = DataGenerator(path=path+'train/',
                                list_IDs=df_train_data['file_name'],
                                labels=df_train_data['category_id'],
                                **generator_settings)
val_generator = DataGenerator(path=path+'train/',
                              list_IDs=df_val_data['file_name'],
                              labels=df_val_data['category_id'],
                              **generator_settings)
test_generator = DataGenerator(path=path+'test/',
                               list_IDs=df_test_data['file_name'],
                               labels=df_test_data['category_id'],
                               **generator_settings)

In [None]:
base_model = tf.keras.applications.VGG16(input_shape=(q_size, q_size, img_channel),include_top=False,weights="imagenet")

In [None]:
# Freezing Layers
#
# VGG16 without its classifier head has 19 layers, so a slice such as
# layers[:-20] is empty and freezes nothing. Freeze everything except the last
# n_trainable_layers layers (the final convolution block and its pooling
# layer), which stay trainable for fine-tuning.
n_trainable_layers = 4
for layer in base_model.layers[:-n_trainable_layers]:
    layer.trainable = False

In [None]:
# Building Model: pretrained convolutional base followed by a new dense
# classifier head that ends in a softmax over num_classes.
model = Sequential([
    base_model,
    Dropout(0.5),
    # Add new layers
    Flatten(),
    Dense(4096, activation='relu'),
    Dropout(0.5),
    Dense(4096, activation='relu'),
    Dropout(0.3),
    Dense(4096, activation='relu'),
    Dropout(0.4),
    Dense(num_classes, activation='softmax'),
])

model.summary()

In [None]:
from tensorflow.keras.utils import plot_model
from IPython.display import Image
plot_model(model, to_file='convnet.png', show_shapes=True,show_layer_names=True)
Image(filename='convnet.png') 

In [None]:
def f1_score(y_true, y_pred): #taken from old keras source code
    """Return the F1 score of the given tensors.

    Both tensors are clipped to [0, 1] and rounded before counting, and
    ``K.epsilon()`` is added to every denominator to avoid division by zero.
    """
    eps = K.epsilon()
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = hits / (flagged_positives + eps)
    recall = hits / (actual_positives + eps)
    return 2*(precision*recall)/(precision+recall+eps)

In [None]:
# Metrics reported during training and validation.
# Accuracy is the categorical (argmax) accuracy: with one-hot targets over
# tens of thousands of classes, element-wise binary accuracy is close to 100%
# whatever the model predicts, because almost every target entry is 0.
# The metric keeps the name 'accuracy' so the history keys stay the same.
METRICS = [
      tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),  
      tf.keras.metrics.AUC(name='auc'),
        f1_score,
]

In [None]:
METRICS

In [None]:
# Halve the learning rate when val_loss has not improved for 2 epochs.
lrd = ReduceLROnPlateau(monitor='val_loss', factor=0.50, patience=2, min_lr=1e-4, verbose=1)

# Write the model to model.h5.
mcp = ModelCheckpoint(filepath='model.h5')

# Stop training after 2 epochs without improvement.
es = EarlyStopping(patience=2, verbose=1)

In [None]:
model.compile(optimizer=Adam(), loss='categorical_crossentropy',metrics=METRICS)

In [None]:
%time
history = model.fit_generator(generator=train_generator,validation_data=val_generator,epochs = epochs,verbose = 1)

In [None]:
#%% PLOTTING RESULTS (Train vs Validation FOLDER 1)

def Train_Val_Plot(acc,val_acc,loss,val_loss,auc,val_auc,precision,val_precision,f1,val_f1):
    """Plot training and validation curves for five metrics side by side.

    Each argument is a sequence with one value per epoch. The five panels show,
    from left to right: accuracy, loss, AUC, precision and F1 score.
    """
    # (panel title, y-axis label, training curve, validation curve)
    panels = [
        ('History of Accuracy', 'Accuracy', acc, val_acc),
        ('History of Loss', 'Loss', loss, val_loss),
        ('History of AUC', 'AUC', auc, val_auc),
        ('History of Precision', 'Precision', precision, val_precision),
        ('History of F1-score', 'F1 score', f1, val_f1),
    ]

    fig, axes = plt.subplots(1,5, figsize= (20,5))
    fig.suptitle(" MODEL'S METRICS VISUALIZATION ")

    for axis, (title, y_label, train_curve, val_curve) in zip(axes, panels):
        # Epochs are numbered from 1 on the x-axis.
        axis.plot(range(1, len(train_curve) + 1), train_curve)
        axis.plot(range(1, len(val_curve) + 1), val_curve)
        axis.set_title(title)
        axis.set_xlabel('Epochs')
        axis.set_ylabel(y_label)
        axis.legend(['training', 'validation'])

    plt.show()
    

# Collect each metric's training curve followed by its validation curve, in
# the order Train_Val_Plot expects its arguments.
plotted_metrics = ['accuracy', 'loss', 'auc', 'precision', 'f1_score']
curves = []
for metric_name in plotted_metrics:
    curves.append(history.history[metric_name])
    curves.append(history.history['val_' + metric_name])

Train_Val_Plot(*curves)

In [None]:
# Model.predict accepts Sequence objects directly; predict_generator is deprecated.
predict = model.predict(test_generator, verbose=1)

In [None]:
# Write the most probable class of every predicted test image into the leading
# rows of the submission frame (.loc slices include the end label).
n_test_rows = len(df_test_data.index)
predicted_classes = predict.argmax(axis=1)
samp_subm.loc[0:n_test_rows-1, 'Predicted'] = predicted_classes[0:n_test_rows]

In [None]:
Sub = samp_subm.copy()
Sub.to_csv('submission.csv', index=False)