# Intro
Welcome to the [Herbarium 2021 - Half-Earth Challenge - FGVC8](https://www.kaggle.com/c/herbarium-2021-fgvc8) competition.
![](https://storage.googleapis.com/kaggle-competitions/kaggle/25558/logos/header.png)

<span style="color: royalblue;">Please vote the notebook up if it helps you. Feel free to leave a comment above the notebook. Thank you. </span>

# Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import json
from collections import Counter

from sklearn.model_selection import train_test_split

from keras.utils import to_categorical, Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Activation
from keras.optimizers import RMSprop,Adam
from keras.applications import ResNet50

import warnings
warnings.filterwarnings("ignore")

# Path

In [None]:
# Root directory of the competition data. Paths below are built by string
# concatenation, so the trailing slash is required.
path = '/kaggle/input/herbarium-2021-fgvc8/'
# List the entries of the data directory (shown as the cell output).
os.listdir(path)

# Load Data

In [None]:
# Sample submission table; its 'Predicted' column is filled in after inference.
samp_subm = pd.read_csv(os.path.join(path, 'sample_submission.csv'))

In [None]:
# Load the metadata JSON files of the train and test splits.
# JSON is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open(path+'train/'+'metadata.json', encoding='utf-8') as f:
    train_data = json.load(f)
with open(path+'test/'+'metadata.json', encoding='utf-8') as f:
    test_data = json.load(f)

# Functions

In [None]:
def plot_examples():
    """Plot the first 16 training images in a 4x4 grid, titled by plant family.

    Reads the module-level ``path`` and ``train_data``. The title of each
    panel is the family of the category that the image is annotated with,
    not the family of whichever category sits at the same list position.
    """
    # Look up families by category id.
    # NOTE(review): assumes every category record carries an 'id' key that
    # annotations reference through 'category_id' - confirm against the data.
    family_by_id = {cat['id']: cat['family'] for cat in train_data['categories']}

    fig, axs = plt.subplots(4, 4, figsize=(20, 20))
    fig.subplots_adjust(hspace = .1, wspace=.1)

    # Images and annotations are paired by position, the same convention the
    # notebook uses when it builds the training frame.
    samples = zip(axs.ravel(),
                  train_data['images'][:16],
                  train_data['annotations'][:16])
    for ax, image_info, annotation in samples:
        img = cv2.imread(path+'train/'+image_info['file_name'])
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(family_by_id[annotation['category_id']])
        ax.set_xticklabels([])
        ax.set_yticklabels([])
    plt.show()

# Overview

In [None]:
# Report how many images each split's metadata lists.
print(f"Number of train images: {len(train_data['images'])}")
print(f"Number of test images: {len(test_data['images'])}")

# EDA

## Focus Train Data Structure
The train metadata contains the following keys: "annotations", "categories", "images", "info", "licenses" and "institutions".

In [None]:
# First annotation record of the train metadata.
train_data['annotations'][0]

In [None]:
# First category record of the train metadata.
train_data['categories'][0]

In [None]:
# First image record of the train metadata.
train_data['images'][0]

In [None]:
# The 'info' entry of the train metadata.
train_data['info']

In [None]:
# First license record of the train metadata.
train_data['licenses'][0]

In [None]:
# First institution record of the train metadata.
train_data['institutions'][0]

## Focus Test Data Structure
The test metadata contains the following keys: "images", "info" and "licenses".

In [None]:
# First image record of the test metadata.
test_data['images'][0]

In [None]:
# The 'info' entry of the test metadata.
test_data['info']

In [None]:
# First license record of the test metadata.
test_data['licenses'][0]

## Plot Some Examples

In [None]:
# Draw the 4x4 grid of the first 16 training images.
plot_examples()

## Prepare Data For Data Generator
Train Data:

In [None]:
df_image = pd.json_normalize(train_data['images'])
df_annot = pd.json_normalize(train_data['annotations'])
# Pair each image with its label through the image id instead of relying on
# both lists being stored in the same order; a positional pairing would
# silently mislabel images if the orders ever differed.
# NOTE(review): assumes image records expose 'id' and annotation records
# expose 'image_id' - confirm against the metadata.
df_train_data = df_image[['id', 'file_name']].merge(
    df_annot[['image_id', 'category_id']],
    left_on='id',
    right_on='image_id',
    how='inner',
)[['file_name', 'category_id']]

Split Train And Validation Data

In [None]:
# Hold out 30% of the labelled images for validation. A fixed seed keeps the
# split identical across notebook runs so results are comparable.
df_train_data, df_val_data = train_test_split(df_train_data, test_size=0.3, random_state=42)
# Renumber rows 0..n-1; the data generator indexes samples by these numbers.
df_train_data = df_train_data.reset_index(drop=True)
df_val_data = df_val_data.reset_index(drop=True)

Test Data

In [None]:
df_image = pd.json_normalize(test_data['images'])
# Test images have no labels; category_id is a constant 0 placeholder so the
# frame has the same columns as the train/validation frames.
df_test_data = df_image[['file_name']].assign(category_id=0)

Summary

In [None]:
# Row counts of the three frames fed to the data generators.
print(f'Number of train samples: {len(df_train_data)}')
print(f'Number of val samples: {len(df_val_data)}')
print(f'Number of test samples: {len(df_test_data)}')

## Distribution Of Categories

In [None]:
# Distinct category ids present in the training split (missing values, if
# any, count as one value, matching len(unique())).
print('Number of categories:', df_train_data['category_id'].nunique(dropna=False))

In [None]:
# The ten most frequent category ids with their sample counts.
df_train_data['category_id'].value_counts().head(10)

# Data Generator

Parameters

In [None]:
# Side length the images are resized to; also used for the model input shape.
q_size = 64
# Number of image channels.
img_channel = 3
# Width of the one-hot label vectors and of the model's output layer.
num_classes = 64500
# Samples per generator batch.
batch_size = 32
# Number of training epochs.
epochs = 5

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that loads images from disk and one-hot encodes labels.

    Each batch is a tuple ``(X, y)``. ``X`` holds the images resized to
    ``img_size`` x ``img_size`` and divided by 255; ``y`` holds one row per
    image with a single 1 in the column of its class id. The final batch is
    smaller than ``batch_size`` when the sample count is not a multiple of it.

    Args:
        path: Directory prefix prepended (by string concatenation) to each
            file name.
        list_IDs: Image file names, one per sample.
        labels: Integer class ids, in the same order as ``list_IDs``.
        batch_size: Maximum number of samples per batch.
        img_size: Side length the images are resized to.
        img_channel: Number of channels of the image array.
        num_classes: Width of the one-hot label vectors.
    """

    def __init__(self, path, list_IDs, labels, batch_size, img_size, img_channel, num_classes):
        super().__init__()
        self.path = path
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_channel = img_channel
        self.num_classes = num_classes
        # Positional copies: a pandas Series would otherwise be indexed by
        # label, which breaks for inputs whose index is not 0..n-1.
        self._file_names = np.asarray(list_IDs)
        self._targets = np.asarray(labels)
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        n_samples = len(self.list_IDs)
        return (n_samples + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, y)``."""
        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]
        return self.__data_generation(batch_indexes)

    def __data_generation(self, batch_indexes):
        """Load the images and build the labels for the given sample positions.

        Raises:
            FileNotFoundError: If an image cannot be read from disk.
        """
        n_rows = len(batch_indexes)
        # Size the arrays to the real batch so a short final batch does not
        # contribute all-zero images with all-zero labels.
        X = np.zeros((n_rows, self.img_size, self.img_size, self.img_channel))
        y = np.zeros((n_rows, self.num_classes), dtype=int)
        for row, sample_idx in enumerate(batch_indexes):
            file_name = self._file_names[sample_idx]
            img = cv2.imread(self.path + file_name)
            if img is None:
                # cv2.imread signals failure by returning None.
                raise FileNotFoundError(f'Could not read image: {self.path + file_name}')
            img = cv2.resize(img, (self.img_size, self.img_size))
            X[row] = img / 255
            # The label must come from the sample's position in the data set,
            # not from its row inside the batch.
            y[row, int(self._targets[sample_idx])] = 1
        return X, y

# Test On Subset
Both train and test data sets are big. To test the algorithms we work on a small subset.

In [None]:
# Cap every frame at the first `number_samples` rows for a quick trial run.
number_samples = 10000
df_train_data = df_train_data.iloc[:number_samples]
df_val_data = df_val_data.iloc[:number_samples]
df_test_data = df_test_data.iloc[:number_samples]

# Define Train, Val And Test Data

In [None]:
# One generator per split. Train and validation images both live under
# 'train/'; test images live under 'test/'.
train_generator = DataGenerator(path=path+'train/',
                                list_IDs=df_train_data['file_name'],
                                labels=df_train_data['category_id'],
                                batch_size=batch_size,
                                img_size=q_size,
                                img_channel=img_channel,
                                num_classes=num_classes)
val_generator = DataGenerator(path=path+'train/',
                              list_IDs=df_val_data['file_name'],
                              labels=df_val_data['category_id'],
                              batch_size=batch_size,
                              img_size=q_size,
                              img_channel=img_channel,
                              num_classes=num_classes)
test_generator = DataGenerator(path=path+'test/',
                               list_IDs=df_test_data['file_name'],
                               labels=df_test_data['category_id'],
                               batch_size=batch_size,
                               img_size=q_size,
                               img_channel=img_channel,
                               num_classes=num_classes)

# Load Pretrained Model

In [None]:
# Local ResNet50 weights file passed to the constructor below.
weights='../input/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
# ResNet50 without its top layers (include_top=False), built for
# q_size x q_size inputs with img_channel channels.
conv_base = ResNet50(weights=weights,
                     include_top=False,
                     input_shape=(q_size, q_size, img_channel))
# Keep the base trainable so its weights are updated during training.
conv_base.trainable = True

# Define Model

In [None]:
model = Sequential()
model.add(conv_base)
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='sigmoid'))

In [None]:
model.compile(optimizer = RMSprop(lr=1e-5),
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Train on the generator. `Model.fit` accepts `Sequence` objects directly;
# `fit_generator` is deprecated.
history = model.fit(train_generator,
                    validation_data=val_generator,
                    epochs=epochs)

# Analyse Training

In [None]:
# Plot the loss (left) and binary accuracy (right) per epoch for the train
# and validation data.
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
fig.subplots_adjust(hspace = .2, wspace=.2)
axs = axs.ravel()
loss = history.history['loss']
loss_val = history.history['val_loss']
# Use a separate name for the x-axis values: rebinding `epochs` would
# overwrite the training hyperparameter and break a re-run of the fit cell.
epoch_axis = range(1, len(loss)+1)
axs[0].plot(epoch_axis, loss, 'bo', label='loss_train')
axs[0].plot(epoch_axis, loss_val, 'ro', label='loss_val')
axs[0].set_title('Value of the loss function')
axs[0].set_xlabel('epochs')
axs[0].set_ylabel('value of the loss function')
axs[0].legend()
axs[0].grid()
acc = history.history['binary_accuracy']
acc_val = history.history['val_binary_accuracy']
axs[1].plot(epoch_axis, acc, 'bo', label='accuracy_train')
axs[1].plot(epoch_axis, acc_val, 'ro', label='accuracy_val')
axs[1].set_title('Accuracy')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Value of accuracy')
axs[1].legend()
axs[1].grid()
plt.show()

# Predict Test Data

In [None]:
# Class scores for every test batch. `Model.predict` accepts `Sequence`
# objects directly; `predict_generator` is deprecated.
predict = model.predict(test_generator, verbose=1)

In [None]:
# Highest-scoring class per row, computed once.
predicted_ids = predict.argmax(axis=1)
# Write predictions into the first len(df_test_data) submission rows; the
# slice on the right drops any extra rows the generator may have produced.
# NOTE(review): assumes sample_submission rows are in the same order as the
# test metadata images - confirm.
samp_subm.loc[0:len(df_test_data.index)-1, 'Predicted'] = predicted_ids[0:len(df_test_data.index)]

# Export

In [None]:
# Work on a copy so the in-memory submission table is left untouched.
output = samp_subm.copy()
# Write the submission file without the row index column.
output.to_csv('submission.csv', index=False)