# Herbarium - EfficientNetB3

This notebook is for the Herbarium 2020 FGVC7 Challenge, whose main goal is to identify vascular plant species from a large dataset of herbarium specimen images.

In [None]:
!pip install efficientnet 

In [None]:
import numpy as np
import pandas as pd
import os
import json, codecs
import tensorflow as tf
from efficientnet.keras import EfficientNetB3

import keras
from kaggle_datasets import KaggleDatasets
from keras.models import Model
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten, BatchNormalization, Input, concatenate
from keras.optimizers import Adam
from keras.utils import plot_model
from sklearn.model_selection import train_test_split as tts

# Show which TensorFlow version is installed in this environment.
print(tf.__version__)

In [None]:
def _read_metadata(path):
    """Parse the JSON document stored at ``path``.

    The file is decoded as UTF-8; byte sequences that cannot be decoded are
    dropped (``errors='ignore'``) instead of raising.
    """
    with codecs.open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        return json.load(handle)


# Metadata describing the training images and the test images of the competition.
train_meta = _read_metadata("../input/herbarium-2020-fgvc7/nybg2020/train/metadata.json")
test_meta = _read_metadata("../input/herbarium-2020-fgvc7/nybg2020/test/metadata.json")

In [None]:
# Turn each section of the training metadata into its own table.  Columns are renamed
# by position so that every table gets a distinct key column to merge on.
# NOTE(review): the positional renaming assumes pd.DataFrame yields the columns in
# exactly this order for each section - confirm against the metadata file.
train_cat = pd.DataFrame(train_meta['categories'])
train_cat.columns = ['family', 'genus', 'category_id', 'category_name']

train_img = pd.DataFrame(train_meta['images'])
train_img.columns = ['file_name', 'height', 'image_id', 'license', 'width']

train_reg = pd.DataFrame(train_meta['regions'])
train_reg.columns = ['region_id', 'region_name']

# Start from the annotations and attach the category, image and region tables in turn.
# Outer joins keep rows without a match on the other side (missing fields become NaN).
train_df = pd.DataFrame(train_meta['annotations'])
for lookup_table, join_key in ((train_cat, 'category_id'),
                               (train_img, 'image_id'),
                               (train_reg, 'region_id')):
    train_df = train_df.merge(lookup_table, on=join_key, how='outer')
train_df

In [None]:
# Report the column dtypes and non-null counts of the merged training frame.
train_df.info()

In [None]:
# Keep only the rows that have an image file attached; rows without one can be
# produced by the outer merges.  A boolean mask is vectorised and does not depend on
# the index labels, so the cell can safely be re-run on an already filtered frame.
train_df = train_df[train_df.file_name.notna()]

# Target dtype of every column, listed in the column order of train_df.
dtypes = ['int32', 'int32', 'int32', 'int32', 'object', 'object', 'object', 'object', 'int32', 'int32', 'int32', 'object']
# astype() returns a new frame, which avoids assigning columns on a slice
# (and the SettingWithCopyWarning that comes with it).
train_df = train_df.astype(dict(zip(train_df.columns, dtypes)))
# info() prints its report itself and returns None, so it must not be wrapped in print().
train_df.info()

In [None]:
# Table of test images; columns are renamed by position to match the training image table.
test_df = pd.DataFrame(test_meta['images'])
test_df.columns = ['file_name', 'height', 'image_id', 'license', 'width']
# info() prints its report itself and returns None, so it must not be wrapped in print().
test_df.info()

# Number of rows in train_df followed by the number of distinct values per column
# (dropna=False counts a missing value as one distinct value, like len(unique())).
print("Total Unique Values for each columns:")
print("{0:10s} \t {1:10d}".format('train_df', len(train_df)))
for col in train_df.columns:
    print("{0:10s} \t {1:10d}".format(col, train_df[col].nunique(dropna=False)))

# Row count per (family, genus) pair, summarised with describe().
family = train_df[['family', 'genus', 'category_name']].groupby(['family', 'genus']).count()
display(family.describe())

In [None]:
# Work on an independent copy so the label encoding below neither touches train_df
# nor triggers SettingWithCopyWarning.
m = train_df[['file_name', 'family', 'genus']].copy()

# Replace the family / genus names with integer class ids.  pd.factorize numbers the
# labels in order of first appearance, i.e. the id of a label is its position in
# `fam` / `gen`, and does so in one vectorised pass instead of a list search per row.
# Missing labels, if there are any, are encoded as -1.
family_codes, family_labels = pd.factorize(m['family'])
genus_codes, genus_labels = pd.factorize(m['genus'])

# Lookup lists mapping class id -> original name.
fam = family_labels.tolist()
gen = genus_labels.tolist()

m['family'] = family_codes
m['genus'] = genus_codes
display(m)

In [None]:


def fg_model(shape, lr, n_family=310, n_genus=3678, freeze_genus=True):
    """Build and compile the two-headed family/genus classifier.

    An ImageNet-pretrained EfficientNetB3 (without its top, global max pooling)
    extracts a feature vector.  A softmax layer named "family" is computed from
    these features; a softmax layer named "genus" is computed from the features
    concatenated with the family probabilities.

    Args:
        shape: Input image shape passed to ``Input`` and to the backbone,
            e.g. ``(300, 300, 3)``.
        lr: Learning rate of the Adam optimizer (AMSGrad variant).
        n_family: Number of units of the "family" output layer.
        n_genus: Number of units of the "genus" output layer.
        freeze_genus: If True (default), the "genus" layer is marked as
            non-trainable, so only the "family" layer is updated by training.

    Returns:
        A compiled Keras ``Model`` with outputs ``[family, genus]``, both trained
        with sparse categorical cross-entropy and reporting accuracy.
    """
    inputs = Input(shape)
    backbone = EfficientNetB3(weights='imagenet', include_top=False,
                              input_shape=shape, pooling='max')
    features = backbone(inputs)

    family_out = Dense(n_family, name="family", activation='softmax')(features)
    # The genus head sees the image features together with the family prediction.
    genus_in = concatenate([features, family_out])
    genus_out = Dense(n_genus, name="genus", activation='softmax')(genus_in)
    model = Model(inputs=inputs, outputs=[family_out, genus_out])

    # The pretrained backbone is never trained; it is frozen through its own
    # reference rather than through its position in model.layers.
    backbone.trainable = False
    if freeze_genus:
        model.get_layer('genus').trainable = False

    opt = Adam(lr=lr, amsgrad=True)
    model.compile(optimizer=opt,
                  loss=['sparse_categorical_crossentropy',
                        'sparse_categorical_crossentropy'],
                  metrics=['accuracy'])

    return model



In [None]:
from keras.preprocessing.image import ImageDataGenerator
from efficientnet.keras import preprocess_input

# The ImageNet weights of the EfficientNet backbone were trained on normalised inputs,
# so every image is passed through the package's own preprocess_input instead of being
# fed to the frozen backbone as raw 0-255 pixel values.
# NOTE(review): weight files produced with un-normalised inputs do not match this input
# distribution; the heads should be re-trained rather than resumed from such files.
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

In [None]:
# Build the classifier for 300x300 RGB inputs with a learning rate of 0.01 and print
# its layer summary.
model = fg_model((300,300,3), 0.01) #Efficientnet B3 was designed for image size 300x300
model.summary()

In [None]:
# The full training set does not fit in RAM, so training proceeds in rounds: every round
# draws a fresh random subset of the training pool and runs a single epoch on it
# (up to 80000 training + 20000 validation images, i.e. 100000 samples per round).
shape = (300, 300, 3)
epochs = 1
batch_size = 256

# Hold out the validation pool ONCE, outside the loop, so that no image used for
# training in any round can appear in the validation set.  The fixed random_state
# yields the same hold-out every time this split is recomputed.
train_pool, verif_pool = tts(m, test_size=0.2, shuffle=True, random_state=0)
verif = verif_pool[:20000]

# Arguments shared by the training and the validation generator.
flow_kwargs = dict(directory='../input/herbarium-2020-fgvc7/nybg2020/train/',
                   x_col="file_name",
                   y_col=["family", "genus"],
                   target_size=shape[:2],
                   batch_size=batch_size,
                   class_mode="multi_output")

# The validation data is the same in every round, so its generator is built only once.
verif_flow = train_datagen.flow_from_dataframe(dataframe=verif, **flow_kwargs)

for i in range(10):
    # A different seed per round selects a different training subset.
    train = train_pool.sample(n=min(80000, len(train_pool)), random_state=i)

    model.fit_generator(train_datagen.flow_from_dataframe(dataframe=train, **flow_kwargs),
                        validation_data=verif_flow,
                        epochs=epochs,
                        steps_per_epoch=len(train)//batch_size,
                        validation_steps=len(verif)//batch_size,
                        verbose=1,
                        workers=2,
                        use_multiprocessing=False)

model.save_weights("weights_model_1.h5")

In [None]:
# Rebuild the model and restore the weights stored as weights_model_1.h5 in the
# "model-herbarium-gpu" input directory, then print the layer summary.
model = fg_model((300,300,3), 0.01) #Efficientnet B3 was designed for image size 300x300
model.load_weights("../input/model-herbarium-gpu/weights_model_1.h5")
model.summary()

In [None]:

# The full training set does not fit in RAM, so training proceeds in rounds: every round
# draws a fresh random subset of the training pool and runs a single epoch on it
# (up to 80000 training + 20000 validation images, i.e. 100000 samples per round).
shape = (300, 300, 3)
epochs = 1
batch_size = 256

# Hold out the validation pool ONCE, outside the loop, so that no image used for
# training in any round can appear in the validation set.  The fixed random_state
# yields the same hold-out every time this split is recomputed.
train_pool, verif_pool = tts(m, test_size=0.2, shuffle=True, random_state=0)
verif = verif_pool[:20000]

# Arguments shared by the training and the validation generator.
flow_kwargs = dict(directory='../input/herbarium-2020-fgvc7/nybg2020/train/',
                   x_col="file_name",
                   y_col=["family", "genus"],
                   target_size=shape[:2],
                   batch_size=batch_size,
                   class_mode="multi_output")

# The validation data is the same in every round, so its generator is built only once.
verif_flow = train_datagen.flow_from_dataframe(dataframe=verif, **flow_kwargs)

for i in range(10,20):
    # A different seed per round selects a different training subset.
    train = train_pool.sample(n=min(80000, len(train_pool)), random_state=i)

    model.fit_generator(train_datagen.flow_from_dataframe(dataframe=train, **flow_kwargs),
                        validation_data=verif_flow,
                        epochs=epochs,
                        steps_per_epoch=len(train)//batch_size,
                        validation_steps=len(verif)//batch_size,
                        verbose=1,
                        workers=2,
                        use_multiprocessing=False)
model.save_weights("weights_model_2.h5")

In [None]:
def fg_model(shape, lr, n_family=310, n_genus=3678, freeze_genus=False):
    """Build and compile the two-headed family/genus classifier.

    An ImageNet-pretrained EfficientNetB3 (without its top, global max pooling)
    extracts a feature vector.  A softmax layer named "family" is computed from
    these features; a softmax layer named "genus" is computed from the features
    concatenated with the family probabilities.

    Args:
        shape: Input image shape passed to ``Input`` and to the backbone,
            e.g. ``(300, 300, 3)``.
        lr: Learning rate of the Adam optimizer (AMSGrad variant).
        n_family: Number of units of the "family" output layer.
        n_genus: Number of units of the "genus" output layer.
        freeze_genus: If True, the "genus" layer is marked as non-trainable.
            Defaults to False, so both output layers are trained.

    Returns:
        A compiled Keras ``Model`` with outputs ``[family, genus]``, both trained
        with sparse categorical cross-entropy and reporting accuracy.
    """
    inputs = Input(shape)
    backbone = EfficientNetB3(weights='imagenet', include_top=False,
                              input_shape=shape, pooling='max')
    features = backbone(inputs)

    family_out = Dense(n_family, name="family", activation='softmax')(features)
    # The genus head sees the image features together with the family prediction.
    genus_in = concatenate([features, family_out])
    genus_out = Dense(n_genus, name="genus", activation='softmax')(genus_in)
    model = Model(inputs=inputs, outputs=[family_out, genus_out])

    # The pretrained backbone is never trained; it is frozen through its own
    # reference rather than through its position in model.layers.
    backbone.trainable = False
    if freeze_genus:
        model.get_layer('genus').trainable = False

    opt = Adam(lr=lr, amsgrad=True)
    model.compile(optimizer=opt,
                  loss=['sparse_categorical_crossentropy',
                        'sparse_categorical_crossentropy'],
                  metrics=['accuracy'])

    return model

# Rebuild the model with the fg_model defined directly above (which leaves the "genus"
# layer trainable) and restore the weights stored as weights_model_2.h5 in the
# "model-herbarium-gpu" input directory.
model = fg_model((300,300,3), 0.01) #Efficientnet B3 was designed for image size 300x300
model.load_weights("../input/model-herbarium-gpu/weights_model_2.h5")
# Disabled: explicit switch that would mark the "genus" layer as trainable.
#model.get_layer('genus').trainable = True#to train the category genus
model.summary()

In [None]:
# The full training set does not fit in RAM, so training proceeds in rounds: every round
# draws a fresh random subset of the training pool and runs a single epoch on it
# (up to 80000 training + 20000 validation images, i.e. 100000 samples per round).
shape = (300, 300, 3)
epochs = 1
batch_size = 256

# Hold out the validation pool ONCE, outside the loop, so that no image used for
# training in any round can appear in the validation set.  The fixed random_state
# yields the same hold-out every time this split is recomputed.
train_pool, verif_pool = tts(m, test_size=0.2, shuffle=True, random_state=0)
verif = verif_pool[:20000]

# Arguments shared by the training and the validation generator.
flow_kwargs = dict(directory='../input/herbarium-2020-fgvc7/nybg2020/train/',
                   x_col="file_name",
                   y_col=["family", "genus"],
                   target_size=shape[:2],
                   batch_size=batch_size,
                   class_mode="multi_output")

# The validation data is the same in every round, so its generator is built only once.
verif_flow = train_datagen.flow_from_dataframe(dataframe=verif, **flow_kwargs)

for i in range(20,33):
    # A different seed per round selects a different training subset.
    train = train_pool.sample(n=min(80000, len(train_pool)), random_state=i)

    model.fit_generator(train_datagen.flow_from_dataframe(dataframe=train, **flow_kwargs),
                        validation_data=verif_flow,
                        epochs=epochs,
                        steps_per_epoch=len(train)//batch_size,
                        validation_steps=len(verif)//batch_size,
                        verbose=1,
                        workers=2,
                        use_multiprocessing=False)

model.save_weights("weights_model_3.h5")