# Herbarium2022

The Herbarium 2022: Flora of North America is a part of a project of the New York Botanical Garden funded by the National Science Foundation to build tools to identify novel plant species around the world. The dataset strives to represent all known vascular plant taxa in North America, using images gathered from 60 different botanical institutions around the world.

<img align=center src='https://www.floridamuseum.ufl.edu/wp-content/uploads/sites/23/2016/12/herbarium-specimen-sheets-montage-header-600x450.jpg'>

## **Today I'm trying the TensorFlow framework**

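I was impressed by the notebook linked below — full credit to its author.
It was written for the Herbarium 2020 competition: [https://www.kaggle.com/seraphwedd18/herbarium-consolidating-the-details#Submission](https://www.kaggle.com/seraphwedd18/herbarium-consolidating-the-details#Submission)
I love it!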

### **Enjoy! Tensorflow**

## **Path**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input tree and print each file path. Within a directory the
# listing stops at the first '.jpg' (presumably to avoid printing every image).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if not filename.endswith('.jpg'):
            print(os.path.join(dirname, filename))
            continue
        break

In [None]:
# Load the sample submission file and show it.
sample_sub = pd.read_csv('../input/herbarium-2022-fgvc9/sample_submission.csv')
display(sample_sub)

# **Load Data ---> json**

In [None]:
import json, codecs

# Parse the train and test metadata files. The built-in open() replaces the
# legacy codecs.open() wrapper; the encoding and error policy are unchanged,
# so undecodable bytes are still dropped instead of raising.
with open("../input/herbarium-2022-fgvc9/train_metadata.json", 'r',
          encoding='utf-8', errors='ignore') as f:
    train_meta = json.load(f)

with open("../input/herbarium-2022-fgvc9/test_metadata.json", 'r',
          encoding='utf-8', errors='ignore') as f:
    test_meta = json.load(f)

In [None]:
# Show the top-level keys of the parsed training metadata.
display(train_meta.keys())

# **Analyze the data**
## **Train_meta ---> Dataframe**

In [None]:
# Tabulate the 'annotations' entries of the training metadata.
train_df = pd.DataFrame(train_meta['annotations'])
display(train_df)

In [None]:
# Tabulate the 'categories' entries; the original column labels are kept
# (the positional rename below is disabled).
train_cat = pd.DataFrame(train_meta['categories'])
#train_cat.columns = [ 'category_id', 'scientificName','family', 'genus']
display(train_cat)

In [None]:
# Tabulate the 'images' entries.
train_img = pd.DataFrame(train_meta['images'])
# Columns are relabelled by position — assumes the source has exactly these
# three fields in this order; TODO confirm against the JSON keys.
train_img.columns = ['image_id','file_name', 'license']
display(train_img)


In [None]:
# Show the top-level keys of the training metadata.
train_meta.keys()

In [None]:
# Tabulate the 'genera' entries.
train_gen = pd.DataFrame(train_meta['genera'])
# Columns are relabelled by position — assumes exactly two fields in this
# order; TODO confirm against the JSON keys.
train_gen.columns = ['genus_id', 'genus']
display(train_gen)

## **Merge_Important_Data**

In [None]:
# Attach category, image and genus details to the annotations. Outer joins
# keep keys that occur on only one side of each merge.
train_df = (
    train_df
    .merge(train_cat, on='category_id', how='outer')
    .merge(train_img, on='image_id', how='outer')
    .merge(train_gen, on='genus_id', how='outer')
)

In [None]:
# List the column labels of the merged frame.
train_df.columns


In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line.
train_df.info()
display(train_df)

## **Identify_to_remove_NullData**

In [None]:
# Keep only the rows that have an image file name. A boolean mask does this in
# one vectorized step and, unlike looking up positions 0..n-1 as index labels,
# does not depend on the frame having a default RangeIndex.
na = train_df.file_name.isna()
train_df = train_df[~na]

In [None]:
# Summarise the training frame again.
train_df.info()

## **Test_meta ---> DataFrame**

In [None]:
# Tabulate the test metadata.
test_df = pd.DataFrame(test_meta)
# Columns are relabelled by position.
# NOTE(review): later cells read image paths from 'image_id' and cast
# 'file_name' to int64 for the submission Id, which suggests these two labels
# end up swapped relative to their contents — confirm against the JSON keys.
test_df.columns = ['file_name', 'image_id', 'license']
# info() prints its own report and returns None; wrapping it in print() only
# added a stray "None" line.
test_df.info()
display(test_df)

# **Generate_CSV----> Important_Data**

In [None]:
# Write both frames to CSV in the working directory, without the index column.
train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

## **EDA ---> NewData**

In [None]:
# Data exploration: total row count, then the number of distinct values in
# each column.
print("Total Unique Values for each columns:")
print(f"{'train_df':10s} \t {len(train_df):10d}")
for col in train_df.columns:
    print(f"{col:10s} \t {len(train_df[col].unique()):10d}")


In [None]:
# Count scientificName entries per (family, genus_id) pair, then show summary
# statistics of those counts.
family = (
    train_df[['family', 'genus_id', 'scientificName']]
    .groupby(['family', 'genus_id'])
    .count()
)
display(family.describe())

# **Build the Model_TensorflowFramework**

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten, BatchNormalization, Input, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split as tts

# Size term used to scale the initial weights: one 120x120 resized image plane
# plus the 3 model outputs.
in_out_size = (120*120) + 3


def xavier(shape, dtype=None):
    """Kernel initializer: uniform samples scaled by sqrt(1 / in_out_size).

    Args:
        shape: Dimensions of the weight array to create.
        dtype: Accepted so the function matches the initializer call
            signature; it is not used.

    Returns:
        A NumPy array of the requested shape with values drawn from
        ``np.random.rand`` and multiplied by ``sqrt(1 / in_out_size)``.
    """
    scale = np.sqrt(1 / in_out_size)
    return scale * np.random.rand(*shape)

def fg_model(shape, lr=0.001, n_family=310, n_genus=3678, n_category=32094):
    '''Build and compile the hierarchical family / genus / category classifier.

    A small convolutional trunk feeds three softmax heads. The genus head also
    sees the family probabilities, and the category head sees both the family
    and genus probabilities, in addition to the flattened trunk features.

    Args:
        shape: Input image shape passed to ``Input``, e.g. ``(120, 120, 3)``.
        lr: Learning rate for the Adam optimizer.
        n_family: Number of units in the ``family`` head.
        n_genus: Number of units in the ``genus_id`` head.
        n_category: Number of units in the ``category_id`` head.
            The three defaults are the sizes that were previously hard-coded.
            NOTE(review): confirm they cover the label ranges of the dataset
            actually being trained on.

    Returns:
        A compiled Keras ``Model`` with outputs ``[family, genus_id,
        category_id]``, each trained with sparse categorical cross-entropy.
    '''
    i = Input(shape)

    # Convolutional trunk: two 3-filter convolutions, then one 16-filter
    # convolution, each stage followed by pooling, batch norm and dropout.
    x = Conv2D(3, (3, 3), activation='relu', padding='same', kernel_initializer=xavier)(i)
    x = Conv2D(3, (5, 5), activation='relu', padding='same', kernel_initializer=xavier)(x)
    x = MaxPool2D(pool_size=(3, 3), strides=(3, 3))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Conv2D(16, (5, 5), activation='relu', padding='same', kernel_initializer=xavier)(x)
    x = MaxPool2D(pool_size=(5, 5), strides=(5, 5))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Flatten()(x)

    # Layer names must stay 'family' / 'genus_id' / 'category_id': the training
    # code selects layers by these names.
    o1 = Dense(n_family, activation='softmax', name='family', kernel_initializer=xavier)(x)

    o2 = concatenate([o1, x])
    o2 = Dense(n_genus, activation='softmax', name='genus_id', kernel_initializer=xavier)(o2)

    o3 = concatenate([o1, o2, x])
    o3 = Dense(n_category, activation='softmax', name='category_id', kernel_initializer=xavier)(o3)

    model = Model(inputs=i, outputs=[o1, o2, o3])

    # 'learning_rate' is the supported argument name; 'lr' is a deprecated alias.
    opt = Adam(learning_rate=lr, amsgrad=True)
    model.compile(optimizer=opt,
                  loss=['sparse_categorical_crossentropy'] * 3,
                  metrics=['accuracy'])
    return model

# Build the network for 120x120x3 input, print its layer summary and write an
# architecture diagram to 'full_model_plot.png'.
model = fg_model((120, 120, 3))
model.summary()
plot_model(
    model,
    show_layer_names=True,
    show_shapes=True,
    to_file='full_model_plot.png',
)

## **Augmentation of Data_Image**

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training-time augmentation: random rotation, horizontal/vertical shifts and
# zoom. Feature-wise centering and std-normalisation are explicitly left off.
train_datagen = ImageDataGenerator(
    rotation_range=180,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
    featurewise_center=False,
    featurewise_std_normalization=False,
)


In [None]:
# Work on an explicit copy so the label columns can be rewritten without
# pandas chained-assignment warnings.
m = train_df[['file_name', 'family', 'genus_id', 'category_id']].copy()

# Replace family and genus_id with integer codes numbered in order of first
# appearance. pd.factorize does this in one pass; the previous
# `list.index` lookup per row rescanned the list of unique values every time.
# `fam` / `gen` keep the code -> original value lookup tables.
# NOTE: missing values, if any remain, are coded -1 by factorize and are not
# listed in `fam` / `gen`.
family_codes, family_values = pd.factorize(m['family'])
fam = family_values.tolist()
m['family'] = family_codes

genus_codes, genus_values = pd.factorize(m['genus_id'])
gen = genus_values.tolist()
m['genus_id'] = genus_codes

display(m)

## **Train_data_TensorflowModel_epoch_2**

In [None]:
# Hold out 20% of the rows for validation, then cap both subsets to keep the
# run short. The split is made once: the earlier per-stage "reshuffles" called
# train_test_split with the same random_state and therefore reproduced this
# exact split every time.
train, verif = tts(m, test_size=0.2, shuffle=True, random_state=17)
train = train[:40000]
verif = verif[:10000]
shape = (120, 120, 3)
epochs = 2
batch_size = 32
lr = 0.007

model = fg_model(shape, lr)

# Validation images are fed without random augmentation so that validation
# metrics are measured on unmodified images.
verif_datagen = ImageDataGenerator()


def _flow(datagen, frame):
    '''Return a multi-output batch iterator over the training images in `frame`.

    Args:
        datagen: The ImageDataGenerator that loads (and possibly augments)
            the images.
        frame: DataFrame with 'file_name', 'family', 'genus_id' and
            'category_id' columns.
    '''
    return datagen.flow_from_dataframe(
        dataframe=frame,
        directory='../input/herbarium-2022-fgvc9/train_images',
        x_col="file_name",
        y_col=["family", "genus_id", "category_id"],
        target_size=shape[:2],
        batch_size=batch_size,
        class_mode='multi_output')


def _set_frozen_heads(net, frozen):
    '''Freeze the output heads named in `frozen`, unfreeze the other heads.

    Only the 'genus_id' and 'category_id' layers are touched. The model is
    recompiled afterwards because Keras applies `trainable` changes only
    when compile() is called again; without it the change is not reliably
    picked up. The compile settings mirror the ones used when the model is
    built (Adam with amsgrad, sparse categorical cross-entropy per output).
    '''
    for layer in net.layers:
        if layer.name in ('genus_id', 'category_id'):
            layer.trainable = layer.name not in frozen
    net.compile(optimizer=Adam(learning_rate=lr, amsgrad=True),
                loss=['sparse_categorical_crossentropy'] * 3,
                metrics=['accuracy'])


def _fit_stage(net):
    '''Train `net` for `epochs` epochs on the capped train/validation split.'''
    # Model.fit accepts generators directly; fit_generator is deprecated.
    return net.fit(
        _flow(train_datagen, train),
        validation_data=_flow(verif_datagen, verif),
        epochs=epochs,
        steps_per_epoch=len(train) // batch_size,
        validation_steps=len(verif) // batch_size,
        verbose=1,
        workers=8,
        use_multiprocessing=False)


# Stage 1: genus and category heads frozen (trains the trunk and family head).
# Stage 2: genus head unfrozen.
# Stage 3: all heads trainable.
for frozen_heads in (('genus_id', 'category_id'), ('category_id',), ()):
    _set_frozen_heads(model, frozen_heads)
    _fit_stage(model)

# An earlier, disabled experiment (kept here only as a note) trained batch by
# batch with train_on_batch and toggled the heads' `trainable` flags every 10
# batches from the running accuracies: family frozen above 0.90 accuracy,
# genus enabled once family accuracy exceeded 0.75, and category enabled once
# family accuracy exceeded 0.75 and genus accuracy exceeded 0.5.

In [None]:
# Save the trained model to 'fg_model.h5' in the working directory.
model.save('fg_model.h5')

## **Test_data**

In [None]:
# Summarise the test frame.
test_df.info()

In [None]:
# Preview the first 10,000 rows of the test frame.
test_df.iloc[:10000]

In [None]:
batch_size = 32

# Limiting the test to the first 10,000 items
test = test_df.iloc[:10000]

# No augmentation at prediction time.
test_datagen = ImageDataGenerator(
    featurewise_std_normalization=False,
    featurewise_center=False,
)

# Unshuffled, label-free iterator so predictions stay in row order.
# NOTE(review): image paths are read from the 'image_id' column here — confirm
# that this column really holds the file paths.
generator = test_datagen.flow_from_dataframe(
    dataframe=test,
    directory='../input/herbarium-2022-fgvc9/test_images',
    x_col='image_id',
    class_mode=None,  # only data, no labels
    shuffle=False,
    batch_size=batch_size,
    target_size=(120, 120),
)

# One probability array per model output, in the model's output order.
family, genus_id, category_id = model.predict(generator, verbose=1)

# **Submission_f**

In [None]:
# Build the category submission: one row per test item, with the arg-max
# category for the predicted rows and the constant 23718 for the rest.
sub = pd.DataFrame()
sub['Id'] = test_df.file_name.astype('int64')
sub['Predicted'] = np.concatenate(
    [
        np.argmax(category_id, axis=1),
        # Filler for test rows that were not predicted.
        np.full(len(test_df.image_id) - len(category_id), 23718.0),
    ],
    axis=0,
).astype('int32')
display(sub)
sub.to_csv('category_submission.csv', index=False)

In [None]:
# Family submission: arg-max family index for the predicted rows, 0 for the
# test rows that were not predicted.
sub['Predicted'] = np.concatenate(
    [
        np.argmax(family, axis=1),
        np.zeros(len(test_df.image_id) - len(family)),
    ],
    axis=0,
).astype('int32')
display(sub)
sub.to_csv('family_submission.csv', index=False)

In [None]:
# Genus submission: arg-max genus index for the predicted rows, 0 for the
# test rows that were not predicted.
sub['Predicted'] = np.concatenate(
    [
        np.argmax(genus_id, axis=1),
        np.zeros(len(test_df.image_id) - len(genus_id)),
    ],
    axis=0,
).astype('int32')
display(sub)
sub.to_csv('genus_submission.csv', index=False)

## ***Thank you for visiting! Feel free to use this notebook as a starter.***

Reference:
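1. [https://www.kaggle.com/seraphwedd18/herbarium-consolidating](https://www.kaggle.com/seraphwedd18/herbarium-consolidating) ---> **Full_credit**
2. [https://www.kaggle.com/venkatkumar001/herbarium-22-fgvc9-baseline](https://www.kaggle.com/venkatkumar001/herbarium-22-fgvc9-baseline)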