# overview
Inspect the input data and the expected output format.

In [None]:
import numpy as np 
import pandas as pd 
import os
def iter_non_image_files(root, image_suffix='.jpg'):
    """Yield the full path of every file under *root* that is not an image.

    Args:
        root: Directory that is walked recursively.
        image_suffix: File-name suffix identifying the images to skip.

    Yields:
        ``os.path.join(dirname, filename)`` for each non-image file.
    """
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith(image_suffix):
                # Skip only this image. A `break` here would also hide every
                # non-image file listed after it in the same directory.
                continue
            yield os.path.join(dirname, filename)


for path in iter_non_image_files('/kaggle/input'):
    print(path)

Since the dataset contains a large number of images, we only list the non-image files, which gives the three files shown above. Next, we will load the sample submission and inspect it.

In [None]:
# Read the sample submission so the expected output layout can be inspected.
SAMPLE_SUBMISSION_CSV = '../input/herbarium-2020-fgvc7/sample_submission.csv'
df_sample = pd.read_csv(SAMPLE_SUBMISSION_CSV)
display(df_sample)

Load json files

In [None]:
import json
def _read_metadata(path):
    """Parse the JSON file at *path*, ignoring bytes that are not valid UTF-8."""
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        return json.load(handle)


train_meta = _read_metadata("../input/herbarium-2020-fgvc7/nybg2020/train/metadata.json")
test_meta = _read_metadata("../input/herbarium-2020-fgvc7/nybg2020/test/metadata.json")

In [None]:
# Show the top-level keys of the parsed training metadata.
display(train_meta.keys())

Now, we will be unifying the metadata from the `*.json` files. We will first work with the `train` data.

First, we access the `annotations` list and convert it to a df.

In [None]:
# Build a DataFrame from the `annotations` list of the training metadata.
train_id = pd.DataFrame(train_meta['annotations'])
display(train_id)

Next is for `plant categories`:

In [None]:
# Build a DataFrame from the `categories` list of the training metadata.
train_cat = pd.DataFrame(train_meta['categories'])
# The labels are assigned by position, so this relies on the frame having
# exactly four columns in this order.
# NOTE(review): presumably the raw keys are family/genus/id/name - confirm.
train_cat.columns = ['family', 'genus', 'category_id', 'category_name']
display(train_cat)

Followed by the `image properties`:

In [None]:
# Build a DataFrame from the `images` list of the training metadata.
train_img = pd.DataFrame(train_meta['images'])
# The labels are assigned by position, so this relies on the frame having
# exactly five columns in this order.
# NOTE(review): presumably the third raw key is `id` - confirm.
train_img.columns = ['file_name', 'height', 'image_id', 'license', 'width']
display(train_img)

And lastly, the `region`:

In [None]:
# Build a DataFrame from the `regions` list of the training metadata.
train_reg = pd.DataFrame(train_meta['regions'])
# The labels are assigned by position, so this relies on the frame having
# exactly two columns in this order.
# NOTE(review): presumably the raw keys are id/name - confirm.
train_reg.columns = ['region_id', 'region_name']
display(train_reg)

Then, we will merge all the DataFrames and see what we got:

In [None]:
# Join annotations with their category, image and region records. Outer joins
# keep rows that have no partner in the other table.
train_df = (
    train_id
    .merge(train_cat, how='outer', on='category_id')
    .merge(train_img, how='outer', on='image_id')
    .merge(train_reg, how='outer', on='region_id')
)

In [None]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
train_df.info()
display(train_df)

Looking closer, there's a line with `NaN` values there. We need to remove rows with `NaN`s so we proceed to the next line:

In [None]:
# True for every row that has no image path.
bools_img_path = train_df['file_name'].isna()
# Filter with the mask directly rather than collecting row positions in a
# Python loop. `.copy()` gives an independent frame, so later column
# assignments do not act on a slice of the merged frame.
train_df = train_df.loc[~bools_img_path].copy()

After selecting the `non-NaN` rows, we now revisit the column data types. We need to save memory, as this DataFrame alone reached `102+ MB`.

In [None]:
# Target dtype for each column of train_df, listed in column order.
dtypes = ['int32', 'int32', 'int32', 'int32', 'object', 'object', 'object', 'object', 'int32', 'int32', 'int32', 'object']
# Pair every column with its dtype and convert in a single call instead of
# assigning column by column through a positional index.
train_df = train_df.astype(dict(zip(train_df.columns, dtypes)))
# info() prints its report itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line.
train_df.info()
display(train_df)

Finally, we handle our `test` dataset, which only contains one key, `images`:

In [None]:
# Show the top-level keys of the parsed test metadata.
test_meta.keys()

In [None]:
# Build a DataFrame from the `images` list of the test metadata.
test_df = pd.DataFrame(test_meta['images'])
# The labels are assigned by position, so this relies on the frame having
# exactly five columns in this order.
test_df.columns = ['file_name', 'height', 'image_id', 'license', 'width']
# info() prints its report itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line.
test_df.info()
display(test_df)

Perfect!

Now, we can go ahead and save this dataframe as a `*.csv` file for future use!

In [None]:
# Write both unified tables to CSV, without the index column.
for frame, csv_name in ((train_df, 'full_train_data.csv'),
                        (test_df, 'full_test_data.csv')):
    frame.to_csv(csv_name, index=False)

# Data Exploration

We will now start the data exploration and see what we can do with this dataset.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Number of samples per category name; value_counts() sorts descending.
_train = train_df['category_name'].value_counts()
print(len(_train),'種類')
# Exactly ten entries each. The previous 0:11 slice plotted eleven bars and
# -10:-1 plotted nine, leaving out the rarest class.
top10 = _train.iloc[:10]
worst10 = _train.iloc[-10:]
fig = plt.figure(figsize=(14,3))
ax1 = fig.add_subplot(1, 2, 1)
ax1.bar(top10.index, top10.values)
plt.xticks(rotation=90)
plt.title('top10')
ax2 = fig.add_subplot(1, 2, 2)
ax2.bar(worst10.index, worst10.values)
plt.xticks(rotation=90)
plt.title('worst10')
fig.savefig("category_top10_worst_10.png",bbox_inches="tight")
plt.show()

In [None]:
# Number of samples per family; value_counts() sorts descending.
_train = train_df['family'].value_counts()
print(len(_train),'family')
# Exactly ten entries each. The previous 0:11 slice plotted eleven bars and
# -10:-1 plotted nine, leaving out the rarest family.
top10 = _train.iloc[:10]
worst10 = _train.iloc[-10:]
fig = plt.figure(figsize=(14,3))
ax1 = fig.add_subplot(1, 2, 1)
ax1.bar(top10.index, top10.values)
plt.xticks(rotation=90)
plt.title('top10')
ax2 = fig.add_subplot(1, 2, 2)
ax2.bar(worst10.index, worst10.values)
plt.xticks(rotation=90)
plt.title('worst10')
fig.savefig("family_top10_worst_10.png",bbox_inches="tight")
plt.show()

In [None]:
# Number of samples per genus; value_counts() sorts descending.
_train = train_df['genus'].value_counts()
print(len(_train),'genus')
# Exactly ten entries each. The previous 0:11 slice plotted eleven bars and
# -10:-1 plotted nine, leaving out the rarest genus.
top10 = _train.iloc[:10]
worst10 = _train.iloc[-10:]
fig = plt.figure(figsize=(14,3))
ax1 = fig.add_subplot(1, 2, 1)
ax1.bar(top10.index, top10.values)
plt.xticks(rotation=90)
plt.title('top10')
ax2 = fig.add_subplot(1, 2, 2)
ax2.bar(worst10.index, worst10.values)
plt.xticks(rotation=90)
plt.title('worst10')
fig.savefig("genus_top10_worst_10.png",bbox_inches="tight")
plt.show()

In [None]:
# Print the row count of train_df, then the number of distinct values found
# in each of its columns, using one shared row layout.
print("Total Unique Values for each columns:")
row_layout = "{0:10s} \t {1:10d}"
print(row_layout.format('train_df', len(train_df)))
for column_name in train_df.columns:
    distinct_values = train_df[column_name].unique()
    print(row_layout.format(column_name, len(distinct_values)))

Here, we can see that other than the `category_id`, there's also the `family`, `genus`, `category_name`, `region_id` and `region_name` for the other probable targets. `category_id` and `category_name` are one and the same, similar to `region_id` and `region_name`.

A possible approach for this kernel is to use a `CNN` to predict `family` and `genus` (we will ignore `region` for now). Then, using the `family` and `genus`, we will predict the `category_id` for the image.

In [None]:
# Count the rows in every (family, genus) group, then summarise how those
# group sizes are distributed.
family = train_df.groupby(['family', 'genus'])[['category_name']].count()
display(family.describe())

With some proper `image_data_augmentation` we can make up for the small number of samples in some `(family, genus)` groups (see the first quartile).

# Model Creation
一度にcategoryを当てに行くこともできるが、事前情報として与えられているfamilyやgenusの情報を生かして、モデルを構築するアプローチをとる。(そのまま予測する手法はほかメンバーが実施。)

familyやgenusをcategoryの予測に生かす時、familyやgenusを予測する分類器を作って、特定の層の重みを学習させないことができる。TensorFlowであれば、レイヤーにtrainable属性が存在し、パラメータを学習させたくない層やモデルについて、trainable属性をFalseとすることで、学習をかけないでおくことができる。(ただし学習前の最後にcompile()を実行しないと属性の変更が反映されないので注意)

参考のURLでは
> summary()の出力結果には反映されているが、実際に設定を有効にするにはcompile()する必要があるので注意。compile()のあとでtrainableを変更した場合、再度compile()しなければならない。

としているため、このnotebookではtrainable属性の変更をしていないまま学習をかけていることになっているのではないか？


参考
https://note.nkmk.me/python-tensorflow-keras-trainable-freeze-unfreeze/


## CNNの構造の決め方について
調べてもこういう時はこうするみたいな各事例ごとの具体例が出てくるのみ。また最近は「これが流行り」のようなモデル構造の隆盛まであるっぽい。結局どうすりゃいいかわからないから、Efficientnetに任せてしまうことにした。(AutoMLの考え方から)


参考
https://qiita.com/icoxfog417/items/5fd55fad152231d706c2

# Data Generator
data generatorを作る。
前処理等しておくにはデータの容量が大きすぎる。バッチごとに処理をするので、dataGeneratorに任せることとする。

処理時間参考
https://hironsan.hatenablog.com/entry/2017/09/09/130608

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Settings for the training image generator: random rotation, width/height
# shift and zoom, with feature-wise centering and normalisation switched off.
augmentation_options = dict(
    featurewise_center=False,
    featurewise_std_normalization=False,
    rotation_range=180,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
)
train_datagen = ImageDataGenerator(**augmentation_options)

Now, we will transform the `family` and `genus` to ids.

In [None]:
# Work on an independent copy so the label columns can be overwritten without
# assigning into a slice of train_df.
m = train_df[['file_name', 'family', 'genus', 'category_id']].copy()

# Distinct names in order of first appearance; a name's position is its id.
fam = m['family'].unique().tolist()
# A dict gives constant-time lookups; list.index() rescanned the whole list
# for every row.
m['family'] = m['family'].map({name: idx for idx, name in enumerate(fam)})

gen = m['genus'].unique().tolist()
m['genus'] = m['genus'].map({name: idx for idx, name in enumerate(gen)})
display(m)

# Train

In [None]:
# Make ../input the current working directory.
# NOTE(review): presumably so the `efficientnet` package stored there can be
# imported by the next cell - confirm.
os.chdir('../input')

In [None]:
from efficientnet.efficientnet.keras import EfficientNetB3 
#from efficientnet.efficientnet.model import EfficientNetB3
from keras.models import Model
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten, BatchNormalization, Input, concatenate
from keras.optimizers import Adam
from keras.utils import plot_model
from sklearn.model_selection import train_test_split as tts

In [None]:
def fg_model(shape, lr, n_family=310, n_genus=3678, n_category=32093):
    """Build and compile the three-headed family/genus/category classifier.

    An ImageNet-initialised EfficientNetB3 backbone (no top, max pooling)
    feeds three chained softmax heads: ``genus`` also receives the ``family``
    output and ``category_id`` also receives the ``genus`` output.

    Args:
        shape: Input image shape given to ``Input`` and to the backbone,
            e.g. ``(120, 120, 3)``.
        lr: Learning rate of the Adam (AMSGrad) optimizer.
        n_family: Number of units of the ``family`` head.
        n_genus: Number of units of the ``genus`` head.
        n_category: Number of units of the ``category_id`` head.

    Returns:
        The compiled model with outputs ``[family, genus, category_id]``,
        each trained with sparse categorical cross-entropy.
    """
    # With sparse categorical cross-entropy every integer label must be
    # smaller than the width of its head.
    # NOTE(review): the commented-out earlier model in this notebook used
    # 32094 units for category_id - confirm 32093 covers the largest id.
    i = Input(shape)
    x = EfficientNetB3(weights='imagenet', include_top=False,
                       input_shape=shape, pooling='max')(i)

    o1 = Dense(n_family, name="family", activation='softmax')(x)
    o2 = Dense(n_genus, name="genus", activation='softmax')(concatenate([x, o1]))
    o3 = Dense(n_category, name="category_id", activation='softmax')(concatenate([x, o2]))
    model = Model(inputs=i, outputs=[o1, o2, o3])

    opt = Adam(lr=lr, amsgrad=True)
    model.compile(optimizer=opt,
                  loss=['sparse_categorical_crossentropy'] * 3,
                  metrics=['accuracy'])
    return model

In [None]:
# Build the model for 300x300 RGB input with learning rate 0.01 and print its
# layer summary.
model = fg_model((300,300,3), 0.01) #Efficientnet B3 was designed for image size 300x300
model.summary()

# Train

Now, we will begin the training.

In [None]:

# Settings shared by all three training stages.
shape = (120, 120, 3)
epochs = 1
batch_size = 32


def _make_flow(frame):
    """Return a batch iterator over *frame* with images and the three labels."""
    return train_datagen.flow_from_dataframe(
        dataframe=frame,
        directory='../input/herbarium-2020-fgvc7/nybg2020/train/',
        x_col="file_name",
        y_col=["family", "genus", "category_id"],
        target_size=shape[:2],
        batch_size=batch_size,
        class_mode='multi_output')


def _run_stage(model, optimizer, seed, trainable):
    """Re-split the data, set the head trainability, recompile and fit.

    Args:
        model: The model returned by ``fg_model``.
        optimizer: Optimizer instance passed to ``compile``.
        seed: ``random_state`` for the train/validation split.
        trainable: Mapping of head layer name to its ``trainable`` flag.

    Returns:
        The ``(train, verif)`` frames used for this stage.
    """
    train, verif = tts(m, test_size=0.2, shuffle=True, random_state=seed)
    # Only a subset of each split is used per stage.
    train = train[:40000]
    verif = verif[:10000]

    for layer_name, flag in trainable.items():
        model.get_layer(layer_name).trainable = flag
    # A change of `trainable` only takes effect after compiling again.
    model.compile(optimizer=optimizer,
                  loss=['sparse_categorical_crossentropy'] * 3,
                  metrics=['accuracy'])

    # NOTE(review): validation batches also come from train_datagen, so they
    # are augmented like the training batches - confirm this is intended.
    model.fit_generator(_make_flow(train),
                        validation_data=_make_flow(verif),
                        epochs=epochs,
                        steps_per_epoch=len(train)//batch_size,
                        validation_steps=len(verif)//batch_size,
                        verbose=1,
                        workers=8,
                        use_multiprocessing=True)
    return train, verif


model = fg_model(shape, 0.007)
opt = Adam(lr=0.007, amsgrad=True)

# Stage 1: all three heads trainable, `epochs` epoch(s).
train, verif = _run_stage(
    model, opt, seed=17,
    trainable={'genus': True, 'family': True, 'category_id': True})

# Stage 2: new split, family head frozen, genus and category_id trainable.
train, verif = _run_stage(
    model, opt, seed=18,
    trainable={'genus': True, 'family': False, 'category_id': True})

# Stage 3: new split, all three heads trainable again.
train, verif = _run_stage(
    model, opt, seed=19,
    trainable={'genus': True, 'family': True, 'category_id': True})

In [None]:
# Save the trained model to the ../working directory.
model.save('../working/fg_model.h5')

In [None]:
# Show the metrics recorded on the model's history object.
# NOTE(review): presumably this only covers the last of the three fit calls
# - confirm.
model.history.history

# Predict

Now, we will do our prediction. We may as well skip doing a confusion-matrix for our model because it's not even fully trained, so we go straight to our submission.

For a similar reason, and because of RAM limitations, we run the `predictions` in chunks of `10,000` items rather than on the whole test set at once.

In [None]:
# Drop the objects that were only needed for exploration and training before
# running the predictions.
del train, verif, m, train_df, fam, gen, _train
batch_size = 32

# generator = test_datagen.flow_from_dataframe(
#         dataframe = test_df,#.iloc[:10000], 
#         directory = '../input/herbarium-2020-fgvc7/nybg2020/test/',
#         x_col = 'file_name',
#         target_size=(120, 120),
#         batch_size=batch_size,
#         class_mode=None,  # only data, no labels
#         shuffle=False)

# family, genus, category = model.predict_generator(generator, verbose=1)

In [None]:

CHUNK_SIZE = 10000  # rows of test_df handed to one generator at a time

# No augmentation options are set for prediction.
test_datagen = ImageDataGenerator(featurewise_center=False,
                                  featurewise_std_normalization=False)

categories = []
# Step through test_df in consecutive slices of CHUNK_SIZE rows. The last
# slice is simply shorter, so every row is covered exactly once for any
# len(test_df). The previous loop failed when there were at most CHUNK_SIZE
# rows and skipped the final row when len(test_df) - 1 was a multiple of it.
for start in range(0, len(test_df), CHUNK_SIZE):
    generator = test_datagen.flow_from_dataframe(
        dataframe=test_df.iloc[start:start + CHUNK_SIZE],
        directory='../input/herbarium-2020-fgvc7/nybg2020/test/',
        x_col='file_name',
        target_size=(120, 120),
        batch_size=batch_size,
        class_mode=None,  # only data, no labels
        shuffle=False)    # keep the predictions in test_df row order

    family, genus, category = model.predict_generator(generator, verbose=1, max_queue_size=10)
    # Only the predicted category index is kept for the submission.
    categories.append(np.argmax(category, axis=1))
    del generator, family, genus, category
#categories = np.concatenate(categories,axis=0)

# Submission

Next, we'll save the predicted values collected in `categories` in the format required for submissions. Remember that the model returns a `list` of 3 outputs, namely `family`, `genus`, `category_id` in that order, and that only the `category_id` predictions are kept.

In [None]:
# Peek at the predictions of the first two chunks joined into one array.
np.concatenate(categories[0:2])

In [None]:
# Assemble the submission table: the image ids from test_df next to the
# predicted category of each row, both stored as int32, then write it out.
sub = pd.DataFrame({'Id': test_df.image_id.astype('int32')})
sub['Predicted'] = np.concatenate(categories).astype('int32')
display(sub)
sub.to_csv('../working/submission.csv', index=False)

# Finish

There you have it! A working model for predicting the `Category` of the plants. I hope that this kernel helped you on your journey in unraveling the mysteries of this dataset! Please upvote before forking___________3-(^_^ )

In [None]:
# end_time = time.time()
# total = end_time - start_time
# h = total//3600
# m = (total%3600)//60
# s = total%60
# print("Total time spent: %i hours, %i minutes, and %i seconds" %(h, m, s))

In [None]:
# in_out_size = (120*120) + 3 #We will resize the image to 120*120 and we have 3 outputs
# def xavier(shape, dtype=None):
#     return np.random.rand(*shape)*np.sqrt(1/in_out_size)

# def fg_model(shape, lr=0.001):
#     '''Family-Genus model receives an image and outputs two integers indicating both the family and genus index.'''
#     i = Input(shape)
    
#     x = Conv2D(3, (3, 3), activation='relu', padding='same', kernel_initializer=xavier)(i)
#     x = Conv2D(3, (5, 5), activation='relu', padding='same', kernel_initializer=xavier)(x)
#     x = MaxPool2D(pool_size=(3, 3), strides=(3,3))(x)
#     x = BatchNormalization()(x)
#     x = Dropout(0.5)(x)
#     x = Conv2D(16, (5, 5), activation='relu', padding='same', kernel_initializer=xavier)(x)
#     #x = Conv2D(16, (5, 5), activation='relu', padding='same', kernel_initializer=xavier)(x)
#     x = MaxPool2D(pool_size=(5, 5), strides=(5,5))(x)
#     x = BatchNormalization()(x)
#     x = Dropout(0.5)(x)
#     x = Flatten()(x)
    
#     o1 = Dense(310, activation='softmax', name='family', kernel_initializer=xavier)(x)
    
#     o2 = concatenate([o1, x])
#     o2 = Dense(3678, activation='softmax', name='genus', kernel_initializer=xavier)(o2)
    
#     o3 = concatenate([o1, o2, x])
#     o3 = Dense(32094, activation='softmax', name='category_id', kernel_initializer=xavier)(o3)
    
#     x = Model(inputs=i, outputs=[o1, o2, o3])
    
#     opt = Adam(lr=lr, amsgrad=True)
#     x.compile(optimizer=opt, loss=['sparse_categorical_crossentropy', 
#                                    'sparse_categorical_crossentropy', 
#                                    'sparse_categorical_crossentropy'],
#                  metrics=['accuracy'])
#     return x

# model = fg_model((120, 120, 3))
# model.summary()
# plot_model(model, to_file='full_model_plot.png', show_shapes=True, show_layer_names=True)