#  Loading libraries

In [None]:
import json
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from tabulate import tabulate

# Loading and cleaning data

In [None]:
def _load_metadata(path):
    """Parse the JSON file at ``path`` and return the resulting object.

    Parameters
    ----------
    path : str
        Location of a metadata JSON file.

    Returns
    -------
    The object produced by ``json.load`` for that file.
    """
    # Pass the encoding explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)


# Loading the JSON metadata for the train data.
data_path_train = "../input/herbarium-2021-fgvc8/train/metadata.json"
meta_train = _load_metadata(data_path_train)

# Loading the JSON metadata for the test data.
data_path_test = "../input/herbarium-2021-fgvc8/test/metadata.json"
meta_test = _load_metadata(data_path_test)

In [None]:
# List the top-level keys of the train metadata dictionary; each key is
# turned into its own dataframe in the next cell.
meta_train.keys()

In [None]:
# Create one separate dataframe per section of the train metadata.
(
    annotations_train,
    categories_train,
    images_train,
    info_train,
    licenses_train,
    institutions_train,
) = (
    pd.json_normalize(meta_train[section])
    for section in (
        'annotations',
        'categories',
        'images',
        'info',
        'licenses',
        'institutions',
    )
)

Now let's check what each dataframe looks like.

In [None]:
# Preview the first 3 rows of the train annotations dataframe.
annotations_train.head(3)

In [None]:
# Preview the first 3 rows of the train categories dataframe.
categories_train.head(3)

In [None]:
# Preview the first 3 rows of the train images dataframe.
images_train.head(3)

In [None]:
# Preview the first row of the train info dataframe.
info_train.head(1)

In [None]:
# Preview the first row of the train licenses dataframe.
licenses_train.head(1)

In [None]:
# Preview the first row of the train institutions dataframe.
institutions_train.head(1)

As we saw, there is not much useful information in info, licenses and institutions.

In [None]:
# Drop the train dataframes that are not used again, releasing their names.
del info_train, licenses_train, institutions_train

In [None]:
# List the top-level keys of the test metadata dictionary.
meta_test.keys()

In [None]:
# Create one separate dataframe per section of the test metadata.
images_test, info_test, licenses_test = (
    pd.json_normalize(meta_test[section])
    for section in ('images', 'info', 'licenses')
)

In [None]:
# Check what each test dataframe looks like, starting with the first
# 3 rows of the images dataframe.
images_test.head(3)

In [None]:
# Preview the test info dataframe (head() shows at most the first 5 rows).
info_test.head()

In [None]:
# Preview the first row of the test licenses dataframe.
licenses_test.head(1)

In [None]:
# Drop the test dataframes that are not used again.
del info_test, licenses_test

Now we manipulate the dataframes to build datasets in the required format.

In [None]:
# Build the test dataset in one chain: discard the image-dimension and
# license columns, rename id to image_id, then fix the column order.
df_test = (
    images_test
    .drop(columns=['height', 'license', 'width'])
    .rename(columns={"id": "image_id"})
    .loc[:, ['image_id', 'file_name']]
)
df_test.head()

Before creating the train dataset, let's run some checks.

In [None]:
# Check whether id and image_id in annotations_train, and id in images_train,
# hold the same values in the same row order.

# The comparison against images_train is row-by-row, so both tables must have
# the same number of rows for it to be meaningful.
assert len(annotations_train) == len(images_train), (
    "annotations_train and images_train have different row counts"
)

annotation_ids = annotations_train['id'].to_numpy()
same = pd.DataFrame({
    # annotation id vs. the image_id stored on the same annotation row
    'an_id_vs_an_im_id': annotation_ids == annotations_train['image_id'].to_numpy(),
    # annotation id vs. the id of the image at the same row position
    'im_id_vs_an_id': annotation_ids == images_train['id'].to_numpy(),
})

# True means the two ids on that row are equal.
print(same.head())

# One unique value per column means every row gave the same answer.
print('number of unique items in first column', same['an_id_vs_an_im_id'].nunique())
print('number of unique items in second column', same['im_id_vs_an_id'].nunique())

# A single unique value only proves the ids match if that value is True,
# so verify it explicitly instead of relying on the printed head.
assert same.to_numpy().all(), (
    "id / image_id values differ between annotations_train and images_train"
)

del same

After checking, we found that id and image_id in annotations_train are indeed the same. Both comparison columns contain only one unique item, and that item is True, so id in annotations_train, image_id in annotations_train and id in images_train are the same and in the same order. Now let's start merging the dataframes.

In [None]:
# Merge annotations with images. The join uses the real foreign key
# (annotation image_id -> image id) rather than the annotation's own id, so
# the result stays correct even if the two id columns ever diverge.
# validate='one_to_one' makes pandas raise if either key contains duplicates
# instead of silently multiplying rows.
df_merge = pd.merge(
    annotations_train[['category_id', 'image_id']],
    images_train[['file_name', 'id', 'height', 'width']],
    left_on='image_id',
    right_on='id',
    validate='one_to_one',
)
# The image id is now redundant with image_id, so remove it.
df_merge = df_merge.drop(columns=['id'])
df_merge.head()


Before merging df_merge with categories_train, let's run some checks.

In [None]:
# Check that category_id in df_merge is compatible with id in categories_train.

# 1) Same lower limit.
print('true' if df_merge['category_id'].min() == categories_train['id'].min() else 'false')

# 2) Same upper limit.
print('true' if df_merge['category_id'].max() == categories_train['id'].max() else 'false')

# 3) Matching limits alone do not guarantee that every category_id has a
#    matching category row, so also count the ids with no match.
unmatched = ~df_merge['category_id'].isin(categories_train['id'])
print('category ids missing from categories_train:', int(unmatched.sum()))

From the results we see that category_id in df_merge has the same minimum and maximum as id in categories_train, so we merge those two dataframes.

In [None]:
# Give the category key the same name in both frames.
categories_train = categories_train.rename(columns={"id": "category_id"})

# Columns taken from each side of the merge.
image_columns = ['image_id', 'file_name', 'category_id', 'height', 'width']
label_columns = ['category_id', 'name', 'family', 'order']

# Attach the labels to every image row to create the train data, then add a
# column with the file path pointing to the location of each image.
df_train = (
    df_merge[image_columns]
    .merge(categories_train[label_columns], on='category_id')
    .assign(file_path=lambda frame: "../input/herbarium-2021-fgvc8/train/" + frame['file_name'])
)

df_train

In [None]:
# Drop rows in which EVERY column is null (how='all'); rows that are only
# partially null are kept by this call.
df_train = df_train.dropna(how = 'all')
# Count the remaining missing values per column.
df_train.isnull().sum()


In [None]:
# Check for duplicates: True if any file_name value occurs more than once.
df_train['file_name'].duplicated().any()

After checking the unique values in the order column, we found that some entries are 'Unknown'.

In [None]:
# Count the rows whose order is the literal string 'Unknown'; these are
# treated as missing data.
len(df_train.loc[df_train['order'] == 'Unknown'])


In [None]:
# Collect the index labels of the rows whose order is 'Unknown' and drop them.
unknown_order_index = df_train.index[df_train['order'] == 'Unknown']
df_train = df_train.drop(index=unknown_order_index)

Let's create a separate dataset for the image dimension data and then remove those columns from the training dataset.

In [None]:
# Keep the image dimensions, with their identifying ids, in their own frame.
dimension_columns = ['image_id', 'category_id', 'height', 'width']
dimentions = df_train.loc[:, dimension_columns]
dimentions

In [None]:
# The dimensions now live in their own frame, so remove them from df_train.
df_train = df_train.drop(columns=['height', 'width'])

In [None]:
# sort_values returns a new, sorted dataframe and leaves the original
# untouched, so the result must be assigned back for the sort to take effect.

# Sorting the test df on image_id.
df_test = df_test.sort_values(by=['image_id'])
# Sorting the train df on image_id.
df_train = df_train.sort_values(by=['image_id'])

df_train.head()

In [None]:
# Drop the intermediate dataframes that are not used again.
del annotations_train, categories_train, images_train, images_test, df_merge

# EDA

Let's check the number of items in each category.

In [None]:
# Number of rows in the train dataset.
len(df_train['image_id'])

In [None]:
# Number of rows in the test dataset.
len(df_test['image_id'])

In [None]:
# Number of distinct category_id values in the train dataset.
len(df_train['category_id'].unique())

In [None]:
# Number of distinct family values in the train dataset.
len(df_train['family'].unique())

In [None]:
# Number of distinct order values in the train dataset.
len(df_train['order'].unique())

We see that there are 2257710 images in the dataset, which can be divided into 64488 species.
There are 243020 images in the test dataset.
The name column gives the name of the plant as species+genus.
These species can be further grouped into 450 families and then into 80 orders.
Let's create a table for this.


In [None]:
# Names of the orders, in order of first appearance in df_train.
n = df_train['order'].unique().tolist()

# Group once by order instead of filtering the whole dataframe three times
# for every order. sort=False keeps the pass cheap; the results are
# re-aligned to `n` below, so the group ordering does not matter.
order_groups = df_train.groupby('order', sort=False)

# Number of families in each order, aligned with `n`.
# dropna=False counts a missing value as its own entry, like len(unique()).
# fill_value=0 covers any entry of `n` without a group (e.g. a missing order).
o_f_n = order_groups['family'].nunique(dropna=False).reindex(n, fill_value=0).tolist()

# Number of species (category ids) in each order, aligned with `n`.
o_s_n = order_groups['category_id'].nunique(dropna=False).reindex(n, fill_value=0).tolist()

# Number of images (rows) in each order, aligned with `n`.
o_i_n = order_groups.size().reindex(n, fill_value=0).tolist()

In [None]:
# Assemble the per-order summary in one go and rank the orders by how many
# images they contain, largest first.
table_order = pd.DataFrame({
    'order': df_train['order'].unique(),
    'Number_of_families_in_order': o_f_n,
    'Number_of_species_in_order': o_s_n,
    'Number_of_images_in_order': o_i_n,
}).sort_values(by=['Number_of_images_in_order'], ascending=False)

# Render the table as markdown text.
print(table_order.to_markdown())

In [None]:
# Creating a table that classifies species, families and orders by sample size.

# Sample-size thresholds; a group is counted when it has MORE images than the
# threshold (strict comparison).
List = [10, 100, 150, 200, 250, 300, 350, 400, 450, 500,
        550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]

# Row labels are generated from the thresholds so the two cannot drift apart.
sample_size = pd.DataFrame({'Sample_size': [f'more than {threshold}' for threshold in List]})

# Count the images per group once; the counts do not depend on the threshold,
# so there is no need to recompute them inside the loops.
species_counts = df_train['category_id'].value_counts()
family_counts = df_train['family'].value_counts()
order_counts = df_train['order'].value_counts()

# Number of species having more images than each threshold.
s_n = [int((species_counts > threshold).sum()) for threshold in List]

# Number of families having more images than each threshold.
f_n = [int((family_counts > threshold).sum()) for threshold in List]

# Number of orders having more images than each threshold.
o_n = [int((order_counts > threshold).sum()) for threshold in List]

sample_size['Number_of_species'] = s_n
sample_size['Number_of_families'] = f_n
sample_size['Number_of_orders'] = o_n

print(sample_size.to_markdown())

As we can see from the table, the number of images is heavily imbalanced when classified by order. Only 27 species have sample sizes of more than 1000.

# Visualize

In [None]:
# Number of samples in each order, as a bar chart.
fig, ax = plt.subplots(figsize=(15, 10))
df_train['order'].value_counts().plot.bar(ax=ax)
# Label the figure so it can be understood on its own.
ax.set_title('Number of training images per order')
ax.set_xlabel('Order')
ax.set_ylabel('Number of images')
# plt.show() renders the figure without echoing the Axes repr.
plt.show()

In [None]:
# Peek at the first row of df_train to recall its columns.
df_train.head(1)

In [None]:
# plotting image by image id for single image
def visualize(image_id):
    """Display one training image, titled with its order, family, name and id.

    The row is looked up in the notebook-level ``df_train`` dataframe; when
    several rows share the same ``image_id`` the first one is used.

    Parameters
    ----------
    image_id
        Value to match against the ``image_id`` column of ``df_train``.

    Raises
    ------
    IndexError
        If no row of ``df_train`` has this ``image_id``.
    OSError
        If ``cv2.imread`` cannot read the image at the row's ``file_path``.
    """
    # Filter once and reuse the row instead of repeating the same lookup for
    # every field.
    matches = df_train.loc[df_train['image_id'] == image_id]
    if matches.empty:
        # Same exception type as the bare ``.iloc[0]`` lookup, with a clearer message.
        raise IndexError(f"image_id {image_id!r} not found in df_train")
    record = matches.iloc[0]

    path = record['file_path']
    family = record['family']
    order = record['order']
    name = record['name']

    # cv2.imread returns None instead of raising when the file cannot be
    # read; fail here with a clear message rather than inside cvtColor.
    image = cv2.imread(path)
    if image is None:
        raise OSError(f"cv2.imread could not read image at {path!r}")
    # OpenCV loads images as BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    # A space now separates the order value from the "FAMILY:" label.
    plt.title(f"ORDER: {order} FAMILY: {family} \n NAME:{name}\n Image_id:{image_id}", fontsize=10)
    plt.show()

In [None]:
# Example call: display the training image whose image_id is 23445.
visualize(23445)