In [None]:
import os
import json
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Loading and cleaning data

In [None]:
# Load the competition metadata files.
# JSON text is UTF-8, so the encoding is passed explicitly instead of relying
# on the platform's locale default used by open().

# loading json file for train data
data_path_train = "../input/herbarium-2022-fgvc9/train_metadata.json"
with open(data_path_train, encoding="utf-8") as json_file:
    meta_train = json.load(json_file)

# loading json file for test data
data_path_test = "../input/herbarium-2022-fgvc9/test_metadata.json"
with open(data_path_test, encoding="utf-8") as json_file:
    meta_test = json.load(json_file)

In [None]:
#Finding keys in train dictionary
meta_train.keys()

In [None]:
# Flatten each top-level section of the training metadata into its own dataframe.
# The section keys are listed in the same order as the target names on the left.
(
    annotations_train,
    categories_train,
    images_train,
    genera_train,
    distance_train,
    licenses_train,
    institutions_train,
) = (
    pd.json_normalize(meta_train[section])
    for section in (
        'annotations',
        'categories',
        'images',
        'genera',
        'distances',
        'license',
        'institutions',
    )
)

Now lets check how each dataframe looks like

In [None]:
annotations_train.head()

In [None]:
categories_train.head()

In [None]:
images_train.head()

In [None]:
genera_train.head()

In [None]:
distance_train.head()

In [None]:
licenses_train.head()

In [None]:
institutions_train.head()

As we saw there is not much usefull information in genus, licences and institutions

In [None]:
# Release the lookup tables that the rest of the notebook does not use
del genera_train, licenses_train, institutions_train

In [None]:
# Build the test-set table: drop the license column, keep the two identifying
# columns, and derive the location of every image file from its file name.
df_test = (
    pd.DataFrame(meta_test)
    .drop(columns=['license'])
    .loc[:, ['image_id', 'file_name']]
    .assign(file_path=lambda frame: "../input/herbarium-2022-fgvc9/test_images/" + frame['file_name'])
)
df_test.head()

In [None]:
# Complete training table, built in three steps:
#   1. join images with their annotations on image_id
#   2. join the result with the category taxonomy on category_id
#   3. derive the image file location and a "<genus> <species>" display name
df_merge = (
    images_train[['image_id', 'file_name']]
    .merge(annotations_train[['genus_id', 'category_id', 'image_id']], on='image_id')
    .loc[:, ['genus_id', 'image_id', 'file_name', 'category_id']]
    .merge(
        categories_train[['category_id', 'scientificName', 'family', 'genus', 'species']],
        on='category_id',
    )
    .assign(
        file_path=lambda frame: "../input/herbarium-2022-fgvc9/train_images/" + frame['file_name'],
        name=lambda frame: frame['genus'] + ' ' + frame['species'],
    )
)

# Keep only the columns used downstream, in a fixed order
train_columns = ['category_id', 'genus_id', 'image_id', 'family', 'genus',
                 'species', 'name', 'file_name', 'file_path']
df_train = df_merge[train_columns]

df_train.head()

In [None]:
# The merged table replaces these three source frames, so release them
del annotations_train, categories_train, images_train


In [None]:
# Drop rows in which every column is null (how='all');
# rows that are only partially null are kept.
df_train = df_train.dropna(how = 'all')
# Count the missing values that remain in each column
df_train.isnull().sum()


In [None]:
# Check whether any file name occurs more than once (True means duplicates exist)
df_train['file_name'].duplicated().any()

# EDA

In [None]:
# Row count of the training table
print('number of images in train set')
df_train['image_id'].shape[0]

In [None]:
# Row count of the test table
print('number of images in test set')
df_test['image_id'].shape[0]

In [None]:
# Number of distinct category ids in the training table
print('number of specific types of plants')
len(df_train['category_id'].drop_duplicates())

In [None]:
# Number of distinct family values in the training table
print('number of familes')
len(df_train['family'].drop_duplicates())

In [None]:
# Number of distinct genus values in the training table
print('number of genus')
len(df_train['genus'].drop_duplicates())

In [None]:
# Number of distinct species values in the training table
print('number of species')
len(df_train['species'].drop_duplicates())

We see that there are 839772 images in the dataset which can be divided into 15501 specific plants, that can be identified by category id. There are 210407 images in test dataset. The name column gives the name of the plant as species+genus. These plants can be further classified into 6932 species, 2564 genus and 272 familes. Lets create a table for this.

In [None]:
# Family and genus names, in order of first appearance in df_train
n = df_train['family'].unique().tolist()
x = df_train['genus'].unique().tolist()

# Each statistic is computed with a single grouped pass instead of one
# full-table boolean scan per family / genus.
#   * sort=False keeps groups in first-appearance order.
#   * nunique(dropna=False) counts NaN as a value, matching len(Series.unique()).
#   * reindex(..., fill_value=0) aligns the result with n / x and yields 0 for a
#     key the groupby left out (a NaN family or genus), matching what an
#     equality mask against NaN would select (nothing).
by_family = df_train.groupby('family', sort=False)
by_genus = df_train.groupby('genus', sort=False)

# finding number of genus in each family
g_f_n = by_family['genus'].nunique(dropna=False).reindex(n, fill_value=0).tolist()

# finding number of species in each family
s_f_n = by_family['species'].nunique(dropna=False).reindex(n, fill_value=0).tolist()

# finding number of species in each genus
s_g_n = by_genus['species'].nunique(dropna=False).reindex(x, fill_value=0).tolist()

# finding number of images in each family
o_i_n = by_family.size().reindex(n, fill_value=0).tolist()

In [None]:
# Per-family summary table, ordered from the family with the most images down
table_order = pd.DataFrame(
    {
        'family': df_train['family'].unique(),
        'Number_of_genus_in_family': g_f_n,
        'Number_of_species_in_family': s_f_n,
        'Number_of_images_in_family': o_i_n,
    }
).sort_values(by='Number_of_images_in_family', ascending=False, ignore_index=True)

print(table_order.to_markdown())

In [None]:
# creating table to classify by sample size
List = [10,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]
# Row labels are generated from the thresholds so the two cannot drift apart
sample_size = pd.DataFrame({'Sample_size': [f'more than {threshold}' for threshold in List]})

# The frequency tables do not depend on the threshold, so each one is computed
# once here instead of once per threshold.
category_counts = df_train['category_id'].value_counts()
species_counts = df_train['species'].value_counts()
family_counts = df_train['family'].value_counts()
genus_counts = df_train['genus'].value_counts()

# finding number of plants (categories) with more than `threshold` images
p_n = [int((category_counts > threshold).sum()) for threshold in List]

# finding number of species with more than `threshold` images
s_n = [int((species_counts > threshold).sum()) for threshold in List]

# finding number of families with more than `threshold` images
f_n = [int((family_counts > threshold).sum()) for threshold in List]

# finding number of genus with more than `threshold` images
g_n = [int((genus_counts > threshold).sum()) for threshold in List]

sample_size['Number_of_plants'] = p_n
sample_size['Number_of_families'] = f_n
sample_size['Number_of_genus'] = g_n
sample_size['Number_of_speciess'] = s_n

print(sample_size.to_markdown())

In [None]:
# creating table to classify by sample size
List = [10,20,30,40,50,60,70,80,90,100]
# Row labels are generated from the thresholds so the two cannot drift apart
sample_size = pd.DataFrame({'Sample_size': [f'less than {threshold}' for threshold in List]})

# Images per category; independent of the threshold, so computed only once
category_counts = df_train['category_id'].value_counts()

# finding number of plant categories with fewer than `threshold` images
p_n = [int((category_counts < threshold).sum()) for threshold in List]

# NOTE: despite the column label, each value is a number of categories (those
# with fewer images than the row's threshold), not a number of images.
sample_size['Number_of_images_per plant_categories'] = p_n
print(sample_size.to_markdown())

In [None]:
# Size of the largest category (images per category_id)
print('Maximum number of samples available for a category')
int(df_train['category_id'].value_counts().max())

In [None]:
# Size of the smallest category (images per category_id)
print('Minimum number of samples available for a category')
int(df_train['category_id'].value_counts().min())

# Phylogenetic Distances Among Genera

There is also a set of pairwise phylogenetic distances among genera to test if the difference in morphological features of plant taxa well correspond to their taxonomic distances


In [None]:
distance_train.head()

# Visualize

In [None]:
# Bar chart of the number of training images in each family
fig, ax = plt.subplots(figsize=(25, 10))
df_train['family'].value_counts().plot.bar(ax=ax)
ax.set_title("value count in each family", fontsize=10)

In [None]:
# plotting image by image id for single image
def visualize(image_id):
    """Display one training image with its taxonomy in the title.

    Looks the image up in the notebook-level ``df_train`` table.

    Parameters
    ----------
    image_id
        Value to match against the ``image_id`` column of ``df_train``.

    Raises
    ------
    IndexError
        If no row of ``df_train`` has this ``image_id``.
    FileNotFoundError
        If OpenCV cannot read the file at the row's ``file_path``.
    """
    # Resolve the row once instead of re-scanning df_train for every field
    matches = df_train.loc[df_train['image_id'] == image_id]
    if matches.empty:
        raise IndexError(f"image_id {image_id!r} not found in df_train")
    record = matches.iloc[0]

    path = record['file_path']
    family = record['family']
    genus = record['genus']
    species = record['species']
    name = record['name']

    # cv2.imread reports an unreadable or missing file by returning None rather
    # than raising, so check before handing the result to cvtColor.
    image = cv2.imread(path)
    if image is None:
        raise FileNotFoundError(f"could not read image file: {path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.title(f"FAMILY: {family} GENUS: {genus} SPECIES: {species}\n NAME:{name}\n Image_id:{image_id}", fontsize=10)
    plt.show()

In [None]:
visualize('00000__002')

In [None]:
# plotting image by image id for multiple image. When an image id is given, function returns 20 images 
#in same class
def visualize_many(image_id):
    """Plot up to 20 training images from the same category as ``image_id``.

    The category is looked up in the notebook-level ``df_train`` table. A
    category with fewer than 20 images is shown in full, in table order; a
    larger one is sampled at random without replacement. Images are drawn on
    a 5 x 4 grid, each titled with its taxonomy and image id.

    Parameters
    ----------
    image_id
        Value to match against the ``image_id`` column of ``df_train``.

    Raises
    ------
    IndexError
        If no row of ``df_train`` has this ``image_id``.
    FileNotFoundError
        If OpenCV cannot read one of the selected image files.
    """
    train_image_path = "../input/herbarium-2022-fgvc9/train_images/"
    max_images = 20  # capacity of the 5 x 4 grid

    anchor = df_train.loc[df_train['image_id'] == image_id, 'category_id']
    if anchor.empty:
        raise IndexError(f"image_id {image_id!r} not found in df_train")
    df = df_train.loc[df_train['category_id'] == anchor.iloc[0]]

    # Only the choice of ids differs between small and large categories;
    # the plotting below is shared.
    if df['image_id'].count() < max_images:
        chosen_ids = df['image_id'].tolist()
    else:
        chosen_ids = np.random.choice(df['image_id'], max_images, replace=False).tolist()

    plt.figure(figsize=(18, 18))
    for position, chosen_id in enumerate(chosen_ids[:max_images], start=1):
        # One lookup per image; every displayed field comes from this row
        record = df.loc[df['image_id'] == chosen_id].iloc[0]

        image_file = os.path.join(train_image_path, record['file_name'])
        # cv2.imread returns None for an unreadable file instead of raising
        image = cv2.imread(image_file)
        if image is None:
            raise FileNotFoundError(f"could not read image file: {image_file}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        family = record['family']
        genus = record['genus']
        species = record['species']
        name = record['name']
        imageid = record['image_id']

        plt.subplot(5, 4, position)
        plt.imshow(image)
        plt.title(f"FAMILY: {family} GENUS: {genus} SPECIES: {species}\n NAME:{name}\n Image_id:{imageid}", fontsize=10)
        plt.axis("off")

    plt.show()

In [None]:
visualize_many('00000__003')

In [None]:
#plot image by family name
def visualize_family(name):
    """Plot up to 15 randomly chosen training images of one plant family.

    Rows are taken from the notebook-level ``df_train`` table and drawn on a
    3 x 5 grid, each titled with its taxonomy and image id. A family with
    fewer than 15 images is shown in full.

    Parameters
    ----------
    name
        Value to match against the ``family`` column of ``df_train``.

    Raises
    ------
    ValueError
        If no row of ``df_train`` belongs to this family.
    FileNotFoundError
        If OpenCV cannot read one of the selected image files.
    """
    train_image_path = "../input/herbarium-2022-fgvc9/train_images/"
    max_images = 15  # capacity of the 3 x 5 grid

    df = df_train.loc[df_train['family'] == name]
    if df.empty:
        raise ValueError(f"no images found for family {name!r}")

    # Sampling without replacement cannot draw more items than exist, so the
    # sample size is capped at the number of images available for the family.
    sample_count = min(max_images, len(df))
    chosen_ids = np.random.choice(df['image_id'], sample_count, replace=False).tolist()

    plt.figure(figsize=(18, 18))
    for position, chosen_id in enumerate(chosen_ids, start=1):
        # One lookup per image; every displayed field comes from this row
        record = df.loc[df['image_id'] == chosen_id].iloc[0]

        image_file = os.path.join(train_image_path, record['file_name'])
        # cv2.imread returns None for an unreadable file instead of raising
        image = cv2.imread(image_file)
        if image is None:
            raise FileNotFoundError(f"could not read image file: {image_file}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        family = record['family']
        genus = record['genus']
        species = record['species']
        # Named plant_name so the `name` parameter (the family) is not shadowed
        plant_name = record['name']
        imageid = record['image_id']

        plt.subplot(3, 5, position)
        plt.imshow(image)
        plt.title(f"FAMILY: {family} GENUS: {genus} SPECIES: {species}\n NAME:{plant_name}\n Image_id:{imageid}", fontsize=10)
        plt.axis("off")

    plt.show()

In [None]:
visualize_family('Brassicaceae')

Images have different background colours, frames, and other things anlong with plant species.