In this notebook I'll try a simple EDA to understand the composition of our data. I already did a first EDA in my previous notebook [here](https://www.kaggle.com/amelnozieres/eda-sweetviz-profiling)

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import json
from keras.preprocessing import image
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Explore the JSON files

In [1]:
def read_json(json_path):
    """Load an annotation JSON file into one DataFrame per top-level table.

    Args:
        json_path: Path to a JSON file whose top-level object has the keys
            "annotations", "categories", "info", "images" and "licenses".

    Returns:
        A tuple ``(annotations, category, info, images, licenses)`` of
        DataFrames. ``info`` is built with ``orient='index'``, so every key
        of the "info" object becomes one row.

    Raises:
        KeyError: If one of the five expected keys is missing.
    """
    # The context manager closes the file even if parsing fails. JSON is
    # UTF-8 by specification, so do not depend on the platform default.
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)

    # One DataFrame per table of the annotation file.
    annotations = pd.DataFrame(data["annotations"])
    category = pd.DataFrame(data["categories"])
    info = pd.DataFrame.from_dict(data["info"], orient='index')
    images = pd.DataFrame(data['images'])
    licenses = pd.DataFrame(data['licenses'])
    return annotations, category, info, images, licenses

In [1]:
# Annotation files of the train and validation splits.
json_train_path = '/kaggle/input/fungi-annotations/train.json'
json_val_path = '/kaggle/input/fungi-annotations/val.json'

# read_json returns the five tables of a file, in this order.
(tr_annotation, tr_category, tr_info,
 tr_images, tr_licenses) = read_json(json_train_path)
(val_annotation, val_category, val_info,
 val_images, val_licenses) = read_json(json_val_path)

In [1]:
def create_merged_json_df(images, annotations, category):
    """Build one row per image with its category and file-path parts.

    Args:
        images: DataFrame with at least the columns ``id`` and ``file_name``.
        annotations: DataFrame with at least the columns ``image_id`` and
            ``category_id``.
        category: DataFrame with at least the column ``id``; its remaining
            columns are attached to every image of that category.

    Returns:
        A new DataFrame (the inputs are not modified) holding the image
        columns (the image ``id`` ends up as ``id_x`` after the merge), the
        category columns, ``category_id`` as a string, plus ``image_name``
        and ``subdir`` derived from ``file_name``.
    """
    data_df = images.copy()

    # Add the target column by looking the category up by image id. A
    # row-by-row comparison of the two frames would only be right when both
    # are in exactly the same order. If an image has several annotations,
    # the first one is used.
    image_to_category = (
        annotations.drop_duplicates(subset='image_id')
        .set_index('image_id')['category_id']
    )
    data_df['category_id'] = data_df['id'].map(image_to_category)

    # Attach the category attributes; the duplicated key column coming from
    # `category` is suffixed `id_y` by the merge and is not needed.
    data_df = pd.merge(left=data_df, right=category, how='left',
                       left_on='category_id', right_on='id')
    data_df = data_df.drop(columns=['id_y'])

    # Store the label as a string so it is handled as a categorical value.
    data_df['category_id'] = data_df['category_id'].astype(str)

    # Add the image name (third path component) and its subdirectory
    # (everything after the first '/'). `n` must be passed by keyword:
    # recent pandas versions reject it as a positional argument.
    data_df['image_name'] = data_df['file_name'].str.split('/').str[2]
    data_df['subdir'] = data_df['file_name'].str.split('/', n=1).str[1]
    return data_df

In [1]:
# One merged frame per split: images + category label + path parts.
tr_data = create_merged_json_df(
    images=tr_images, annotations=tr_annotation, category=tr_category
)
val_data = create_merged_json_df(
    images=val_images, annotations=val_annotation, category=val_category
)

In [1]:
# Display the merged training frame built by create_merged_json_df.
tr_data

In [1]:
# Column dtypes of the merged training frame (category_id was cast to str).
tr_data.dtypes

In [1]:
# Number of distinct categories in the training split.
tr_data['category_id'].nunique(dropna=False)

In [1]:
# Number of distinct categories in the validation split.
val_data['category_id'].nunique(dropna=False)

The training and the validation data contain the same number of categories: 1394 classes.

In [1]:
# Number of training images per category, largest categories first.
tr_data['category_id'].value_counts()

So far this is a bad distribution: the number of images differs from one category to another.

In [1]:
# Summary of the `name` column (presumably the category name merged in from
# the category table -- confirm against the JSON schema).
tr_data['name'].describe()

So here we can see that there isn't the same number of images for each class in the training data. But it is balanced in the validation data.

Let's check the distribution of our data by the target column

In [1]:
# Attach to every row the number of training images in its category.
category_sizes = tr_data['category_id'].value_counts()
tr_data['freq'] = tr_data['category_id'].map(category_sizes)

In [1]:
# Display the training frame again, now including the `freq` column.
tr_data

In [1]:
# Bar chart of the number of training images per category.
fig_dims = (15,15)

fig, ax = plt.subplots(figsize=fig_dims)

# `freq` is constant within a category, so one row per category carries all
# the information. Passing every image row makes seaborn aggregate and
# bootstrap error bars over identical values, which is slow and draws the
# same bars. drop_duplicates keeps first-appearance order, so the bar order
# is unchanged.
freq_per_category = tr_data.drop_duplicates(subset='category_id')
ax = sns.barplot(x = 'category_id', y = 'freq', data = freq_per_category, ax = ax)
ax.set_title('Distribution of the target Category_id')
ax.set_xlabel('Category_id')
ax.set_ylabel('Count of images')
plt.show()

In [1]:
# Box plot of `freq`: for every training image, the size of its category.
fig_dims = (15,15)
fig, ax = plt.subplots(figsize=fig_dims)
sns.boxplot(x=tr_data['freq'], ax=ax)
# The x axis carries image counts, not category ids, and a single-variable
# box plot has no y variable, so only the x axis is labelled.
ax.set_title('Category size seen by each training image')
ax.set_xlabel('Count of images in the category')
plt.show()

In [1]:
# Count of training images for every category id.
fig_dims = (15,15)
fig, ax = plt.subplots(figsize=fig_dims)

ax = sns.countplot(data=tr_data, x='category_id', ax=ax)
ax.set(
    title='Distribution of the target Category_id',
    xlabel='Category_id',
    ylabel='Count of images',
)
plt.show()

In [1]:
# Box plot of the number of training images per category (one value per
# category, unlike the per-image `freq` column).
fig_dims = (15,15)
fig, ax = plt.subplots(figsize=fig_dims)
sns.boxplot(x=tr_data['category_id'].value_counts(), ax=ax)
# The x axis carries image counts, not category ids, and a single-variable
# box plot has no y variable, so only the x axis is labelled.
ax.set_title('Distribution of the number of images per category')
ax.set_xlabel('Count of images per category')
plt.show()

So the number of images is not the same across categories, which will skew our predictions. Let's check whether the same holds for the validation data.

In [1]:
# Histogram + density of the category ids in the validation split.
fig_dims = (15,15)

fig, ax = plt.subplots(figsize=fig_dims)

# category_id is stored as a string; convert it explicitly instead of relying
# on distplot's internal float coercion.
# NOTE(review): distplot is deprecated in recent seaborn releases -- consider
# histplot once the installed seaborn version is confirmed.
val_category_ids = pd.to_numeric(val_data['category_id'], errors='coerce')
sns.distplot(val_category_ids, ax=ax)
ax.set_title('Distribution of the target Category_id (validation)')
ax.set_xlabel('Category_id')
ax.set_ylabel('Density')
plt.show()

In [1]:
# Count of validation images for every category id.
fig_dims = (15,15)
fig, ax = plt.subplots(figsize=fig_dims)

ax = sns.countplot(data=val_data, x='category_id', ax=ax)
ax.set(
    title='Distribution of the target Category_id',
    xlabel='Category_id',
    ylabel='Count of images',
)
plt.show()

So we have an imbalanced training dataset. yay! 