In [None]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
# Base directory paths kept as plain string constants.
# NOTE(review): none of these three names is referenced by the cells visible here,
# which hard-code their own '/kaggle/input/...' paths — confirm whether later cells use them.
input_dir = '/kaggle/input'
working_dir = '/kaggle/working/'
temp_dir = '/kaggle/temp/'

## (A) Train CSV

In [None]:
# Load train.csv into a DataFrame, then report its (rows, columns) shape and the
# number of missing values in each column before previewing the first five rows.
train_df = pd.read_csv(filepath_or_buffer='/kaggle/input/tensorflow-great-barrier-reef/train.csv')
print(train_df.shape)
print(train_df.isnull().sum())
train_df.head(n=5)

## (A) 1 - Let's see some metadata stats, overall and for each video_id

In [None]:
def get_sequence_stats(dataframe):
    """Print sequence statistics for `dataframe` and plot its `sequence` column.

    Prints the number of distinct values in the `sequence` column, followed by
    the row count of every sequence value ordered by sequence ID, then draws the
    `sequence` column as a line plot against the frame's index.

    Args:
        dataframe: pandas DataFrame with a `sequence` column.

    Returns:
        None. Output is printed and shown as a matplotlib figure.
    """
    sequence_col = dataframe['sequence']
    distinct_sequences = sequence_col.unique()
    print(f'Number of sequences : {len(distinct_sequences)}')
    print('{Sequence ID}   {Num of imgs in Sequence (or Sequence length)}')
    # value_counts(sort=False) leaves the counts unsorted; sort_index orders them by sequence ID.
    rows_per_sequence = sequence_col.value_counts(sort=False).sort_index()
    print(rows_per_sequence)
    _, axes = plt.subplots()
    sequence_col.plot(ax=axes)
    plt.show()

def get_video_frame_stats(dataframe):
    """Print the number of distinct `video_frame` values and plot the column.

    Args:
        dataframe: pandas DataFrame with a `video_frame` column.

    Returns:
        None. Output is printed and shown as a matplotlib figure.
    """
    frame_numbers = dataframe['video_frame']
    distinct_frames = frame_numbers.unique()
    print(f'Number of video_frames : {len(distinct_frames)}')
    # Line plot of the frame number against the DataFrame index.
    _, axes = plt.subplots()
    frame_numbers.plot(ax=axes)
    plt.show()
    
def get_annotation_stats(dataframe):
    """Print bounding-box count statistics derived from the `annotations` column.

    Adds a `bbox_count` column to `dataframe` IN PLACE (callers in this notebook
    pass a copy), prints how many images have each bbox count, the total bbox
    count, and how many images have zero / at least one bbox. Finally, the rows
    with at least one bbox are parsed with `get_parsed_annotation` and handed to
    `bbox_stats` for area statistics.

    Args:
        dataframe: pandas DataFrame with a string-valued `annotations` column.

    Returns:
        None. All results are printed (and plotted by `bbox_stats`).
    """
    # The bbox count is taken as the number of '{' characters in the raw annotation
    # string (presumably one dict literal per bbox — confirm against the CSV format).
    dataframe['bbox_count'] = dataframe.annotations.apply(lambda x: x.count('{'))
    print('{BBox Count}   {Num of images with this many BBoxes}')
    print(dataframe['bbox_count'].value_counts(sort=False).sort_index())
    print(f'Total Number of bboxes : {dataframe.bbox_count.sum()}')

    has_bbox = dataframe.bbox_count != 0
    num_img_with_bbox = has_bbox.sum()
    num_img_with_no_bbox = (~has_bbox).sum()
    print(f'Total Number of images (or video_frames) : {dataframe.shape[0]}')
    # Fixed label: this line reports images WITHOUT any bbox, i.e. an empty annotation.
    print(f'Total Number of images having empty annotation (i.e no bbox) : {num_img_with_no_bbox}')
    print(f'Total Number of images having non-empty annotation : {num_img_with_bbox}')

    # Only rows that actually carry bboxes are parsed and summarised.
    dataframe_with_bbox = get_parsed_annotation(dataframe[has_bbox].copy())
    print('BBoxes Stats (for img having bboxes): ')
    bbox_stats(dataframe_with_bbox.copy())
    
def bbox_stats(dataframe):
    """Print mean/median bbox area and plot the area values and their histogram.

    Each row of `dataframe` is expected to hold, in its `height` and `width`
    columns, two parallel lists with one entry per bbox (as produced by
    `get_parsed_annotation`). The area of every bbox is height * width.

    Args:
        dataframe: pandas DataFrame with list-valued `height` and `width` columns.

    Returns:
        None. Statistics are printed and two matplotlib figures are shown.
        When there are no bboxes at all, a notice is printed and nothing is plotted.
    """
    areas = [
        height * width
        for height_list, width_list in zip(dataframe['height'], dataframe['width'])
        for height, width in zip(height_list, width_list)
    ]
    if not areas:
        # Without this guard the stats would be NaN and plotting an empty Series can raise.
        print('No bboxes found : skipping bbox_area stats')
        return

    bbox_area = pd.Series(areas)
    print(f'Mean of bbox_area : {bbox_area.mean()}')
    print(f'Median of bbox_area : {bbox_area.median()}')

    # Area of each bbox in the order the bboxes were encountered.
    fig, ax = plt.subplots()
    bbox_area.plot(ax=ax)
    ax.set(title='BBox area per bbox', xlabel='bbox index', ylabel='area (height * width)')
    plt.show()

    # Distribution of bbox areas.
    fig, ax = plt.subplots()
    bbox_area.hist(ax=ax)
    ax.set(title='BBox area distribution', xlabel='area (height * width)', ylabel='number of bboxes')
    plt.show()
    
def get_parsed_annotation(dataframe):
    """Parse the `annotations` strings and add per-image bbox width/height lists.

    Adds three columns to `dataframe` in place and returns the same object:
      - `parsed_annotation`: the annotation string parsed into a list; each
        element is indexed with the keys 'width' and 'height' below.
      - `width`: list of the 'width' value of every element.
      - `height`: list of the 'height' value of every element.

    Args:
        dataframe: pandas DataFrame with a string-valued `annotations` column,
            each cell holding a Python-literal sequence of dicts.

    Returns:
        The same DataFrame, with the three columns added.

    Raises:
        ValueError, SyntaxError: if an annotation string is not a valid Python literal.
    """
    # ast.literal_eval accepts only Python literals, so text coming from the CSV
    # cannot execute arbitrary code the way eval() would.
    dataframe['parsed_annotation'] = dataframe.annotations.apply(lambda x: list(ast.literal_eval(x)))
    dataframe['width'] = dataframe.parsed_annotation.apply(lambda x: [i['width'] for i in x])
    dataframe['height'] = dataframe.parsed_annotation.apply(lambda x: [i['height'] for i in x])
    return dataframe
    
def get_metadata_stats(dataframe):
    """Print the metadata report for the whole frame, then once per `video_id`.

    The whole-frame part prints the number of distinct videos followed by the
    sequence and annotation reports. The per-video part repeats the sequence,
    video_frame and annotation reports on the rows of each `video_id`, in the
    order the ids first appear. Every helper receives its own copy of the data.

    Args:
        dataframe: pandas DataFrame with a `video_id` column plus whatever
            columns the individual report helpers read.

    Returns:
        None. Everything is printed / plotted by the helpers.
    """
    def _run_sections(frame, sections):
        # Each section: heading, report on a private copy, then a blank line.
        for heading, report in sections:
            print(heading)
            report(frame.copy())
            print()

    video_ids = dataframe['video_id'].unique()

    print(f'Total number of videos : {len(video_ids)}')
    print()
    _run_sections(dataframe, (
        ('Cummulative sequence stats : ', get_sequence_stats),
        ('Cummulative annotation stats : ', get_annotation_stats),
    ))
    print('- - - - - - - - - - - - - - - - - - - -')

    # Same reports (plus video_frame stats) restricted to one video at a time.
    for current_id in video_ids:
        rows_of_video = dataframe.loc[dataframe['video_id'] == current_id]
        print()
        print(f'Video ID : {current_id}')
        print()
        _run_sections(rows_of_video, (
            ('Cummulative sequence stats : ', get_sequence_stats),
            ('Cummulative video_frame stats : ', get_video_frame_stats),
            ('Cummulative annotation stats : ', get_annotation_stats),
        ))
        print('# # # # # # # # # # # # # # # # # # # # # #')
        

In [None]:
# Print the overall and per-video metadata report for the training frame.
# A copy is passed, so `train_df` itself is never handed to the report helpers.
get_metadata_stats(train_df.copy())

### (A) 2 - Checking Images

In [None]:
from PIL import Image

# Folder scanned below: every entry in it is treated as a sub-folder of image files.
img_dir = '/kaggle/input/tensorflow-great-barrier-reef/train_images'

# Maps each distinct image size (the tuple reported by PIL's `Image.size`)
# to the number of image files that have that size.
train_img_dim = {}
# sorted() makes the traversal (and therefore the printed dict order) reproducible;
# os.listdir returns entries in arbitrary order.
for video_dir in sorted(os.listdir(img_dir)):
    images = sorted(os.listdir(os.path.join(img_dir, video_dir)))
    for img in images:
        image_path = os.path.join(img_dir, video_dir, img)
        # Context manager releases the file handle as soon as the size is read,
        # instead of leaving thousands of open files to the garbage collector.
        with Image.open(image_path) as img_pil:
            size = img_pil.size
        train_img_dim[size] = train_img_dim.get(size, 0) + 1
print(train_img_dim)
        