# 0. Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image

import ast
import os
import cv2

def EDA_plot_image_and_annotations(vid_id, vid_frame):
    """Display one training frame with its annotated boxes outlined in red.

    The matching row is looked up in the module-level ``train_csv`` frame by
    ``video_id`` and ``video_frame``. Its ``annotations`` cell is parsed with
    ``ast.literal_eval`` and each parsed item is read through the keys
    ``'x'``, ``'y'``, ``'width'`` and ``'height'``.

    Args:
        vid_id: Value compared against the ``video_id`` column; also used to
            build the image directory name.
        vid_frame: Value compared against the ``video_frame`` column; also
            used as the image file name (without extension).

    Raises:
        IndexError: If no row of ``train_csv`` matches the given pair.
    """
    entry = train_csv[(train_csv['video_id'] == vid_id) & (train_csv['video_frame'] == vid_frame)]
    PATH = f"../input/tensorflow-great-barrier-reef/train_images/video_{vid_id}/{vid_frame}.jpg"

    # Close the file handle as soon as the pixels are copied into the array;
    # otherwise every call leaves an open file behind until garbage collection.
    with Image.open(PATH) as image_file:
        img = np.array(image_file)

    fig, ax = plt.subplots(1, figsize=(10, 8))
    ax.axis('off')
    ax.imshow(img)

    # The annotations cell is a string literal; turn it into Python objects.
    boxes = ast.literal_eval(entry.annotations.values[0])
    for box in boxes:
        rect = patches.Rectangle(
            (box['x'], box['y']),
            box['width'],
            box['height'],
            linewidth=2,
            edgecolor='r',
            facecolor="none",
        )
        ax.add_patch(rect)
    plt.show()

# 1. EDA

In [None]:
# Load the train and test tables; later cells read them as notebook globals.
train_csv = pd.read_csv('../input/tensorflow-great-barrier-reef/train.csv')
test_csv = pd.read_csv('../input/tensorflow-great-barrier-reef/test.csv')
# Print column names, dtypes and non-null counts of the training table.
train_csv.info()

In [None]:
# Preview the first rows of the training table.
train_csv.head()

In [None]:
# Preview the first rows of the test table.
test_csv.head()

In [None]:
# List the distinct video IDs that occur in each table.
print(f"Video IDs in the train set: {train_csv.video_id.unique()}")
print(f"Video IDs in the test set: {test_csv.video_id.unique()}")

In [None]:
# One bar per video ID (0, 1, 2): the number of rows of train_csv with that ID.
labels = ["0", "1", "2"]
num_seq = [int((train_csv['video_id'] == vid).sum()) for vid in range(3)]

fig, ax = plt.subplots(figsize=(9, 6))
ax.set_facecolor('aliceblue')
# zorder=0 for the grid and zorder=3 for the bars keeps the grid behind them.
ax.grid(color="gray", linestyle="-", zorder=0)
ax.set_ylabel("Number of Frames", fontsize=16, fontweight="bold")
ax.set_xlabel("Video ID", fontsize=16, fontweight="bold")
ax.set_title("Length of train videos", fontsize=20, fontweight="bold")
ax.bar(labels, num_seq, color="orange", zorder=3)
plt.show()

In [None]:
# Read one example frame from disk and show the shape of the resulting array.
ex_pic = plt.imread('../input/tensorflow-great-barrier-reef/train_images/video_1/10015.jpg')
ex_pic.shape

In [None]:
# Parse every annotation string into a Python object, then store its length
# (the number of annotated boxes in that frame) in a new column.
parsed_annotations = train_csv["annotations"].map(ast.literal_eval)
train_csv["number_fishs"] = parsed_annotations.map(len)
train_csv.head()

In [None]:
# Highest per-frame annotation count in the training table.
max_num = int(train_csv["number_fishs"].max())

# Several frames may share that maximum; draw one of them at random.
max_sample = train_csv.loc[train_csv["number_fishs"] == max_num].sample()
max_vid_id = max_sample["video_id"].iloc[0]
max_vid_frame = max_sample["video_frame"].iloc[0]

# The ANSI escape codes switch bold text on and off around the message.
print(
    '\033[1m'
    + f"Maximum number of starfish in one frame: {max_num} (Video {max_vid_id}, Frame {max_vid_frame})"
    + '\033[0m'
)
EDA_plot_image_and_annotations(max_vid_id, max_vid_frame)

In [None]:
# Category labels "0".."18": one per possible number of annotations in a frame.
cats = [str(i) for i in range(19)]
dict_counts = dict()

# For each video, compute the share of its frames that carry 0, 1, ..., 18
# annotations. Position k of each list corresponds to category cats[k].
for i in range(3):
    set_ = train_csv[train_csv.video_id == i]
    # reindex puts every category at its own position and fills the counts
    # that never occur with 0. Appending zeros at the end instead would shift
    # later values to the wrong category whenever a count is missing in the
    # middle, and Series.append no longer exists in pandas >= 2.0.
    # NOTE: counts outside 0..18 are dropped by the reindex; the category
    # list above assumes they do not occur.
    vid_id_counts = (
        set_["number_fishs"]
        .value_counts()
        .reindex(range(len(cats)), fill_value=0)
    )
    dict_counts[f"Video {i}"] = [count / len(set_) for count in vid_id_counts.values]

def survey(results, category_names):
    """Draw a horizontal stacked bar chart with one bar per key of ``results``.

    Args:
        results: Mapping from a bar label to a sequence of numbers, one number
            per category. All sequences must have the same length so that
            they form a 2-D array.
        category_names: Category labels used for the legend, in the same
            order as the numbers inside each sequence.

    Returns:
        The ``(fig, ax)`` pair of the created figure and its axes.
    """
    row_labels = list(results)
    values = np.array([results[key] for key in row_labels])
    # The running total along each row gives the right edge of every segment.
    right_edges = np.cumsum(values, axis=1)
    palette = plt.colormaps['RdYlGn'](np.linspace(0.15, 0.85, values.shape[1]))

    fig, ax = plt.subplots(figsize=(16.1, 6))
    ax.invert_yaxis()
    ax.xaxis.set_visible(False)
    ax.set_xlim(0, values.sum(axis=1).max())

    for col, (name, shade) in enumerate(zip(category_names, palette)):
        segment = values[:, col]
        # Each segment starts where the previous categories of that row end.
        ax.barh(
            row_labels,
            segment,
            left=right_edges[:, col] - segment,
            height=0.5,
            label=name,
            color=shade,
        )

    ax.legend(
        ncol=len(category_names),
        bbox_to_anchor=(0, 1),
        loc='lower left',
        fontsize='small',
    )

    return fig, ax

# Draw the stacked bars: one row per video, one segment per annotation count.
survey(dict_counts, cats)
plt.show()

This figure shows, for each of the three videos, the relative share of frames containing a given number of annotations. As we can see, in all three videos most frames contain no annotations. The shortest video, video 0, has at most five starfish in a single frame, while the other videos contain up to 18 starfish in one frame. On the other hand, videos 1 and 2 have, relatively speaking, no starfish in front of the camera most of the time.