In [None]:
%%capture
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA - Exploratory data analysis

Upvote if you found this useful.

<p align = "center">
<img src = "https://econews.com.au/wp-content/uploads/2016/10/Crown-of-thorns-starfish-attached-healthy-coral-reef-Mackay-.jpg">
</p>
<p align = "center">
Crown-of-thorns starfish (Image from econews.com.au)
</p>

## Content

1. **Video frame stats**
2. **Train dataframe analysis: video analysis and annotations numbers (per video and sequence)**
3. **Visualizing some training examples**


In [None]:
import os
import ast
import PIL
import cv2
import pandas as pd
from os import listdir
from os.path import isfile,join
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Dataset overview
## 1. Video frames stats

In [None]:
# Root folder of the competition data; the frame folders and train.csv are joined onto it.
DATA_PATH = '/kaggle/input/tensorflow-great-barrier-reef'
images_path = os.path.join(DATA_PATH, 'train_images')
df_train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))

In [None]:
# df_train is already loaded by the previous cell, so train.csv is not read a second time.
# Path of every frame image: <images_path>/video_<video_id>/<video_frame>.jpg
df_train["img_path"] = (
    images_path
    + "/video_" + df_train["video_id"].astype(str)
    + "/" + df_train["video_frame"].astype(str)
    + ".jpg"
)

In [None]:
from PIL import Image

def video_stats(path):
    """Print the number of ``.jpg`` frames in a video folder and the size of one frame.

    Args:
        path: Directory that directly contains the frames of a single video.

    The size is read from the first file in sorted name order and reported as
    the size of the frames; the other files are not opened. When the folder
    holds no ``.jpg`` file, a frame count of 0 is printed followed by a notice,
    instead of raising ``IndexError``.
    """
    # Keep regular files with a .jpg extension. Sorting makes the probed frame
    # deterministic, since listdir returns entries in arbitrary order.
    frame_files = sorted(
        f for f in listdir(path) if f.endswith(".jpg") and isfile(join(path, f))
    )
    print(f'Number of frames: {len(frame_files)}')
    if not frame_files:
        print('No .jpg frames found, frame size unavailable')
        return
    # Image.open leaves the underlying file open; the context manager closes it.
    with Image.open(join(path, frame_files[0])) as im:
        width, height = im.size
    print(f'Frames with size (w,h): ({width},{height})')

In [None]:
# Print the frame statistics of each training video folder (video_0, video_1, video_2).
for position, video_number in enumerate((0, 1, 2)):
    if position:
        print()  # blank line between consecutive videos
    print(f'Video {video_number} Stats:')
    video_stats(join(images_path, f'video_{video_number}'))

## 2 Train dataframe analysis

- `video_id` - ID number of the video the image was part of. The video ids are not meaningfully ordered.
- `video_frame` - The frame number of the image within the video. Expect to see occasional gaps in the frame number from when the diver surfaced.
- `sequence` - ID of a gap-free subset of a given video. The sequence ids are not meaningfully ordered.
- `sequence_frame` - The frame number within a given sequence.
- `image_id` - ID code for the image, in the format '{video_id}-{video_frame}'
- `annotations` - The bounding boxes of any starfish detections in a string format that can be evaluated directly with Python. Does not use the same format as the predictions you will submit. Not available in **test.csv.** A bounding box is described by the pixel coordinate (x_min, y_min) of its upper left corner within the image together with its width and height in pixels.

### 2.1 Video details

Display the number of frames per video

In [None]:
# Number of frames per video = number of rows per video_id. The largest video_frame
# value is not used because frame numbers can contain gaps.
df_train_video_group = df_train.groupby("video_id")["video_frame"].count()

# One colour entry per bar, cycling through the palette if there are more videos than colours.
palette = px.colors.qualitative.Plotly
bar_colors = [palette[i % len(palette)] for i in range(len(df_train_video_group))]

fig = px.bar(df_train_video_group,
             color=bar_colors,
             labels={"video_id":"Video id", "value":"N° of frames", "variable":"Original Column Name"},
             title="Number of frames per video ID")

fig.update_layout(xaxis=dict(type='category'), showlegend=False)
fig.show()

However, each video is made up of several sequences. Let's look at them in more detail.

In [None]:
# Count the rows of each sequence to get its number of frames; unlike the largest
# sequence_frame value, the count is exact whatever number the frames start at.
# video_id is aggregated with max, as before, to get one video id per sequence.
df_train_sequence_group = (
    df_train.groupby("sequence")
    .agg(num_frames=("sequence_frame", "count"), video_id=("video_id", "max"))
    .sort_values(by="video_id")
)
df_train_sequence_group["video_id"] = df_train_sequence_group["video_id"].astype(str) # For label color mode

fig = px.bar(df_train_sequence_group,
             color="video_id",
             labels={"sequence":"Sequence id", "value":"Number Of Frames", "variable":"Original Column Name"},
             title="Number Of Frames In Each Sequence")

fig.update_layout(xaxis=dict(type='category'), showlegend=True)
fig.show()

### 2.2 Annotation analysis

In [None]:
# Train stats: a frame has no bounding box when its annotations value is the empty list.
# Comparing the string form works both on the raw CSV strings and on already-parsed
# lists (str([]) == '[]'), so the counts stay correct if this cell is re-run later.
samples_without_annotations = int((df_train['annotations'].astype(str) == '[]').sum())
samples_with_annotations = len(df_train) - samples_without_annotations
labels = ['Without bbox','With bbox']

fig = go.Figure([go.Bar(x=labels,
                        y=[samples_without_annotations, samples_with_annotations],
                        marker_color=px.colors.qualitative.Plotly[:2])])
fig.update_layout(title="Training frames with and without bounding boxes",
                  yaxis_title="Number of frames")
fig.show()
print(f'Number of training samples: {len(df_train)}')
print(f'Training samples without object labels: {samples_without_annotations}')
print(f'Training samples with object labels: {samples_with_annotations}')

In [None]:
# Parse the annotation strings into Python objects. Values that are no longer strings
# are left untouched, so re-running this cell does not make ast.literal_eval raise.
df_train["annotations"] = df_train["annotations"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
# Number of bounding boxes per frame = length of the parsed annotation list.
df_train["num_boxes"] = df_train["annotations"].apply(len)
# Ids are stored as strings (presumably so the plots treat them as categories).
df_train["video_id"] = df_train["video_id"].astype(str)
df_train["sequence"] = df_train["sequence"].astype(str)

In [None]:
# Number of frames for each bounding-box count (index: num_boxes, values: frame count).
df_annotations_count = df_train["annotations"].groupby(df_train["num_boxes"]).count()

In [None]:
# Drop the "0 boxes" bucket; errors="ignore" keeps the cell re-runnable once the label is gone.
df_annotations_count = df_annotations_count.drop([0], errors="ignore")

In [None]:
# Information about the number of samples with bounding boxes
fig = px.bar(df_annotations_count,
             labels={"num_boxes":"N° of boxes per frame", "value":"N° of frames"},
             title="Number of frames for each bounding-box count")
fig.update_layout(xaxis=dict(type='category'), showlegend=False)
fig.show()

In [None]:
# Histogram of frames per sequence, coloured by the number of bounding boxes in the frame
sequence_hist_labels = {"sequence":"Sequence ID", "num_boxes":"N° of Boxes per frame"}
fig = px.histogram(
    df_train,
    x="sequence",
    color="num_boxes",
    labels=sequence_hist_labels,
    title="Number of annotations in each sequence",
)
fig.show()

## 3 Visualizing some training examples

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt

def tf_load_img(img_path, reshape_to=None):
    """Read an image file and decode it into a 3-channel tensor.

    Args:
        img_path: Path of the image file to read.
        reshape_to: Optional target size handed to ``tf.image.resize``; when
            ``None`` the decoded image is returned without resizing.

    Returns:
        The decoded image, resized only when ``reshape_to`` is given.
    """
    decoded = tf.image.decode_image(tf.io.read_file(img_path), channels=3)
    if reshape_to is not None:
        return tf.image.resize(decoded, reshape_to)
    return decoded
    
def get_tl_br(bbox):
    """Return the top-left and bottom-right corners of a bounding box.

    Args:
        bbox: Mapping with the keys 'x', 'y', 'width' and 'height'.

    Returns:
        A pair of points ``((x, y), (x + width, y + height))``.
    """
    left, top = bbox['x'], bbox['y']
    right = left + bbox["width"]
    bottom = top + bbox["height"]
    return (left, top), (right, bottom)
    
def plot_image(img_path, annotations=None, **kwargs):
    """Show an image, with its bounding boxes drawn on top when any are given.

    Args:
        img_path: Path of the image file to display.
        annotations: Optional sequence of box mappings with the keys 'x', 'y',
            'width' and 'height'; ``None`` or an empty sequence shows the
            image without boxes.
        **kwargs: Accepted and ignored, so that a whole dataframe row can be
            unpacked into the call.
    """
    canvas = np.array(tf_load_img(img_path))
    has_boxes = bool(annotations)

    plt.figure(figsize=(20,10))
    if has_boxes:
        for idx, box in enumerate(annotations):
            top_left, bottom_right = get_tl_br(box)
            # The colour varies with the box index: first channel down, second channel up.
            canvas = cv2.rectangle(canvas, top_left, bottom_right, (255-2*idx,14*idx,0), 4)
        caption = f"Bounding boxes plotted: ({len(annotations)})"
    else:
        caption = "No bounding boxes within the image"

    plt.imshow(canvas)
    plt.axis(False)
    plt.title(caption)
    plt.tight_layout()
    plt.show()

In [None]:
# Show one example frame for each of these bounding-box counts.
num_bbox_for_visualization = [1,3,7,15]
for num_bbox in sorted(num_bbox_for_visualization):
    matching_rows = df_train[df_train.num_boxes==num_bbox]
    if matching_rows.empty:
        # .iloc[0] on an empty selection would raise IndexError
        print(f"No frame with {num_bbox} bounding boxes, skipping")
        continue
    ex_row = matching_rows.iloc[0]
    # The row's columns are passed as keyword arguments; plot_image ignores the extra ones.
    plot_image(**ex_row)