In [None]:
import ast
import os

import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

# Show up to 200 characters per cell so the long `annotations` strings are not truncated.
pd.options.display.max_colwidth = 200
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and removed in 3.8.
# Prefer the renamed 'seaborn-v0_8' style and fall back to the old name on older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Data exploration

## 1. Dataset files

In [None]:
# Root directory of the dataset; every path used below is built from it.
data_path = '../input/tensorflow-great-barrier-reef'
# List the top-level entries of the dataset directory.
!ls {data_path}

In [None]:
# greatbarrierreef/ : image delivery API; list the files it contains.
!ls {os.path.join(data_path, 'greatbarrierreef/')}

In [None]:
# train_images/ : training data folder, containing 3 video folders named video_{video_id}
!ls {os.path.join(data_path, 'train_images/')}

In [None]:
# Video folder : contains the video frames as {video_frame_number}.jpg; only the first 5 entries are shown.
!ls {os.path.join(data_path, 'train_images/video_0/')} | head -n 5

## 2. train.csv

### Raw data

In [None]:
# Load the per-frame metadata table and preview its first rows.
train_csv_path = os.path.join(data_path, 'train.csv')
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the metadata table.
df_train.info()


- video_id - ID number of the video the image was part of.
- sequence - ID of a gap-free subset of a given video.
- video_frame - The frame number of the image within the video.
- sequence_frame - The frame number within a given sequence.
- image_id - ID code for the image, in the format '{video_id}-{video_frame}'
- annotations - The bounding boxes of any starfish detections in a string format.

### Starfish annotations example

In [None]:
# annotations : list of dict (x = x_min, y = y_min)
# Show the raw annotation string of one randomly chosen frame that has at least one box.
has_annotation = df_train['annotations'] != '[]'
df_train.loc[has_annotation, 'annotations'].sample(1).iloc[0]

### Video size

In [None]:
# Report the image size and colour mode of each video, read from its `0.jpg` frame.
video_ids = df_train['video_id'].unique()
print(f'Video count : {len(video_ids)}')
for video_id in video_ids:
    img_path = os.path.join(data_path, 'train_images', f'video_{video_id}', '0.jpg')
    # The context manager closes the file handle once size and mode have been read.
    with Image.open(img_path) as im:
        print(f'Video {video_id} : {im.size}, {im.mode}')

# Frame size as (width, height); used as axis bounds by the plots further down.
SIZE = (1280, 720)

## 3. Videos

### Relationship between videos, sequences, frames and annotations

In [None]:
def get_video_data(df_train):
    """Count frames and annotated frames per video and per sequence.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Table with at least the columns 'video_id', 'sequence' and
        'annotations'. A frame counts as annotated when its 'annotations'
        value is non-null and different from the string '[]'.

    Returns
    -------
    dict
        ``{video_id: {'frames_count': int,
                      'frames_with_annot_count': int,
                      'sequence': {sequence_id: {'frames_count': int,
                                                 'frames_with_annot_count': int}}}}``
        Videos and sequences appear in their order of first occurrence.
    """
    video_data = {}
    for video_id in df_train['video_id'].unique():
        df_video = df_train.loc[df_train['video_id'] == video_id]
        data_sequence = {}
        for sequence in df_video['sequence'].unique():
            df_sequence = df_video.loc[df_video['sequence'] == sequence]
            # Build the mask from the sequence's own rows: a mask taken from the
            # full table only works through index alignment (and breaks on a
            # non-unique index) while rescanning every row for each sequence.
            has_annot = df_sequence['annotations'] != '[]'
            data_sequence[sequence] = {
                'frames_count': len(df_sequence),
                # .count() ignores null annotations, as the mask alone would not.
                'frames_with_annot_count': df_sequence.loc[has_annot, 'annotations'].count(),
            }
        video_data[video_id] = {
            'frames_count': sum(seq['frames_count'] for seq in data_sequence.values()),
            'frames_with_annot_count': sum(
                seq['frames_with_annot_count'] for seq in data_sequence.values()
            ),
            'sequence': data_sequence,
        }
    return video_data

def print_video_data(video_data):
    """Print a text summary of the structure returned by ``get_video_data``.

    For every video one header line gives its total and annotated frame
    counts, followed by one indented line per sequence and a blank separator.
    """
    for video_id, video_stats in video_data.items():
        frames_count = video_stats['frames_count']
        frames_with_annot_count = video_stats['frames_with_annot_count']
        print(f'Video {video_id} : {frames_count} frames, {frames_with_annot_count} frames with annotation(s)')
        for sequence_id, sequence_stats in video_stats['sequence'].items():
            frames_count = sequence_stats['frames_count']
            annotations_count = sequence_stats['frames_with_annot_count']
            print(f'  Sequence {sequence_id} : {frames_count} frames, {annotations_count} with annotation(s)')
        # print() appends its own newline, so this leaves two blank lines between videos.
        print('\n')

# Build the per-video / per-sequence frame statistics once; the plots below reuse them.
video_data = get_video_data(df_train)
print_video_data(video_data)

In [None]:
def plot_video_data(video_data):
    """Draw three side-by-side bar charts comparing the videos.

    Panels, left to right: total frames per video, number of sequences per
    video, and number of frames carrying at least one annotation per video.

    Parameters
    ----------
    video_data : dict
        Structure returned by ``get_video_data``.
    """
    # 'seaborn' was removed in matplotlib 3.8; use the renamed style when available.
    plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))
    names = [f'Video {video_id}' for video_id in video_data]
    # (y-label, title, metric extracted from one video's statistics)
    panels = [
        ('frames', 'Frames count per video',
         lambda stats: stats['frames_count']),
        ('sequences', 'Sequences count per video',
         lambda stats: len(stats['sequence'])),
        # This metric counts annotated *frames*, not individual annotations,
        # so the labels say so.
        ('frames with annotation(s)', 'Annotated frames count per video',
         lambda stats: stats['frames_with_annot_count']),
    ]
    for ax, (ylabel, title, metric) in zip(axs, panels):
        ax.bar(names, [metric(stats) for stats in video_data.values()], width=0.3)
        ax.set_ylabel(ylabel)
        ax.set_title(title)

# Compare the videos by frame count, sequence count and annotated-frame count.
plot_video_data(video_data)


### Are the annotations well distributed in the videos?

In [None]:
def plot_frame_data(video_data):
    """Plot, for each video, a stacked bar per sequence: annotated vs. other frames.

    Parameters
    ----------
    video_data : dict
        Structure returned by ``get_video_data``. One subplot is drawn per
        video, in the dict's iteration order.
    """
    if not video_data:
        return
    # squeeze=False always yields a 2-D axes array, so a single video works too.
    fig, axs = plt.subplots(1, len(video_data), figsize=(15, 5), squeeze=False)
    # Pair axes with videos by position; video ids need not be 0..n-1.
    for ax, (video_id, stats) in zip(axs[0], video_data.items()):
        sequences = stats['sequence']
        labels = [f'Seq. {sequence_id}' for sequence_id in sequences]
        annot = [seq['frames_with_annot_count'] for seq in sequences.values()]
        no_annot = [
            seq['frames_count'] - seq['frames_with_annot_count']
            for seq in sequences.values()
        ]
        # Bar width grows with the number of sequences shown in the panel.
        width = 0.5 * len(labels) / 8
        ax.bar(labels, annot, width=width, label='annotation(s)')
        ax.bar(labels, no_annot, width=width, label='no annotation', bottom=annot)
        ax.set_ylabel('frames')
        ax.tick_params(axis='x', labelrotation=90)
        ax.set_title(f'Video {video_id} : annotations count per sequence')
        ax.legend()

# One stacked-bar panel per video: annotated vs. non-annotated frames in each sequence.
plot_frame_data(video_data)

## 4. Annotations

### How many starfish are there per frame?

In [None]:
def get_annotation_count(df_train):
    """Return a sorted copy of ``df_train`` with an 'annots_count' column.

    Rows are ordered by ('video_id', 'sequence', 'sequence_frame').
    'annots_count' is the number of entries in the list encoded by each
    'annotations' string. The input frame is not modified.

    Raises
    ------
    ValueError, SyntaxError
        If an 'annotations' value is not a valid Python literal.
    """
    df_sorted = df_train.sort_values(by=['video_id', 'sequence', 'sequence_frame'])
    # literal_eval only accepts Python literals, so text coming from the CSV
    # can never execute code the way eval() would.
    df_sorted['annots_count'] = df_sorted['annotations'].apply(
        lambda annots: len(ast.literal_eval(annots))
    )
    return df_sorted
    
# Rebind df_train to the sorted copy that carries the new 'annots_count' column.
df_train = get_annotation_count(df_train)
# Number of frames for each distinct per-frame annotation count.
df_annot = df_train['annots_count'].value_counts()

def plot_annot_distrib(df_annot):
    """Bar chart of how many frames carry each annotation count.

    ``df_annot`` is a Series indexed by annotation count whose values are
    frame counts; every index value gets its own tick.
    """
    _, ax = plt.subplots(figsize=(8, 5))
    counts_per_frame = df_annot.index
    ax.bar(counts_per_frame, df_annot, tick_label=counts_per_frame)
    ax.set(xlabel='annotation count per frame', ylabel='frames')
    
# Distribution of the number of annotations per frame over the whole training set.
plot_annot_distrib(df_annot)

### How are the annotations distributed over time?

In [None]:
def get_annotation_time(df_train):
    """Collect the per-frame annotation counts of every sequence, in frame order.

    Returns ``{video_id: {sequence_id: ndarray}}`` where each array holds the
    'annots_count' values of that sequence sorted by 'sequence_frame'.
    ``df_train`` must already contain the 'annots_count' column.
    """
    timelines = {}
    for video_id in df_train['video_id'].unique():
        frames_of_video = df_train.loc[df_train['video_id'] == video_id]
        timelines[video_id] = {
            sequence_id: (
                frames_of_video.loc[frames_of_video['sequence'] == sequence_id]
                .sort_values(by='sequence_frame')['annots_count']
                .values
            )
            for sequence_id in frames_of_video['sequence'].unique()
        }
    return timelines

# Per-sequence series of annotation counts, ordered by frame within each sequence.
annotation_time =  get_annotation_time(df_train)

def plot_annotation_time(annotation_time):
    """Draw one line plot per sequence of annotation count against frame position.

    ``annotation_time`` is the nested ``{video_id: {sequence_id: counts}}``
    mapping built by ``get_annotation_time``; a new figure is created for
    every sequence.
    """
    per_sequence = (
        (video_id, sequence, annot)
        for video_id, sequences in annotation_time.items()
        for sequence, annot in sequences.items()
    )
    for video_id, sequence, annot in per_sequence:
        _, ax = plt.subplots(figsize=(15, 2))
        ax.plot(annot)
        ax.set(xlabel='time (frame)', ylabel='annotation count')
        ax.set_title(f'Video {video_id}, sequence : {sequence}')
            
# One timeline figure per sequence.
plot_annotation_time(annotation_time)

### What is the shape of the bounding box?

In [None]:
def get_annotation_pos_and_size(df_train):
    """Gather every bounding box of each sequence into one array.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Table with the columns 'video_id', 'sequence' and 'annotations',
        the latter holding the string form of a list of dicts.

    Returns
    -------
    dict
        ``{video_id: {sequence_id: ndarray}}``. Each array has one row per
        box, holding the dict's values in their stored order (the plotting
        helpers read columns 0-3 as x, y, width, height). A sequence without
        any box yields an empty array, i.e. ``shape[0] == 0``.

    Raises
    ------
    ValueError, SyntaxError
        If an 'annotations' value is not a valid Python literal.
    """
    annot_data = {}
    for video_id in df_train['video_id'].unique():
        df_video = df_train.loc[df_train['video_id'] == video_id]
        annot_data[video_id] = {}
        for sequence in df_video['sequence'].unique():
            df_sequence = df_video.loc[df_video['sequence'] == sequence]
            # literal_eval parses the stored literal without executing code,
            # unlike eval(); the comprehension flattens the per-frame lists.
            boxes = [
                list(annot.values())
                for raw_annots in df_sequence['annotations']
                for annot in ast.literal_eval(raw_annots)
            ]
            annot_data[video_id][sequence] = np.array(boxes)
    return annot_data

# Per-sequence arrays of bounding boxes, shared by the size and position plots.
annotation_pos_and_size =  get_annotation_pos_and_size(df_train)

def plot_annotation_pos(annotation_pos_and_size):
    """Scatter the first two box columns (x, y) of every annotated sequence.

    One figure is created per sequence that has at least one box; the axes
    are bounded by the frame size stored in ``SIZE``.
    """
    for video_id, sequences in annotation_pos_and_size.items():
        for sequence, annot in sequences.items():
            # Nothing to draw for sequences without any box.
            if annot.shape[0] == 0:
                continue
            _, ax = plt.subplots(figsize=(8, 6))
            box_x, box_y = annot[:, 0], annot[:, 1]
            ax.scatter(box_x, box_y, alpha=0.5)
            ax.set(xlabel='width', ylabel='height')
            ax.set_xbound(0, SIZE[0])
            ax.set_ybound(0, SIZE[1])
            ax.set_title(f'Starfish position (video {video_id}, sequence : {sequence})')
                
def plot_annotation_size(annotation_pos_and_size):
    """Box-plot the bounding-box width, height and width/height ratio per sequence.

    The figure has three rows (width, height, ratio) and one column per
    video; each box summarises one sequence that has at least one annotation.

    Parameters
    ----------
    annotation_pos_and_size : dict
        ``{video_id: {sequence_id: ndarray}}`` as returned by
        ``get_annotation_pos_and_size``; columns 2 and 3 of each array are
        read as width and height.
    """
    if not annotation_pos_and_size:
        return
    # Size the grid from the argument itself rather than a global frame;
    # squeeze=False keeps axs 2-D even when there is a single video.
    fig, axs = plt.subplots(3, len(annotation_pos_and_size), figsize=(15, 15), squeeze=False)
    for idx, (video_id, sequences) in enumerate(annotation_pos_and_size.items()):
        annotated = {
            sequence: annot for sequence, annot in sequences.items() if annot.shape[0] != 0
        }
        sequence_id = list(annotated)
        width = [annot[:, 2] for annot in annotated.values()]
        height = [annot[:, 3] for annot in annotated.values()]
        ratio_wh = [w / h for w, h in zip(width, height)]
        # (values, y-label, title) for each row of this video's column.
        panels = [
            (width, 'width', f'Bounding box width (video {video_id})'),
            (height, 'height', f'Bounding box height (video {video_id})'),
            (ratio_wh, 'ratio', f'Bounding box ratio width / height (video {video_id})'),
        ]
        for row, (values, ylabel, title) in enumerate(panels):
            ax = axs[row, idx]
            ax.boxplot(values, labels=sequence_id)
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', labelrotation=90)
            ax.set_xlabel('sequence')
            ax.set_title(title)
    fig.tight_layout()

# Grid of box plots: rows are width / height / ratio, columns are videos.
plot_annotation_size(annotation_pos_and_size)


### Where are the bounding boxes?

In [None]:
# One scatter plot of box (x, y) positions per annotated sequence.
plot_annotation_pos(annotation_pos_and_size)

## 5. Finally: sample frames

In [None]:
def get_sample_frames(df_train, random_state=None):
    """Pick one random frame per sequence, preferring annotated frames.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Table with the columns 'video_id', 'sequence' and 'annotations'.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the selection can be made
        reproducible. ``None`` (default) keeps the unseeded behaviour.

    Returns
    -------
    list of pandas.DataFrame
        One single-row frame per (video, sequence), in order of first
        occurrence. The row has a non-empty annotation whenever the sequence
        contains one; otherwise any row of the sequence is drawn.
    """
    samples = []
    for video_id in df_train['video_id'].unique():
        df_video = df_train.loc[df_train['video_id'] == video_id]
        for sequence in df_video['sequence'].unique():
            df_sequence = df_video.loc[df_video['sequence'] == sequence]
            df_annotated = df_sequence.loc[df_sequence['annotations'] != '[]']
            # Explicit emptiness check instead of catching the error that
            # sample() raises on an empty frame (a bare except would also
            # hide unrelated failures).
            candidates = df_annotated if len(df_annotated) > 0 else df_sequence
            samples.append(candidates.sample(1, random_state=random_state))
    return samples

def process_frame(sample):
    """Load the image and bounding boxes described by a single-row sample.

    Parameters
    ----------
    sample : pandas.DataFrame
        Single-row frame with the columns 'video_id', 'video_frame',
        'sequence' and 'annotations' (as produced by ``get_sample_frames``).

    Returns
    -------
    tuple
        ``(frame, bboxs, video_id, sequence)`` where ``frame`` is the image as
        an array and ``bboxs`` has one row per box holding the annotation
        dict's values in stored order. ``bboxs`` is an empty array when the
        frame has no box or its annotation string cannot be parsed, so it can
        always be iterated.
    """
    # frame
    video_id = sample['video_id'].values[0]
    frame_number = sample['video_frame'].values[0]
    sequence = sample['sequence'].values[0]
    img_path = os.path.join(data_path, 'train_images', f'video_{video_id}', f'{frame_number}.jpg')
    # Copy the pixels into an array, then release the file handle.
    with Image.open(img_path) as img:
        frame = np.array(img)
    # bounding boxes
    try:
        # literal_eval parses the stored literal without executing code.
        annotations = ast.literal_eval(sample['annotations'].values[0])
        bboxs = np.array([list(annotation.values()) for annotation in annotations])
    except (ValueError, SyntaxError, TypeError, AttributeError):
        # Unparseable or malformed annotation: treat the frame as having no box.
        bboxs = np.array([])
    return frame, bboxs, video_id, sequence

def display_frame(frame, bboxs, video_id, sequence):
    """Show a frame with its bounding boxes outlined in red.

    Parameters
    ----------
    frame : array-like
        Image passed to ``imshow``.
    bboxs : iterable or None
        Boxes read as ``(x, y, width, height)``; ``None`` or an empty
        iterable draws the frame without any box.
    video_id, sequence
        Identifiers shown in the figure title.
    """
    # 'seaborn-dark' was removed in matplotlib 3.8; use the renamed style when available.
    plt.style.use('seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark')
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    # frame
    ax.imshow(frame)
    ax.set_ylabel('height')
    ax.set_xlabel('width')
    ax.set_xbound(0, SIZE[0])
    ax.set_ybound(0, SIZE[1])
    ax.set_title(f'Frame sample from video {video_id}, sequence : {sequence}')
    # bounding boxes
    for bbox in ([] if bboxs is None else bboxs):
        rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth=3, edgecolor='r', facecolor='none')
        ax.add_patch(rect)

# One sampled frame per sequence of every video.
sample_frames = get_sample_frames(df_train)

# Load each sampled frame and draw it together with its bounding boxes.
for sample in sample_frames:
    frame, bboxs, video_id, sequence = process_frame(sample)
    display_frame(frame, bboxs, video_id, sequence)