# First Look at the Starfish Dataset

This notebook was created during a live coding session on Twitch. Follow here: https://www.twitch.tv/medallionstallion_/

In [None]:
# Install nb-black (pip's stdout is discarded) and load its `lab_black` extension.
!pip install nb-black > /dev/null
%load_ext lab_black

In [None]:
import ast
import os
import subprocess
from itertools import cycle

import cv2
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from cv2 import VideoWriter, VideoWriter_fourcc
from IPython.display import Video
from matplotlib.patches import Rectangle
from tqdm.notebook import tqdm


# Apply the ggplot theme, then keep its colour sequence both as a plain list
# and as an endless iterator for charts that need "the next colour".
plt.style.use("ggplot")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(color_pal)

In [None]:
# Read the three competition tables: training metadata, test metadata and
# the example submission.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tensorflow-great-barrier-reef/train.csv",
        "../input/tensorflow-great-barrier-reef/test.csv",
        "../input/tensorflow-great-barrier-reef/example_sample_submission.csv",
    )
)

# Displayed as the cell output: (rows, columns) of train and test.
train.shape, test.shape

## Example of using the submission package.
The `greatbarrierreef` package loops over the test set. We have to predict each sample before we can see the next. This will impact how we must design our model.

In [None]:
import greatbarrierreef

env = greatbarrierreef.make_env()  # initialize the environment
iter_test = (
    env.iter_test()
)  # an iterator which loops over the test set and sample submission
# Only the first (pixel_array, sample_prediction_df) pair is pulled from the
# iterator: the `break` exits straight away, leaving both names bound so the
# next cell can display `pixel_array`.
for (pixel_array, sample_prediction_df) in iter_test:
    break
    # NOTE(review): the statements below are unreachable because of the
    # `break` above, so no prediction is ever registered by this cell. They
    # appear to be kept as a template for a real prediction loop.
    sample_prediction_df[
        "annotations"
    ] = "0.5 0 0 100 100"  # make your predictions here
    env.predict(sample_prediction_df)  # register your predictions

## Example Image. Can you see the starfish?

In [None]:
# Show the frame pulled from the test iterator, with matplotlib's default
# style and the axes hidden.
plt.style.use("default")
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title("Example Image from the Barrier Reef Dataset", fontsize=14)
ax.axis("off")
ax.imshow(pixel_array)
plt.show()

# Training Metadata
- `video_id` - Unique per video. We only have 3 videos in the training dataset.
- `sequence` - random number to identify a group within the video of uncut footage.
- `video_frame` - frame number within the entire video
- `sequence_frame` - frame number within the sequence (shot clip from within the video)
- `image_id` - a combination of video_id + video_frame. Links to the image in the training images directory.
- `annotations` - bounding boxes for starfish within the given frame.

## Example of Sequences within the training videos

- Do we actually have 3 different videos?
- Are the videos just subsets of a single long video?

In [None]:
# One panel per training video, one line per sequence, showing how the
# per-sequence frame counter evolves over the frames of the video.
plt.style.use("ggplot")
fig, axs = plt.subplots(3, 1, figsize=(15, 10), sharex=True, sharey=True)

for video in [0, 1, 2]:
    for sequence, d in train.query("video_id == @video").groupby("sequence"):
        # Index by video_frame so the x-axis really is the video frame number.
        # Plotting the bare column would use the DataFrame's row index, which
        # runs across all videos and does not match the "Video Frame" label.
        d.set_index("video_frame")["sequence_frame"].plot(
            ax=axs[video], label=f"Sequence {sequence}"
        )
    axs[video].set_title(f"Video {video}: Sequence Frame vs Video Frame")
    axs[video].set_xlabel("Video Frame")
    axs[video].set_ylabel("Sequence Frame")
    # Anchor the legend outside the axes so it does not cover the lines.
    axs[video].legend(bbox_to_anchor=(1, 1), loc="upper left")
plt.show()

# How many Annotations per Frame?
- Is it different in each video?
- Is it different in each sequence within a video?

In [None]:
# The `annotations` column holds each frame's boxes as the text of a Python
# list. Parse it with ast.literal_eval rather than eval so that only literals
# are accepted and no code embedded in the CSV can ever be executed.
train["n_annotations"] = train["annotations"].apply(
    lambda x: len(ast.literal_eval(x))
)
# Combined "<video_id>_<sequence>" key used to group the charts below.
train["video_sequence"] = (
    train["video_id"].astype("str") + "_" + train["sequence"].astype("str")
)

# Largest sequence_frame value reached in each sequence, shortest first.
ax = (
    train.groupby(["video_sequence"])["sequence_frame"]
    .max()
    .sort_values()
    .plot(kind="barh", figsize=(12, 7), title="Length of Sequences")
)
ax.set_xlabel("Number of Frames in the Sequence")
plt.show()

In [None]:
# Two horizontal bar charts per video sequence: the mean annotation count
# (left) and the summed annotation count (right), each sorted ascending.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
mean_ax, total_ax = axs
annotations_by_sequence = train.groupby(["video_sequence"])["n_annotations"]

annotations_by_sequence.mean().sort_values().plot(
    kind="barh", title="Avg of Annotations", ax=mean_ax, color=next(color_cycle)
)

annotations_by_sequence.sum().sort_values().plot(
    kind="barh", title="Total Annotations", ax=total_ax, color=next(color_cycle)
)
plt.show()

In [None]:
# One panel per video (panels are indexed by video_id, so ids 0-2 are
# assumed, as elsewhere in this notebook); one line per sequence.
fig, axs = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
for (video_id, sequence), d in train.groupby(["video_id", "sequence"]):
    d.set_index("sequence_frame")["n_annotations"].plot(
        ax=axs[video_id], label=f"Sequence {sequence}"
    )
# Each panel holds several sequences, so the title names only the video and
# the sequences are identified in the legend. Setting the title inside the
# loop above would leave just the last sequence's name on every panel.
for video_id, panel in enumerate(axs):
    panel.set_title(f"Video ID: {video_id}")
    panel.set_xlabel("Sequence Frame")
    panel.legend(fontsize=8)
fig.suptitle("Number of Annotations per Frame for each Sequence")
plt.show()

# Examples of Annotations
The function below plots an image together with its annotations.

In [None]:
# Return to matplotlib's "default" style for the image plots (the chart cells above use "ggplot").
plt.style.use("default")


def plot_reef_image(
    image_id,
    df,
    ax=None,
    show_annotations=True,
    line_width=1,
    line_color="red",
    figsize=(30, 5),
    image_dir="../input/tensorflow-great-barrier-reef/train_images/",
):
    """
    Plot a reef image, optionally outlining each annotated starfish.

    Parameters
    ----------
    image_id
        Value looked up in ``df["image_id"]``; the first matching row is used.
    df
        DataFrame with ``image_id``, ``video_id``, ``video_frame`` and
        ``annotations`` columns. ``annotations`` must hold the text of a
        Python list of dicts with ``x``, ``y``, ``width`` and ``height`` keys.
    ax
        Axes to draw on. A new figure of size ``figsize`` is created when None.
    show_annotations
        When True, draw one unfilled rectangle per annotation.
    line_width, line_color
        Line width and edge colour of the rectangles.
    figsize
        Size of the figure created when ``ax`` is None.
    image_dir
        Directory containing the ``video_<id>/<frame>.jpg`` images.

    Returns
    -------
    The axes the image was drawn on.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested ``image_id``.
    """

    example = df.query("image_id == @image_id")
    if example.empty:
        # Fail with a message that names the missing id instead of the
        # opaque "index 0 is out of bounds" raised by `.values[0]`.
        raise IndexError(f"image_id {image_id!r} not found in df")

    video = example["video_id"].values[0]
    frame = example["video_frame"].values[0]
    # literal_eval only accepts Python literals, so text from the CSV can
    # never be executed as code (unlike eval).
    annotations = ast.literal_eval(example["annotations"].values[0])

    img = plt.imread(f"{image_dir}video_{video}/{frame}.jpg")
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)
    ax.imshow(img)
    ax.axis("off")

    n_annotations = len(annotations)
    ax.set_title(f"image_id: {image_id} ({n_annotations} Starfish)", fontsize=12)

    if show_annotations:
        for a in annotations:
            # (x, y) is passed as the rectangle's anchor corner.
            ax.add_patch(
                Rectangle(
                    (a["x"], a["y"]),
                    a["width"],
                    a["height"],
                    lw=line_width,
                    facecolor="none",
                    edgecolor=line_color,
                )
            )

    return ax

In [None]:
# Showcase the frame that ends up last when sorting by annotation count,
# i.e. one with the highest number of annotations.
image_id = train.sort_values("n_annotations")["image_id"].iloc[-1]
ax = plot_reef_image(image_id, train, line_color="red")

# Plot A Bunch of Random Images with Annotations

In [None]:
# Draw 8 reproducibly sampled training frames on a 2x4 grid.
fig, axs = plt.subplots(2, 4, figsize=(20, 10))
axs = axs.flatten()
image_ids = train.sample(8, random_state=529)["image_id"].values

for panel, image_id in zip(axs, image_ids):
    plot_reef_image(image_id, train, ax=panel)
plt.tight_layout()
plt.show()

## Examples with >= 5 Annotations

In [None]:
# Same 2x4 grid, restricted to frames with at least five annotations.
fig, axs = plt.subplots(2, 4, figsize=(20, 10))
axs = axs.flatten()
crowded_frames = train.query("n_annotations >= 5")
image_ids = crowded_frames.sample(8, random_state=529)["image_id"].values

for panel, image_id in zip(axs, image_ids):
    plot_reef_image(image_id, train, ax=panel)
plt.tight_layout()
plt.show()

## Examples with 0 annotations

In [None]:
# Same 2x4 grid, restricted to frames that have no annotations at all.
fig, axs = plt.subplots(2, 4, figsize=(20, 10))
axs = axs.flatten()
empty_frames = train.query("n_annotations == 0")
image_ids = empty_frames.sample(8, random_state=529)["image_id"].values

for panel, image_id in zip(axs, image_ids):
    plot_reef_image(image_id, train, ax=panel)
plt.tight_layout()
plt.show()

# Create a video by merging images

In [None]:
def add_annotations(img, annotations, color="red", thickness=3):
    """
    Draws annotation boxes onto an image using cv2.

    img: image handed to ``cv2.rectangle``; the same object is returned.
    annotations: [list] of dictionaries with the annotation details
        (``x``, ``y``, ``width`` and ``height`` keys).
    color: "red", "black", or a sequence of channel values used directly as
        the box colour (channel order is BGR, e.g. red is ``(0, 0, 255)``).
    thickness: line thickness passed to ``cv2.rectangle``.

    Raises ValueError when ``color`` is a name other than "red" or "black".
    """
    named_colors = {
        "red": (0, 0, 255),
        "black": (0, 0, 0),
    }
    if isinstance(color, str):
        # An unknown name used to leave `box_color` unbound, which only
        # surfaced as an UnboundLocalError once a box was drawn.
        if color not in named_colors:
            raise ValueError(
                f"Unknown color {color!r}; use one of {sorted(named_colors)} "
                "or pass a BGR tuple"
            )
        box_color = named_colors[color]
    else:
        box_color = tuple(color)

    for a in annotations:
        # cv2 expects integer pixel coordinates for the two corners.
        left, top = int(a["x"]), int(a["y"])
        right, bottom = left + int(a["width"]), top + int(a["height"])
        cv2.rectangle(
            img,
            (left, top),
            (right, bottom),
            box_color,
            thickness=thickness,
        )

    return img


def create_reef_video(
    train,
    video_id,
    start_video_frame,
    end_video_frame,
    annotate=True,
    output_filename="./test.mp4",
    FPS=30,
    image_dir="../input/tensorflow-great-barrier-reef/train_images/",
    width=1280,
    height=720,
):
    """
    Stitch a range of frames from one training video into an mp4 file.

    Frames are first written with cv2's ``mp4v`` writer to a temporary file,
    which is then re-encoded to ``output_filename`` with ffmpeg/libx264
    (presumably so the clip plays in the notebook's ``Video`` widget). The
    temporary file is removed afterwards, even on failure.

    Parameters
    ----------
    train
        DataFrame with ``video_id``, ``video_frame`` and ``annotations``
        columns. Frames are written in the order the rows appear.
    video_id
        Video to take frames from.
    start_video_frame, end_video_frame
        Inclusive ``video_frame`` range to include.
    annotate
        When True, draw each row's annotation boxes on its frame.
    output_filename
        Path of the final video; an existing file is overwritten.
    FPS
        Frame rate of the written video.
    image_dir
        Directory containing the ``video_<id>/<frame>.jpg`` images.
    width, height
        Frame size the writer is opened with; every image must match it.

    Returns
    -------
    ``output_filename``.

    Raises
    ------
    ValueError
        If no rows match the requested range, or an image has a different
        size from ``(width, height)``.
    FileNotFoundError
        If an image cannot be read.
    subprocess.CalledProcessError
        If ffmpeg exits with a non-zero status.
    """
    subset_df = (
        train.query(
            "video_id == @video_id and video_frame >= @start_video_frame and video_frame <= @end_video_frame"
        )
        .reset_index(drop=True)
        .copy()
    )
    if subset_df.empty:
        # Nothing to encode: fail before creating any files.
        raise ValueError(
            f"No frames found for video_id={video_id} between frames "
            f"{start_video_frame} and {end_video_frame}"
        )

    # Strip only the extension; str.replace would remove ".mp4" anywhere in
    # the path.
    root, _ = os.path.splitext(output_filename)
    temp_fn = f"{root}_temp.mp4"

    fourcc = VideoWriter_fourcc(*"mp4v")
    video_file = VideoWriter(temp_fn, fourcc, float(FPS), (width, height))

    try:
        try:
            for _, example in tqdm(subset_df.iterrows(), total=len(subset_df)):
                video = example["video_id"]
                frame = example["video_frame"]
                image_fn = f"{image_dir}video_{video}/{frame}.jpg"
                img = cv2.imread(image_fn)
                if img is None:
                    # cv2.imread signals an unreadable file by returning None.
                    raise FileNotFoundError(f"Could not read image: {image_fn}")
                if img.shape[:2] != (height, width):
                    # The writer was opened for a fixed frame size.
                    raise ValueError(
                        f"{image_fn} is {img.shape[1]}x{img.shape[0]}, "
                        f"expected {width}x{height}"
                    )
                if annotate:
                    # literal_eval parses the annotation text without
                    # executing code from the CSV (unlike eval).
                    annotations = ast.literal_eval(example["annotations"])
                    img = add_annotations(img, annotations)
                video_file.write(img)
        finally:
            # Always flush and close the writer, even if a frame failed.
            video_file.release()

        subprocess.run(
            [
                "ffmpeg",
                "-y",  # overwrite an existing output so re-running the cell works
                "-loglevel",
                "error",
                "-i",
                temp_fn,
                "-crf",
                "18",
                "-preset",
                "veryfast",
                "-vcodec",
                "libx264",
                output_filename,
            ],
            check=True,  # surface ffmpeg failures instead of returning a bad path
        )
    finally:
        if os.path.exists(temp_fn):
            os.remove(temp_fn)

    return output_filename

In [None]:
# Render annotated frames 9090-9172 of video 1, then embed the clip inline.
example_1_path = create_reef_video(
    train,
    video_id=1,
    start_video_frame=9090,
    end_video_frame=9172,
    annotate=True,
    output_filename="example-1.mp4",
)
Video(example_1_path, width=800)

In [None]:
# Render annotated frames 5600-5800 of video 2, then embed the clip inline.
example_2_path = create_reef_video(
    train,
    video_id=2,
    start_video_frame=5600,
    end_video_frame=5800,
    annotate=True,
    output_filename="example-2.mp4",
)
Video(example_2_path, width=900)

In [None]:
# Render annotated frames 4500-4700 of video 0, then embed the clip inline.
example_3_path = create_reef_video(
    train,
    video_id=0,
    start_video_frame=4500,
    end_video_frame=4700,
    annotate=True,
    output_filename="example-3.mp4",
)
Video(example_3_path, width=900)

# Create Full Annotated Videos

In [None]:

# Build one annotated video per video_id, spanning its lowest to highest
# video_frame.
for video, data in train.groupby("video_id"):
    print(f'======== Creating Annotated Video {video} ========')
    start_frame, end_frame = data["video_frame"].agg(["min", "max"])
    create_reef_video(
        train,
        video_id=video,
        start_video_frame=start_frame,
        end_video_frame=end_frame,
        annotate=True,
        output_filename=f"full_video{video}_annotated.mp4",
    )