In [None]:
# Install nb-black with pip's stdout discarded, then load the `lab_black`
# IPython extension (presumably shipped by that package -- TODO confirm).
!pip install nb-black > /dev/null
%load_ext lab_black

In [None]:
import ast
from itertools import cycle

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Rectangle

# Apply the "ggplot" style to all figures created from here on.
plt.style.use("ggplot")
# Colours of the active property cycle: once as a plain list, once as an
# endless iterator.
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(plt.rcParams["axes.prop_cycle"].by_key()["color"])

In [None]:
# Read the train table, the test table and the example submission file.
train = pd.read_csv("../input/tensorflow-great-barrier-reef/train.csv")
test = pd.read_csv("../input/tensorflow-great-barrier-reef/test.csv")
ss = pd.read_csv("../input/tensorflow-great-barrier-reef/example_sample_submission.csv")

# Display (rows, columns) of the train and test tables as the cell output.
train.shape, test.shape

# Add Additional Columns

* `sum_cots`: Number of annotations in the frame
* `video_sequence`: `video_id` + '_' + `sequence`

In [None]:
# `annotations` holds a Python list literal stored as text. Parse it with
# `ast.literal_eval`, which only accepts literals, instead of `eval`, which
# would execute any code found in the CSV. The list length is the number of
# annotations in the frame.
train["sum_cots"] = train["annotations"].apply(
    lambda text: len(ast.literal_eval(text))
)
# Key built as "<video_id>_<sequence>".
train["video_sequence"] = (
    train["video_id"].astype("str") + "_" + train["sequence"].astype("str")
)

# How Naive GroupKFold Creates Unbalanced Folds

In [None]:
def plot_folds(df):
    """Draw per-fold bar charts for frame-level data and return the summary.

    `df` needs a `fold_id` and a `sum_cots` column, one row per frame.
    The returned frame is indexed by `fold_id` and holds `sum_cots` (sum over
    the fold), `duration` (number of rows in the fold) and `mean_cots`
    (`sum_cots / duration`).
    """
    plt.style.use('ggplot')
    stats = df.copy().groupby('fold_id').agg(
        sum_cots=('sum_cots', 'sum'),
        duration=('fold_id', 'count'),
    )
    stats['mean_cots'] = stats['sum_cots'] / stats['duration']

    # One bar chart per statistic, laid out side by side.
    _, panels = plt.subplots(1, 3, figsize=(15, 4))
    layout = (
        ('sum_cots', '#COTS'),
        ('duration', '#Frames'),
        ('mean_cots', '#COTS/frame'),
    )
    for panel, (column, title) in zip(panels, layout):
        stats[column].plot(kind='bar', ax=panel)
        panel.set_title(title)
    return stats

In [None]:
from sklearn.model_selection import GroupKFold


def allocate_group_k_fold(df, n_split):
    """Assign every row to a validation fold using scikit-learn's GroupKFold.

    Rows are grouped by their `video_sequence` value, which is passed to
    `GroupKFold.split` as `groups`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `video_sequence` column. Any index is accepted.
    n_split : int
        Number of folds, forwarded to `GroupKFold(n_splits=...)`.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with an integer `fold_id` column holding the index of
        the fold in which the row is part of the validation set.
    """
    df = df.copy()
    kf = GroupKFold(n_splits=n_split)
    # `split` yields *positional* indices. Collect them in a positional array
    # rather than writing through label-based `.loc`, which is only correct
    # when the frame's index happens to be 0..n-1.
    fold_ids = np.full(len(df), -1, dtype="int64")
    for fold, (_, val_idx) in enumerate(kf.split(df, groups=df.video_sequence)):
        fold_ids[val_idx] = fold
    df['fold_id'] = fold_ids
    return df

In [None]:
n_split = 5
# Select only annotated frames. reset_index() renumbers the rows 0..n-1 and
# keeps the previous labels in an "index" column.
df = train.query('sum_cots > 0').reset_index()
group_k_alloc_df = allocate_group_k_fold(df, n_split)
# From here on `df` is the per-fold summary table returned by the plot helper.
df = plot_folds(group_k_alloc_df)
plt.suptitle('Visualization of Statistics of each Folds - GroupKFold', fontsize=16)
df, df.std()

In the above graph, the number of frames is almost perfectly balanced across folds, but **the number of COTS is poorly balanced**.

# Splitting Sequences into Tiny Sub-Sequences

Hypothesis:

* Two consecutive annotated frames may include the same individual COTS.
* Two frames in the same sequence do not include the same individual COTS when they are separated by non-annotated frames. In other words, each individual COTS is assumed to be tracked (annotated) over consecutive frames.

So I introduced a simple sequence-splitting algorithm:
1. For each sequence, label a frame as 'annotated' when it contains one or more annotations.
2. Split the sequence into parts made of consecutive 'annotated' frames.

In [None]:
df = train.copy()

# Flag frames: 1 when the frame has at least one annotation, 0 otherwise.
df["annotated"] = df["sum_cots"].clip(upper=1)

dfs = []

# Within each video, collapse every run of consecutive rows sharing the same
# `annotated` flag into a single summary row (first/last frame, COTS totals).
# NOTE(review): runs are formed per video, not per `sequence`, so a run could
# span a sequence boundary if both neighbouring frames are annotated -- confirm
# this is intended.
for i, d in df.groupby("video_id"):
    ad = d.groupby((d["annotated"] != d["annotated"].shift()).cumsum(), as_index=False)[
        ["video_frame", "annotated", "sum_cots"]
    ].agg(
        annotated=("annotated", "first"),
        start_frame=("video_frame", "first"),
        end_frame=("video_frame", "last"),
        sum_cots=("sum_cots", "sum"),
        mean_cots=("sum_cots", "mean"),
    )
    ad["video_id"] = i
    dfs.append(ad)

df_annot = pd.concat(dfs)
df_annot["duration"] = df_annot["end_frame"] - df_annot["start_frame"] + 1

# Keep only the annotated runs. Building a fresh frame here (instead of
# mutating the `query` result in place) avoids SettingWithCopy problems and
# actually applies the index reset.
sub_sequence = (
    df_annot.query("annotated == 1")
    .drop(columns="annotated")
    .reset_index(drop=True)
)

# A row continues the previous sub-sequence only when it belongs to the same
# video and starts on the frame right after the previous row ended. The
# comparison must use the *previous* row's values; comparing a row with itself
# would make the test meaningless. Every other row opens a new id.
prev_video_id = sub_sequence["video_id"].shift()
prev_end_frame = sub_sequence["end_frame"].shift()
continues_previous = (prev_video_id == sub_sequence["video_id"]) & (
    prev_end_frame + 1 == sub_sequence["start_frame"]
)
# Ids start at 0: the first row never continues anything, so the cumulative
# count of "new sub-sequence" rows is 1 there.
sub_sequence["sub_sequence_id"] = (~continues_previous).cumsum() - 1
sub_sequence

# Visualize Sub-Sequences

In [None]:
import matplotlib.patches as mpatches


# One panel per video, stacked vertically and sharing both axes. The panel
# count follows the data instead of being fixed, and panels are looked up by
# video id rather than by assuming the ids are 0..n-1.
video_ids = sorted(df["video_id"].unique())
fig, axes = plt.subplots(
    len(video_ids), 1, figsize=(15, 8), sharex=True, sharey=True, squeeze=False
)
axes = axes.ravel()
ax_by_video = dict(zip(video_ids, axes))

# Per-frame annotation count, scaled to [0, 1] by the global maximum, drawn as
# one thin black line per (video, sequence).
max_annotation = df["sum_cots"].max()
for _, d in df.groupby(["video_id", "sequence"]):
    video_id = d["video_id"].values[0]
    ax = ax_by_video[video_id]
    (d.set_index("video_frame")["sum_cots"] / max_annotation).plot(
        ax=ax, c="black", linewidth=0.5
    )

    ax.set_title(f"Video ID: {video_id}")


# Shade every sub-sequence on its video's panel. Fields are read by column name
# so the loop does not depend on the column order of `sub_sequence`.
for row in sub_sequence.itertuples(index=False):
    rect = mpatches.Rectangle(
        (row.start_frame, 0), row.duration, 1, alpha=0.3, facecolor='red'
    )
    ax_by_video[int(row.video_id)].add_patch(rect)

fig.suptitle("Sub-Sequences Visualized", fontsize=15)
plt.tight_layout()
plt.show()

# Fold Split Algorithm (Sub-Sequence Allocation)

To balance the statistics of the folds (number of COTS, number of frames, etc.), I used a simple round-robin algorithm to allocate sub-sequences to the folds:

1. Sort the sub-sequences in descending order of their total number of COTS.
2. Allocate the sorted sub-sequences to the folds in turn.

In [None]:
def allocate_fold(df, n_split, key="sum_cots"):
    """Deal rows out to folds round-robin after sorting by `key` (descending).

    The row with the largest `key` goes to fold 0, the next to fold 1, and so
    on, wrapping around after fold `n_split - 1`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to split; it is not modified. Any index is accepted, including
        one with repeated labels.
    n_split : int
        Number of folds, must be at least 1.
    key : str, default "sum_cots"
        Column used for the descending sort.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of `df` with an integer `fold_id` column in
        `0 .. n_split - 1`.

    Raises
    ------
    AssertionError
        If `key` is not a column of `df`.
    ValueError
        If `n_split` is smaller than 1.
    """
    assert key in df.columns
    if n_split < 1:
        raise ValueError(f"n_split must be at least 1, got {n_split}")
    # sort_values returns a new frame, so the caller's table stays untouched.
    df = df.sort_values(key, ascending=False)
    # Assign by position in the sorted order. Writing through index labels
    # would hit every row that shares a label when the index is not unique.
    df["fold_id"] = np.arange(len(df), dtype="int64") % n_split
    return df

In [None]:
def plot_folds_sub_sequence(df):
    """Draw per-fold bar charts for sub-sequence data and return the summary.

    `df` needs `fold_id`, `sum_cots` and `duration` columns, one row per
    sub-sequence. The returned frame is indexed by `fold_id` and holds the
    summed `sum_cots`, the summed `duration` and their ratio `mean_cots`.
    """
    plt.style.use('ggplot')
    stats = df.copy().groupby('fold_id').agg(
        sum_cots=('sum_cots', 'sum'),
        duration=('duration', 'sum'),
    )
    stats['mean_cots'] = stats['sum_cots'] / stats['duration']

    # One bar chart per statistic, laid out side by side.
    _, panels = plt.subplots(1, 3, figsize=(15, 5))
    layout = (
        ('sum_cots', '#COTS'),
        ('duration', '#Frames'),
        ('mean_cots', '#COTS/frame'),
    )
    for panel, (column, title) in zip(panels, layout):
        stats[column].plot(kind='bar', ax=panel)
        panel.set_title(title)

    return stats

In [None]:
# Deal the sub-sequences out to 5 folds, plot the per-fold statistics and show
# the summary table together with its column-wise standard deviation.
df = plot_folds_sub_sequence(allocate_fold(sub_sequence, n_split=5))
plt.suptitle('Statistics of Folds by Round-Robin Algorithm', fontsize=16)
plt.tight_layout()
df, df.std()

By splitting into sub-sequences and applying the sub-sequence allocation algorithm, the standard deviations of #COTS and #COTS/frame decrease, which means the imbalance is mitigated.

* std(#COTS): 955 -> 787
* std(#COTS/frame): 0.89 -> 0.42

## Find Optimal Number of Split

Since the algorithm introduced above depends on the number of splits, I searched for the number of splits that gives the most balanced folds.

In [None]:
def calc_split_statistics(sub_sequence, n_split):
    """Return per-fold totals for a round-robin split into `n_split` folds.

    The sub-sequences are allocated with `allocate_fold`. The result is
    indexed by `fold_id` and holds the summed `sum_cots`, the summed
    `duration` and their ratio `mean_cots`.
    """
    per_fold = (
        allocate_fold(sub_sequence, n_split)
        .groupby('fold_id')
        .agg(sum_cots=('sum_cots', 'sum'), duration=('duration', 'sum'))
    )
    per_fold['mean_cots'] = per_fold['sum_cots'] / per_fold['duration']
    return per_fold


# For every candidate fold count, record the across-fold standard deviation of
# each statistic.
n_splits = np.arange(3, 11)
deviations = {"sum_cots": [], "duration": [], "mean_cots": []}
for candidate in n_splits:
    spread = calc_split_statistics(sub_sequence, candidate).std()
    for name, value in spread.items():
        deviations[name].append(value)

# One line chart per statistic: standard deviation against fold count.
fig, ax = plt.subplots(1, 3, figsize=(15, 4))
for panel, (name, values) in zip(ax, deviations.items()):
    panel.plot(n_splits, values, label=name)
    panel.set_ylim(bottom=0)
    panel.set_title(name)
    panel.set_xlabel("n_splits")
plt.suptitle("Standard Deviation vs. #Splits", fontsize=15)

In terms of minimizing standard deviation of #COTS, the best balanced setting is `n_split == 4`.

# Visualization of Folds in Video Frames

In [None]:
import matplotlib.patches as mpatches


df = train.copy()

# One panel per video, stacked vertically and sharing both axes. The panel
# count follows the data instead of being fixed, and panels are looked up by
# video id rather than by assuming the ids are 0..n-1.
video_ids = sorted(df["video_id"].unique())
fig, axes = plt.subplots(
    len(video_ids), 1, figsize=(15, 8), sharex=True, sharey=True, squeeze=False
)
axes = axes.ravel()
ax_by_video = dict(zip(video_ids, axes))

# Per-frame annotation count, scaled to [0, 1] by the global maximum, drawn as
# one thin black line per (video, sequence).
max_annotation = df["sum_cots"].max()
for _, d in df.groupby(["video_id", "sequence"]):
    video_id = d["video_id"].values[0]
    ax = ax_by_video[video_id]
    (d.set_index("video_frame")["sum_cots"] / max_annotation).plot(
        ax=ax, c="black", linewidth=0.5
    )

    ax.set_title(f"Video ID: {video_id}")


n_split = 4
df = allocate_fold(sub_sequence, n_split)
oof_colors = ["red", "blue", "green", "yellow"]
# Each fold id indexes into the palette, so it must offer one colour per fold.
assert len(oof_colors) >= n_split, "oof_colors needs one colour per fold"

# Shade every sub-sequence on its video's panel, coloured by fold. Fields are
# read by column name so the loop does not depend on the column order of `df`.
for row in df.itertuples(index=False):
    rect = mpatches.Rectangle(
        (row.start_frame, 0),
        row.duration,
        1,
        alpha=0.3,
        facecolor=oof_colors[int(row.fold_id)],
    )
    ax_by_video[int(row.video_id)].add_patch(rect)

fig.suptitle("Fold-Splitted Sub-Sequences Visualized", fontsize=15)
plt.tight_layout()
plt.show()

# Merge Fold Column to the Train Metadata 

In [None]:
# Attach the sub-sequence / fold columns to every train frame that lies inside
# a sub-sequence. Frames outside all sub-sequences are left out.

n_split = 4
alloc_df = allocate_fold(sub_sequence, n_split)

dfs = []
for video_id, frames in train.groupby("video_id"):
    spans = alloc_df[alloc_df["video_id"] == video_id]
    # Columns present in both tables (`video_id`, `sum_cots`) are taken from
    # the train table, so the merged `sum_cots` is the per-frame count.
    spans = spans.drop(columns=[c for c in spans.columns if c in frames.columns])

    # frames x spans boolean matrix: True where the frame number falls inside
    # the span's [start_frame, end_frame] interval.
    frame_no = frames["video_frame"].to_numpy()[:, None]
    inside = (frame_no >= spans["start_frame"].to_numpy()) & (
        frame_no <= spans["end_frame"].to_numpy()
    )
    frame_pos, span_pos = np.nonzero(inside)

    # Pair the matching rows side by side. Working on DataFrames (instead of
    # stacking raw `.values`) keeps every column's dtype, so no integer
    # re-casting is needed afterwards.
    dfs.append(
        pd.concat(
            [
                frames.iloc[frame_pos].reset_index(drop=True),
                spans.iloc[span_pos].reset_index(drop=True),
            ],
            axis=1,
        )
    )

df = pd.concat(dfs, ignore_index=True)

df.to_csv("train_metadata_ext.csv", index=False)
df.head(3)