このコンペはアノテーションのついたデータが意外に少ない。
GANで新たな画像を水増しすることで、Recallをあげられないか？という取り組み。
論文を見るとdetectionの精度はあまり変わっていないように見えるが、[1] で「GANのモデルをPre Trainすることで学習時間を短縮した」とあるので、[1]のモデルに流用することができるかもしれない。

ここでは、歩行者の擬似画像生成に使われた Pedestrian-Synthesis-GAN [2]というモデルを使う。
BBoxとターゲットをノイズで置き換えた画像を用意すれば学習してくれるようなので、コンペの一部のデータセット（256x256のクリップ1000枚くらい）を使って学習を試みる。

[1] https://arxiv.org/abs/1910.07169
[2] https://github.com/yueruchen/Pedestrian-Synthesis-GAN


In [None]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the competition's train.csv metadata table from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/tensorflow-great-barrier-reef/train.csv')

In [None]:
# Count the annotations stored on each row. The `annotations` column holds the
# text of a Python list literal; ast.literal_eval parses it without executing
# arbitrary expressions the way eval() would.
train['n_annot'] = train['annotations'].apply(lambda x: len(ast.literal_eval(x)))

In [None]:
# Keep only the rows that carry at least one annotation.
train_annot = train.query('n_annot > 0')

In [None]:
# Flatten the per-row annotation lists into one record per bounding box,
# keeping the identifiers of the frame each box belongs to.
bboxes = {
    'x': [], 'y': [], 'width': [], 'height': [], 'video_id': [], 'sequence': [], 'video_frame': [],
}

for item in train_annot.itertuples():
    # literal_eval only accepts literals, so text in the CSV cannot run code
    # the way it could with eval().
    for annot in ast.literal_eval(item.annotations):
        for key, value in annot.items():
            bboxes[key].append(value)
        bboxes['video_id'].append(item.video_id)
        bboxes['sequence'].append(item.sequence)
        bboxes['video_frame'].append(item.video_frame)

# The name is rebound from the column dict to the final DataFrame.
bboxes = pd.DataFrame(bboxes)

In [None]:
# Summary statistics for every column of the bounding-box table.
bboxes.describe()

# Visualize BBox Distribution

In [None]:
# Pairwise histograms of bounding-box position and size over all boxes.
df = bboxes[['x', 'y', 'width', 'height']]
sample_df = df.copy()
g = sns.PairGrid(sample_df)
# Each map_* call returns the grid itself, so the three passes can be chained.
g.map_upper(sns.histplot).map_lower(sns.histplot).map_diag(sns.histplot, kde=False)

# Visualize Sampled Clips

1. extract bounding boxes within a specific size range
2. crop a 128x128 background patch around each BBox (resized to 256x256 when written out)

In [None]:
# DataFrame.query expression that keeps boxes whose width and height both lie in [40, 128].
BBOX_SELECT_QUERY = 'width >= 40 and height >= 40 and width <= 128 and height <= 128'

In [None]:
# For every sequence, plot how many selected boxes appear on each frame,
# drawing all sequences of a video into that video's subplot.
df = bboxes.query(BBOX_SELECT_QUERY)

plt.style.use('ggplot')
fig, axs = plt.subplots(3, 1, figsize=(15, 10))
for _, seq_boxes in df.groupby('sequence'):
    video_id = seq_boxes['video_id'].values[0]
    ax = axs[video_id]  # the video id is used directly as the subplot row index
    per_frame = seq_boxes.groupby('video_frame').agg(
        sum_cots=('video_id', 'count'),
        video_frame=('video_frame', 'min'),
    )
    ax.plot(per_frame['video_frame'], per_frame['sum_cots'])
    ax.set_title(f'Video: {video_id}')
plt.suptitle(f'BBox: {BBOX_SELECT_QUERY}', fontsize=16)
plt.tight_layout()
# Fraction of all boxes that pass the size filter.
print(f'Rate: {len(df) / len(bboxes):.4f}')

In [None]:
# Cache of decoded frames so that each image file is read from disk only once.
image_cache = {}


def _clear_cache():
    """Empty the module-level image cache in place."""
    # Mutate the existing dict instead of assigning a new one: an assignment
    # here would only bind a function-local name and leave the global cache
    # (and everything stored in it) untouched.
    image_cache.clear()
    

def load_image(bbox_df, index, image_dir="../input/tensorflow-great-barrier-reef/train_images/"):
    '''
    Load the frame that contains one specific bounding box.

    Args:
        bbox_df: DataFrame with `video_id`, `video_frame`, `x`, `y`, `width`
            and `height` columns.
        index: Label of the row in `bbox_df` (looked up with `.loc`).
        image_dir: Directory that holds the `video_<id>/<frame>.jpg` files.

    Returns:
        (img, (x, y, width, height)): the frame as returned by `plt.imread`
        and the bounding box stored on the row.
    '''
    # The row is only read, so look it up directly instead of copying the
    # whole table on every call.
    item = bbox_df.loc[index]
    video = item["video_id"]
    frame = item["video_frame"]

    # Decode each frame at most once; later boxes on the same frame reuse it.
    key = (video, frame)
    if key not in image_cache:
        image_cache[key] = plt.imread(f"{image_dir}video_{video}/{frame}.jpg")

    return image_cache[key], (item.x, item.y, item.width, item.height)


def clip_bbox_image(img, label, clip_width=128, clip_height=128, image_width=1280, image_height=720):
    '''
    Cut a fixed-size window centred on a bounding box out of an image.

    The window is shifted back inside the image when it would cross an edge.

    Args:
        img: Image array indexed as [row, column, channel].
        label: Bounding box as (x, y, width, height) in image pixels.
        clip_width, clip_height: Size of the window to cut out.
        image_width, image_height: Size of the full image.

    Returns:
      - img: clipped image array
      - label: (x, y, width, height) of the bounding box relative to the
        clipped window
    '''
    x, y, width, height = label

    # Shrink width / height so the bbox does not run past the right or bottom
    # edge of the image.
    width -= max(x + width - image_width, 0)
    height -= max(y + height - image_height, 0)

    half_w, half_h = clip_width // 2, clip_height // 2
    center_x = x + width // 2
    center_y = y + height // 2

    # Top-left corner of the window, clamped so the window stays inside the image.
    xs = min(max(0, center_x - half_w), image_width - clip_width)
    ys = min(max(0, center_y - half_h), image_height - clip_height)
    img_clip = img[ys:ys + clip_height, xs:xs + clip_width, :]

    # Bbox position inside the window: centred by default, then corrected by
    # how far the window had to be pushed back from the left/top edge (negative
    # shift) or from the right/bottom edge (positive shift). The edge tests use
    # the image_width / image_height arguments rather than fixed 1280 / 720 so
    # that other image sizes produce correct coordinates.
    shift_left = min(0, center_x - half_w)
    shift_top = min(0, center_y - half_h)
    shift_right = max(0, center_x + half_w - image_width)
    shift_bottom = max(0, center_y + half_h - image_height)
    x = max(0, half_w - width // 2) + shift_left + shift_right
    y = max(0, half_h - height // 2) + shift_top + shift_bottom

    return img_clip, (x, y, width, height)


def add_noise(img, label):
    '''
    Return a copy of `img` whose bounding-box region is replaced by random noise.

    Args:
        img: Image array indexed as [row, column, channel]; it is not modified.
        label: Bounding box as (x, y, width, height) in the coordinates of `img`.

    Returns:
        A new array of the same shape as `img`.
    '''
    x, y, width, height = label
    img_noise = img.copy()
    region = img_noise[y:y + height, x:x + width, :]
    # Size the noise from the slice that was actually obtained: when the bbox
    # runs past the image edge the slice is smaller than (height, width) and a
    # fixed-size noise array could not be assigned to it.
    # The trailing axis of length 1 gives every channel the same noise value;
    # randint's upper bound is exclusive, so values lie in [0, 254].
    region[...] = np.random.randint(0, 255, region.shape[:2] + (1,))
    return img_noise


def create_psgan_input(df, index):
    '''
    Build the side-by-side image pair for one annotated bounding box.

    Returns:
        (pair, label): the clipped image with its noise-masked copy appended
        on the right, and the bounding box in clip coordinates.
    '''
    frame, bbox = load_image(df, index)
    clean, bbox = clip_bbox_image(frame, bbox)
    noised = add_noise(clean, bbox)
    # axis=1 is the column axis, so the two clips end up next to each other.
    return np.concatenate((clean, noised), axis=1), bbox

In [None]:
def plot_clipped_bbox(
    df,
    index,
    ax=None,
    figsize=(30, 5),
):
    """
    Show the clipped / noise-masked image pair for one bounding box.

    Args:
        df: Bounding-box table passed on to `create_psgan_input`.
        index: Row label of the bounding box to show.
        ax: Axes to draw on; a new figure is created when omitted.
        figsize: Size of the new figure (only used when `ax` is None).

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    img_double, _ = create_psgan_input(df, index)
    ax.imshow(img_double, vmin=0, vmax=255)
    ax.axis("off")

    return ax

In [None]:
# Preview a random handful of clipped / noise-masked pairs in a grid.
n_figs = 12
n_cols = 3
n_rows = (n_figs - 1) // n_cols + 1  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(8 * n_cols, 4 * n_rows))
axs = axs.ravel()

# only extract sufficient large bbox
df = bboxes.copy().query(BBOX_SELECT_QUERY)

indexes = list(df.sample(n_figs).index.values)

for ax, index in zip(axs, indexes):
    plot_clipped_bbox(df, index, ax=ax)

# Write Pedestrian-Synthesis GAN Input Data

In [None]:
import os
from pathlib import Path
import shutil


# Start from an empty output tree: remove whatever an earlier run left behind.
out_dir = Path('/kaggle/working/psgan_datasets')
if out_dir.is_dir():
    shutil.rmtree(out_dir)

# create directory
out_dir.mkdir(parents=True, exist_ok=True)
image_dir = out_dir / 'images' / 'train'
label_dir = out_dir / 'bbox' / 'train'
for directory in (image_dir, label_dir):
    directory.mkdir(parents=True, exist_ok=True)
! tree psgan_datasets

In [None]:
from os import listdir

def show_one_image(root_dir):
    """Display, without axes, the first file that `listdir` reports in `root_dir`."""
    plt.style.use('default')

    first_name = listdir(root_dir)[0]
    plt.imshow(plt.imread(os.path.join(root_dir, first_name)))
    plt.axis('off')

In [None]:
import json
from PIL import Image
from tqdm import tqdm
import cv2


# Write the training pairs: one PNG and one bbox JSON per sampled box.
df = bboxes.copy().query(BBOX_SELECT_QUERY)
df = df.sample(1200, random_state=0)  # only use 1200 sample for test

for item in tqdm(df.itertuples(), total=len(df)):
    img, label = create_psgan_input(bboxes, item.Index)
    x, y, width, height = (int(v) for v in label)
    # The image is doubled in size by the resize below, so the coordinates are
    # doubled too; 'w' and 'h' hold the right and bottom edges of the box.
    label = {
        'x': x * 2,
        'y': y * 2,
        'w': (x + width) * 2,
        'h': (y + height) * 2,
    }

    # write file
    label_path = label_dir / f'{item.Index}.json'
    with open(label_path, 'w') as f:
        json.dump(label, f)

    # resize image
    img = cv2.resize(img, dsize=(512, 256), interpolation=cv2.INTER_CUBIC)
    Image.fromarray(img).save(image_dir / f'{item.Index}.png')

print('done!')

In [None]:
# Sanity check: display one file from the training image directory.
show_one_image('psgan_datasets/images/train')

# Make Validation Data

* find the parameters of the train distribution
* sample from a multivariate Gaussian distribution
* add the sampled BBoxes to non-annotated frames

## Train BBox Distribution

In [None]:
# Number of synthetic bounding boxes (and non-annotated frames) to draw for validation.
NUM_VALID_SAMPLE = 100

In [None]:
# Pairwise histograms restricted to the boxes that pass the size filter.
df = bboxes[['x', 'y', 'width', 'height']]
sample_df = df.query(BBOX_SELECT_QUERY)
g = sns.PairGrid(sample_df)
# Each map_* call returns the grid itself, so the three passes can be chained.
g.map_upper(sns.histplot).map_lower(sns.histplot).map_diag(sns.histplot, kde=False)

In [None]:
# Covariance matrix of x / y / width / height over the selected boxes.
sample_df.cov()

## Sample From Multi-Variate Distribution

In [None]:
# Mean vector and covariance matrix of the selected boxes, as plain arrays.
mean = sample_df.mean().values
cov = sample_df.cov().values

In [None]:
# Draw synthetic boxes from a Gaussian with the fitted mean / covariance and
# truncate them to whole pixels.
data = np.random.multivariate_normal(mean, cov, size=(NUM_VALID_SAMPLE)).astype(int)
sample = pd.DataFrame(data, columns=['x', 'y', 'width', 'height'])
# Assign the clipped columns back instead of calling clip(inplace=True) on
# `sample[col]`: the in-place form acts on an intermediate Series, which pandas
# warns about and which leaves `sample` unchanged under Copy-on-Write.
sample['x'] = sample['x'].clip(0, 1280 - 40)
sample['y'] = sample['y'].clip(0, 720 - 40)
# NOTE(review): width / height only get a lower bound here, so sampled boxes
# may be larger than the 128 px upper bound of BBOX_SELECT_QUERY.
sample['width'] = sample['width'].clip(lower=40)
sample['height'] = sample['height'].clip(lower=40)


g = sns.PairGrid(sample)
g.map_upper(sns.histplot)
g.map_lower(sns.histplot)
g.map_diag(sns.histplot, kde=False)
sample

## Add BBox into Non-Annotated Frames

In [None]:
# Cache of decoded frames, keyed by the "<video>-<frame>" id string.
image_cache = {}


def load_image_by_id(image_id, image_dir="../input/tensorflow-great-barrier-reef/train_images/"):
    """Return the frame for `image_id` ("<video>-<frame>"), reading the file only on a cache miss."""
    if image_id not in image_cache:
        video, frame = image_id.split('-')
        image_cache[image_id] = plt.imread(f"{image_dir}video_{video}/{frame}.jpg")

    return image_cache[image_id]

def create_psgan_input_for_valid(df, index, label):
    '''
    Build the side-by-side image pair for a frame and an externally supplied box.

    Args:
        df: Frame table with `video_id` and `video_frame` columns.
        index: Label of the row in `df` (looked up with `.loc`).
        label: Bounding box as (x, y, width, height) in full-image pixels.

    Returns:
        (pair, label): the clipped image with its noise-masked copy appended
        on the right, and the bounding box in clip coordinates.
    '''
    # The row is only read, so there is no need to copy the table on every call.
    item = df.loc[index]
    image_id = f'{item.video_id}-{item.video_frame}'

    img = load_image_by_id(image_id)
    img_clip, label = clip_bbox_image(img, label)
    img_noise = add_noise(img_clip, label)
    img_double = np.concatenate((img_clip, img_noise), axis=1)

    return img_double, label

In [None]:
def plot_clipped_bbox2(
    df,
    index,
    ax=None,
    figsize=(30, 5),
    label=None,
):
    """
    Show the clipped / noise-masked image pair for one row of `df`.

    Args:
        df: Table holding the row to show.
        index: Row label inside `df`.
        ax: Axes to draw on; a new figure is created when omitted.
        figsize: Size of the new figure (only used when `ax` is None).
        label: Optional (x, y, width, height) box. When given, the pair is
            built with `create_psgan_input_for_valid`; when omitted, the box
            is read from the row via `create_psgan_input`.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    # The box used to be passed as an undefined extra argument to the
    # two-argument create_psgan_input, which always raised; pick the builder
    # that matches the arguments that were actually supplied.
    if label is None:
        img_double, _ = create_psgan_input(df, index)
    else:
        img_double, _ = create_psgan_input_for_valid(df, index, label)
    ax.imshow(img_double, vmin=0, vmax=255)
    ax.axis("off")

    return ax

In [None]:
# Pair randomly chosen non-annotated frames with the sampled boxes and preview
# the first few resulting image pairs.
df = train.query('n_annot == 0').sample(NUM_VALID_SAMPLE).reset_index(drop=True)

n_figs = 12
n_cols = 3
n_rows = (n_figs - 1) // n_cols + 1  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 3 * n_rows))
axs = axs.ravel()

for i, frame, label in zip(range(n_figs), df.itertuples(), sample.itertuples()):
    # Drop the leading Index field; what remains is (x, y, width, height).
    label = tuple(label)[1:]

    img, label = create_psgan_input_for_valid(df, i, label)
    ax = axs[i]
    ax.imshow(img, vmin=0, vmax=255)
    ax.axis("off")

plt.suptitle('Validation Set Sample', fontsize=16)
plt.tight_layout()

In [None]:
import os
from pathlib import Path
import shutil


# Start from an empty output tree: remove whatever an earlier run left behind.
out_dir = Path('/kaggle/working/psgan_datasets_val')
if out_dir.is_dir():
    shutil.rmtree(out_dir)

# create directory
out_dir.mkdir(parents=True, exist_ok=True)
image_dir = out_dir / 'images' / 'test'
label_dir = out_dir / 'bbox' / 'test'
for directory in (image_dir, label_dir):
    directory.mkdir(parents=True, exist_ok=True)
! tree psgan_datasets_val

In [None]:
import json
from PIL import Image
from tqdm import tqdm
import cv2


# Write the validation pairs: one PNG and one bbox JSON per sampled box, each
# placed on a randomly chosen non-annotated frame.
df = train.query('n_annot == 0').sample(NUM_VALID_SAMPLE).reset_index(drop=True)

paired_rows = zip(df.itertuples(), sample.itertuples())
for i, (frame, label) in enumerate(tqdm(paired_rows, total=NUM_VALID_SAMPLE)):
    # Drop the leading Index field; what remains is (x, y, width, height).
    label = tuple(label)[1:]
    img, label = create_psgan_input_for_valid(df, i, label)

    x, y, width, height = (int(v) for v in label)
    # The image is doubled in size by the resize below, so the coordinates are
    # doubled too; 'w' and 'h' hold the right and bottom edges of the box.
    label = {
        'x': x * 2,
        'y': y * 2,
        'w': (x + width) * 2,
        'h': (y + height) * 2,
    }

    # write file
    label_path = label_dir / f'{i}.json'
    with open(label_path, 'w') as f:
        json.dump(label, f)

    # resize image
    img = cv2.resize(img, dsize=(512, 256), interpolation=cv2.INTER_CUBIC)
    Image.fromarray(img).save(image_dir / f'{i}.png')

print('done!')

In [None]:
# Sanity check: display one file from the validation image directory.
show_one_image('psgan_datasets_val/images/test')