# Great Barrier Reefs


## Notebook d'exploration des données 
Plan:
* Exploration et visualisation des données
* Explication sur le F2-score avec exemples

# Référence
* https://www.kaggle.com/diegoalejogm/great-barrier-reefs-eda-with-animations

In [None]:
# Install the PEP 8 checker (pycodestyle) and the nbpep8 notebook wrapper;
# nbpep8 is fetched from the TestPyPI index given by --index-url.
!pip install pycodestyle
!pip install --index-url https://test.pypi.org/simple/ nbpep8

In [None]:
from nbpep8.nbpep8 import pep8


In [None]:
'''
To always stay within the PEP8 standards, we call pep8(_ih) in every code cell in order to check our code.
For example, in the statement a=1 below, the spaces around the operator are
missing (and the first line of this comment is deliberately too long).
'''
a=1
pep8(_ih)

## Installation des outils et des chemins


In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import image
import matplotlib.patches as patches
from PIL import Image
import pickle
import cv2
import ast
import sys
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn')

pep8(_ih)

In [None]:
# Root directory of the competition dataset; list its contents.
data_path = '../input/tensorflow-great-barrier-reef'
!ls {data_path}
pep8(_ih)

In [None]:
# greatbarrierreef/ : image delivery API; list the folder contents
!ls {os.path.join(data_path, 'greatbarrierreef/')}
pep8(_ih)

In [None]:
# train_images/ : training data folder, containing 3 video folders
# named video_{video_id}; list the folder contents
!ls {os.path.join(data_path, 'train_images/')}
pep8(_ih)

# Exploration

**Exploration des données de la compétition pour comprendre comment les annotations se répartissent dans les vidéos**

In [None]:
# Load the training metadata table and show rows 12 to 21 as a sample.
df_train = pd.read_csv(os.path.join(data_path, 'train.csv'))
df_train[12:22]

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the training table.
df_train.info()
pep8(_ih)

In [None]:
# Report how many videos there are, then the size and colour mode of the
# first frame (0.jpg) of each video.
video_ids = df_train['video_id'].unique()
print(f'Video count : {len(video_ids)}')
for video_id in video_ids:
    img_path = os.path.join(data_path, 'train_images',
                            f'video_{video_id}', '0.jpg')
    # The context manager closes the file handle once the header is read.
    with Image.open(img_path) as im:
        print(f'Video {video_id} : {im.size}, {im.mode}')
# Frame size as (width, height); used as axis bounds by the plotting cells.
SIZE = (1280, 720)
pep8(_ih)

In [None]:
def get_video_data(df_train):
    """Count frames and annotated frames per video and per sequence.

    A frame counts as annotated when its ``annotations`` value is present
    and differs from the string ``'[]'``.

    Args:
        df_train: DataFrame with at least the columns ``video_id``,
            ``sequence`` and ``annotations``.

    Returns:
        dict keyed by video id. Each value holds ``'frames_count'``,
        ``'frames_with_annot_count'`` and ``'sequence'``, the latter being a
        dict keyed by sequence id with the same two counters.
    """
    video_data = {}
    for video_id in df_train['video_id'].unique():
        df_video = df_train.loc[df_train['video_id'] == video_id]
        data_sequence = {}
        video_data[video_id] = {}
        video_data[video_id]['frames_count'] = 0
        video_data[video_id]['frames_with_annot_count'] = 0
        for sequence in df_video['sequence'].unique():
            df_sequence = df_video.loc[df_video['sequence'] == sequence]
            # Build the mask from the sequence rows themselves: a mask taken
            # from the whole table only works through index alignment and
            # breaks as soon as the index is not unique.
            seq_annots = df_sequence['annotations']
            seq_annotations = {}
            seq_annotations['frames_count'] = len(df_sequence)
            seq_annotations['frames_with_annot_count'] = seq_annots[seq_annots != '[]'].count()
            data_sequence[sequence] = seq_annotations
            video_data[video_id]['frames_count'] += seq_annotations['frames_count']
            video_data[video_id]['frames_with_annot_count'] += seq_annotations['frames_with_annot_count']
        video_data[video_id]['sequence'] = data_sequence
    return video_data


def print_video_data(video_data):
    """Print the frame statistics of every video, then of each sequence.

    Args:
        video_data: nested dict keyed by video id, with the counters
            ``'frames_count'`` and ``'frames_with_annot_count'`` and a
            ``'sequence'`` dict holding the same counters per sequence id.
    """
    for video_id, video_stats in video_data.items():
        total = video_stats['frames_count']
        annotated = video_stats['frames_with_annot_count']
        print(f'Video {video_id} : {total} frames, {annotated} frames with annotation(s)')
        for sequence_id, seq_stats in video_stats['sequence'].items():
            seq_total = seq_stats['frames_count']
            seq_annotated = seq_stats['frames_with_annot_count']
            print(f'  Sequence {sequence_id} : {seq_total} frames, {seq_annotated} with annotation(s)')
        # Blank separator between two videos.
        print('\n')


# Aggregate the frame statistics of the training table and print them.
video_data = get_video_data(df_train)
print_video_data(video_data)
pep8(_ih)

In [None]:
def plot_video_data(video_data):
    """Draw three bar charts, one bar per video.

    The panels show the frame count, the number of sequences and the number
    of frames carrying annotations.

    Args:
        video_data: nested statistics dict keyed by video id, with the keys
            ``'frames_count'``, ``'sequence'`` and
            ``'frames_with_annot_count'`` for each video.
    """
    plt.style.use('seaborn')
    _, axes = plt.subplots(1, 3, figsize=(15, 5))
    names = [f'Video {video_id}' for video_id in video_data]
    per_video = list(video_data.values())
    # One entry per panel: bar heights, y-axis label, title.
    panels = (
        ([stats['frames_count'] for stats in per_video],
         'frames', 'Frames count per video'),
        ([len(stats['sequence']) for stats in per_video],
         'sequences', 'Sequences count per video'),
        ([stats['frames_with_annot_count'] for stats in per_video],
         'annotations', 'Annotations count per video'),
    )
    for axis, (heights, ylabel, title) in zip(axes, panels):
        axis.bar(names, heights, width=0.3)
        axis.set_ylabel(ylabel)
        axis.set_title(title)


# Plot the per-video statistics.
plot_video_data(video_data)
pep8(_ih)

In [None]:
def plot_frame_data(video_data):
    """Draw one stacked bar chart per video: frames per sequence.

    Each bar stacks the frames with annotation(s) and the frames without.

    Args:
        video_data: nested statistics dict keyed by video id; every video
            has a ``'sequence'`` dict whose values hold ``'frames_count'``
            and ``'frames_with_annot_count'``.
    """
    if not video_data:
        # Nothing to draw; plt.subplots cannot build a grid with 0 columns.
        return
    # squeeze=False keeps a 2-D axes array even for a single video.
    fig, axs = plt.subplots(1, len(video_data), figsize=(15, 5), squeeze=False)
    # Pair each video with the next free axis instead of indexing the axes
    # by video id, which only works when the ids are exactly 0..n-1.
    for ax, (video_id, stats) in zip(axs[0], video_data.items()):
        sequences = stats['sequence']
        labels = [f'Seq. {key}' for key in sequences]
        annot = [value['frames_with_annot_count'] for value in sequences.values()]
        no_annot = [value['frames_count'] - value['frames_with_annot_count'] for value in sequences.values()]
        # Bar width scales with the number of sequences shown.
        width = 0.5 * len(annot) / 8
        ax.bar(labels, annot, width=width, label='annotation(s)')
        ax.bar(labels, no_annot, width=width, label='no annotation', bottom=annot)
        ax.set_ylabel('frames')
        ax.tick_params(axis='x', labelrotation=90)
        ax.set_title(f'Video {video_id} : annotations count per sequence')
        ax.legend()


# Plot the annotated / non-annotated frame counts of every sequence.
plot_frame_data(video_data)
pep8(_ih)

In [None]:
def get_annotation_count(df_train):
    """Return a sorted copy of the table with an ``annots_count`` column.

    Args:
        df_train: DataFrame with the columns ``video_id``, ``sequence``,
            ``sequence_frame`` and ``annotations`` (a string holding a
            Python list literal).

    Returns:
        A new DataFrame sorted by video, sequence and frame, where
        ``annots_count`` is the number of items in each ``annotations`` list.
        The input frame is left unchanged.

    Raises:
        ValueError, SyntaxError: if an ``annotations`` value is not a valid
            Python literal.
    """
    df_sorted = df_train.sort_values(by=['video_id', 'sequence', 'sequence_frame'])
    # literal_eval only parses literals, so text coming from the CSV can
    # never be executed as code (which eval would allow).
    df_sorted['annots_count'] = df_sorted['annotations'].apply(
        lambda annots: len(ast.literal_eval(annots)))
    return df_sorted


# Add the per-frame annotation count, then tally how many frames have each
# count.
df_train = get_annotation_count(df_train)
df_annot = df_train['annots_count'].value_counts()


def plot_annot_distrib(df_annot):
    """Bar chart of the number of frames for each annotation count.

    Args:
        df_annot: Series indexed by annotation count per frame, whose values
            are the number of frames having that count.
    """
    _, axis = plt.subplots(figsize=(8, 5))
    counts = df_annot.index
    axis.bar(counts, df_annot, tick_label=counts)
    axis.set_xlabel('annotation count per frame')
    axis.set_ylabel('frames')


# Plot the distribution of annotation counts per frame.
plot_annot_distrib(df_annot)
pep8(_ih)

In [None]:
def get_annotation_time(df_train):
    """Collect, per video and sequence, the annotation counts in frame order.

    Args:
        df_train: DataFrame with the columns ``video_id``, ``sequence``,
            ``sequence_frame`` and ``annots_count``.

    Returns:
        dict ``{video_id: {sequence: array}}`` where each array holds the
        ``annots_count`` values ordered by ``sequence_frame``.
    """
    annot_data = {}
    for video_id in df_train['video_id'].unique():
        video_rows = df_train[df_train['video_id'] == video_id]
        per_sequence = {}
        for sequence in video_rows['sequence'].unique():
            sequence_rows = video_rows[video_rows['sequence'] == sequence]
            ordered = sequence_rows.sort_values(by='sequence_frame')
            per_sequence[sequence] = ordered['annots_count'].values
        annot_data[video_id] = per_sequence
    return annot_data


# Annotation counts over time for every sequence of every video.
annotation_time = get_annotation_time(df_train)


def plot_annotation_time(annotation_time):
    """Draw one line plot per sequence: annotation count against frame.

    Args:
        annotation_time: dict ``{video_id: {sequence: counts}}`` where
            ``counts`` is the sequence of annotation counts to plot.
    """
    for video_id, sequences in annotation_time.items():
        for sequence in sequences:
            counts = sequences[sequence]
            # A wide, short figure per sequence.
            _, axis = plt.subplots(figsize=(15, 2))
            axis.plot(counts)
            axis.set_title(f'Video {video_id}, sequence : {sequence}')
            axis.set_xlabel('time (frame)')
            axis.set_ylabel('annotation count')


# Plot the annotation count over time for each sequence.
plot_annotation_time(annotation_time)
pep8(_ih)

In [None]:
def get_annotation_pos_and_size(df_train):
    """Gather every bounding box of each sequence into one array.

    Args:
        df_train: DataFrame with the columns ``video_id``, ``sequence`` and
            ``annotations``; each ``annotations`` value is a string holding
            a list of dicts with the keys ``x``, ``y``, ``width``, ``height``.

    Returns:
        dict ``{video_id: {sequence: array}}``. Each array has one row per
        box with the columns ``x, y, width, height``; a sequence without
        any box gets an empty array (``shape[0] == 0``).

    Raises:
        ValueError, SyntaxError: if an ``annotations`` value is not a valid
            Python literal.
        KeyError: if a box lacks one of the four expected keys.
    """
    annot_data = {}
    for video_id in df_train['video_id'].unique():
        df_video = df_train.loc[df_train['video_id'] == video_id]
        annot_data[video_id] = {}
        for sequence in df_video['sequence'].unique():
            df_sequence = df_video.loc[df_video['sequence'] == sequence]
            # literal_eval parses the text without executing it, and the
            # keys are read explicitly so the column order never depends on
            # the key order inside the dicts.
            boxes = [
                [box['x'], box['y'], box['width'], box['height']]
                for raw in df_sequence['annotations']
                for box in ast.literal_eval(raw)
            ]
            annot_data[video_id][sequence] = np.array(boxes)
    return annot_data


# Bounding boxes of every sequence, grouped by video.
annotation_pos_and_size = get_annotation_pos_and_size(df_train)


def plot_annotation_pos(annotation_pos_and_size):
    """Scatter-plot the box positions of every annotated sequence.

    One figure is drawn per sequence that has at least one box; columns 0
    and 1 of each array are used as the horizontal and vertical position.

    Args:
        annotation_pos_and_size: dict ``{video_id: {sequence: array}}`` of
            box arrays (one row per box).
    """
    for video_id, sequences in annotation_pos_and_size.items():
        for sequence, annot in sequences.items():
            if annot.shape[0] == 0:
                continue  # no box in this sequence, nothing to plot
            fig, ax = plt.subplots(1, 1, figsize=(8, 6))
            ax.scatter(annot[:, 0], annot[:, 1], alpha=0.5)
            ax.set_ylabel('height')
            ax.set_xlabel('width')
            ax.set_xbound(0, SIZE[0])
            ax.set_ybound(0, SIZE[1])
            # Box coordinates are pixel positions measured from the top of
            # the frame (they are drawn that way on the images); flip the
            # y axis so the scatter is not mirrored vertically.
            ax.invert_yaxis()
            ax.set_title(f'Starfish position (video {video_id}, sequence: {sequence})')


def plot_annotation_size(annotation_pos_and_size):
    """Box plots of bounding-box width, height and width/height ratio.

    The figure has one column per video and three rows (width, height,
    ratio); each box plot has one box per sequence containing boxes.

    Args:
        annotation_pos_and_size: dict ``{video_id: {sequence: array}}`` of
            box arrays whose columns 2 and 3 are the width and the height.
    """
    n_videos = len(annotation_pos_and_size)
    if n_videos == 0:
        # Nothing to draw; plt.subplots cannot build a grid with 0 columns.
        return
    # The column count comes from the argument (not from the global table),
    # and squeeze=False keeps a 2-D axes array even for a single video.
    fig, axs = plt.subplots(3, n_videos, figsize=(15, 15), squeeze=False)
    for idx, (video_id, sequences) in enumerate(annotation_pos_and_size.items()):
        annotated = {sequence: annot for sequence, annot in sequences.items()
                     if annot.shape[0] != 0}
        sequence_id = list(annotated)
        width = [annot[:, 2] for annot in annotated.values()]
        height = [annot[:, 3] for annot in annotated.values()]
        ratio_wh = [annot[:, 2] / annot[:, 3] for annot in annotated.values()]
        # One entry per row: data, y-axis label, title.
        panels = (
            (width, 'width', f'Bounding box width (video {video_id})'),
            (height, 'height', f'Bounding box height (video {video_id})'),
            (ratio_wh, 'ratio', f'Bounding box ratio width / height (video {video_id})'),
        )
        for ax, (values, ylabel, title) in zip(axs[:, idx], panels):
            ax.boxplot(values, labels=sequence_id)
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', labelrotation=90)
            ax.set_xlabel('sequence')
            ax.set_title(title)
    fig.tight_layout()


# Plot the bounding-box size statistics of every video.
plot_annotation_size(annotation_pos_and_size)
pep8(_ih)

# Visualisation 

#### On veut visualiser les images issues des vidéos : on écrit une fonction qui affiche une image et son histogramme.
#### Comme nos images sont en couleur, on passe d'abord par une représentation en luminance et chrominance (YUV), puis on applique l'égalisation uniquement sur la luminance Y afin d'obtenir un histogramme égalisé, et donc une image de meilleure qualité pour la visualisation.

In [None]:
# Load frame 0 of video 0 and display it without axes.
img_0 = plt.imread(data_path+"/train_images/video_0/0.jpg")
plt.axis("off")
plt.imshow(img_0)
plt.show()
pep8(_ih)

### Histogram equalization

In [None]:
def Histogram_equalization(img):
    """Show an image, its YUV version and its equalized version.

    Each of the three rows shows a picture on the left and the histogram of
    all its pixel values on the right. The equalization is applied to the
    Y (luminance) channel only, then the image is converted back.

    Args:
        img: colour image array in RGB channel order, as returned by
            ``plt.imread`` for the frames used in this notebook
            (assumed 8-bit, which ``cv2.equalizeHist`` requires).
    """
    fig = plt.figure(figsize=(25, 15))
    bins = range(256)

    def _show_row(row, picture, picture_title, hist_title):
        # Draw one row of the 3x2 grid: picture on the left (axes hidden),
        # histogram on the right (axes kept so the scale stays readable).
        ax_img = fig.add_subplot(3, 2, 2 * row + 1)
        ax_hist = fig.add_subplot(3, 2, 2 * row + 2)
        ax_img.imshow(picture)
        ax_img.set_title(picture_title)
        ax_img.axis("off")
        ax_hist.hist(picture.flatten(), bins=bins, color='navy')
        ax_hist.set_title(hist_title)

    # original image
    _show_row(0, img, 'img', 'Original histogram')
    # YUV representation; the input is RGB, so the RGB conversion codes are
    # used (the BGR ones would swap the red and blue weights of Y).
    img_yuv = cv2.cvtColor(img, cv2.COLOR_RGB2YUV)
    _show_row(1, img_yuv, 'img YUV', 'Histogram img YUV')
    # equalize the luminance channel only, then convert back to RGB
    img_yuv[:, :, 0] = cv2.equalizeHist(img_yuv[:, :, 0])
    img_output = cv2.cvtColor(img_yuv, cv2.COLOR_YUV2RGB)
    _show_row(2, img_output, 'img equalized', 'Histogram equalized')


pep8(_ih)

In [None]:
# Show the equalization steps on frame 0.
Histogram_equalization(img_0)
pep8(_ih)

In [None]:
# Equalize the luminance (Y) channel of frame 0 and display the result.
# img_0 was loaded with plt.imread, i.e. in RGB order, so the RGB <-> YUV
# conversion codes are used (the BGR codes would swap the red and blue
# weights when computing Y).
plt.axis("off")
img_yuv = cv2.cvtColor(img_0, cv2.COLOR_RGB2YUV)
img_yuv[:, :, 0] = cv2.equalizeHist(img_yuv[:, :, 0])
img_output = cv2.cvtColor(img_yuv, cv2.COLOR_YUV2RGB)
plt.imshow(img_output)
pep8(_ih)

### Filter

In [None]:
''' Apply the non-local means filter to test image 0
to try to improve the image'''
img_flt = cv2.fastNlMeansDenoisingColored(src=img_0, dst=None, h=10, hColor=10, templateWindowSize=7, searchWindowSize=21)
# Show the original frame (left) next to the filtered one (right).
fig = plt.figure(figsize=(16, 6))
plt.subplot(1, 2, 1)
plt.imshow(img_0)
plt.title("frame")
plt.axis("off")
plt.subplot(1, 2, 2)
plt.imshow(img_flt)
plt.title("img filter")
plt.axis("off")

pep8(_ih)

In [None]:
def get_sample_frames(df_train):
    """Pick one random frame per sequence, preferring annotated frames.

    Args:
        df_train: DataFrame with the columns ``video_id``, ``sequence`` and
            ``annotations``.

    Returns:
        list of one-row DataFrames, one per (video, sequence) pair. The row
        is drawn among the frames whose ``annotations`` differ from ``'[]'``
        or, when the sequence has none, among all its frames.
    """
    samples = []
    for video_id in df_train['video_id'].unique():
        df_video = df_train.loc[df_train['video_id'] == video_id]
        for sequence in df_video['sequence'].unique():
            df_sequence = df_video.loc[df_video['sequence'] == sequence]
            df_annotated = df_sequence.loc[df_sequence['annotations'] != '[]']
            # Explicit fallback instead of a bare ``except``: sampling an
            # empty frame raises, and a catch-all would hide real errors.
            candidates = df_sequence if df_annotated.empty else df_annotated
            samples.append(candidates.sample(1))
    return samples

def process_frame(sample):
    """Load the image and the bounding boxes of a sampled frame.

    Args:
        sample: one-row DataFrame with the columns ``video_id``,
            ``video_frame``, ``sequence`` and ``annotations``.

    Returns:
        tuple ``(frame, bboxs, video_id, sequence)`` where ``frame`` is the
        image as an array and ``bboxs`` is an array with one
        ``[x, y, width, height]`` row per box, or ``None`` when the
        annotations cannot be parsed.
    """
    # frame
    video_id = sample['video_id'].values[0]
    frame_id = sample['video_frame'].values[0]
    sequence = sample['sequence'].values[0]
    img_path = os.path.join(data_path, 'train_images', f'video_{video_id}', f'{frame_id}.jpg')
    # Copy the pixels into an array, then let the context manager close
    # the file handle.
    with Image.open(img_path) as img:
        frame = np.array(img)
    # bounding boxes
    try:
        # literal_eval parses the text without executing it.
        annotations = ast.literal_eval(sample['annotations'].values[0])
        bboxs = np.array([[box['x'], box['y'], box['width'], box['height']]
                          for box in annotations])
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError):
        # unusable annotations: report "no bounding box"
        bboxs = None
    return frame, bboxs, video_id, sequence

def display_frame(frame, bboxs, video_id, sequence):
    """Display a frame with its bounding boxes drawn in red.

    Args:
        frame: image array shown with ``imshow``.
        bboxs: iterable of boxes indexed as ``[x, y, width, height]``, or
            ``None`` when the frame has no usable box.
        video_id: video identifier, shown in the title.
        sequence: sequence identifier, shown in the title.
    """
    plt.style.use('seaborn-dark')
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    # frame
    ax.imshow(frame)
    ax.set_ylabel('height')
    ax.set_xlabel('width')
    ax.set_xbound(0, SIZE[0])
    ax.set_ybound(0, SIZE[1])
    ax.set_title(f'Frame sample from video {video_id}, sequence : {sequence}')
    # bounding boxes (None means "no box" and must not break the loop)
    for bbox in ([] if bboxs is None else bboxs):
        rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth=3, edgecolor='r', facecolor='none')
        ax.add_patch(rect)

# Draw one sample frame per sequence together with its bounding boxes.
sample_frames = get_sample_frames(df_train)

for sample in sample_frames:
    frame, bboxs, video_id, sequence = process_frame(sample)
    display_frame(frame, bboxs, video_id, sequence)


In [None]:
from PIL import Image, ImageDraw

def fetch_image_list(df_tmp, video_id, num_images, start_frame_idx):
    """Load consecutive frames of a video with their boxes drawn on them.

    Args:
        df_tmp: DataFrame with the columns ``video_id``, ``video_frame``
            and ``annotations`` (a string holding a list of dicts with the
            keys ``x``, ``y``, ``width``, ``height``).
        video_id: identifier of the video to read.
        num_images: number of consecutive frames to load.
        start_frame_idx: ``video_frame`` value of the first frame.

    Returns:
        list of ``num_images`` image arrays.

    Raises:
        IndexError: if a requested frame has no row in ``df_tmp``.
    """
    def fetch_image(frame_id):
        # Same dataset root as the rest of the notebook instead of a
        # second, hard-coded absolute path.
        img_path = os.path.join(data_path, 'train_images', f'video_{video_id}', f'{frame_id}.jpg')
        row_frame = df_tmp[(df_tmp.video_id == video_id) & (df_tmp.video_frame == frame_id)].iloc[0]
        bounding_boxes = ast.literal_eval(row_frame.annotations)

        # The pixels are copied into an array before the file is closed.
        with Image.open(img_path) as raw_img:
            draw = ImageDraw.Draw(raw_img)  # one drawing context per image
            for box in bounding_boxes:
                x0, y0, x1, y1 = (box['x'], box['y'], box['x']+box['width'], box['y']+box['height'])
                draw.rectangle((x0, y0, x1, y1), outline=180, width=3)
            return np.array(raw_img)

    return [fetch_image(start_frame_idx + index) for index in range(num_images)]

# Load 80 consecutive frames of video 0, starting at frame 25, and show the
# first one.
images = fetch_image_list(df_train, video_id = 0, num_images = 80, start_frame_idx = 25)

print("Num images: ", len(images))
plt.imshow(images[0], interpolation='nearest')
plt.axis('off')
plt.show()
pep8(_ih)

#### Vidéos avec annotations 

In [None]:
def add_annotations(img, annotations, color="red", thickness=3):
    """
    Adds annotations to an image using cv2.

    img: image passed to cv2.rectangle; it is drawn on in place and returned
    annotations: [list] of dictionaries with the annotation details
        (keys "x", "y", "width", "height")
    color: "red", "black", or an explicit colour tuple given in the channel
        order of img
    thickness: line thickness of the rectangles

    Raises ValueError when color is an unknown colour name.
    """
    named_colors = {
        "red": (0, 0, 255),  # Red
        "black": (0, 0, 0),  # Black
    }
    if isinstance(color, str):
        # An unknown name used to leave box_color undefined and fail later
        # with an obscure UnboundLocalError.
        if color not in named_colors:
            raise ValueError(
                f"Unknown color {color!r}; expected one of {sorted(named_colors)} or a colour tuple"
            )
        box_color = named_colors[color]
    else:
        box_color = tuple(color)
    for a in annotations:
        cv2.rectangle(
            img,
            (a["x"], a["y"]),
            (a["x"] + a["width"], a["y"] + a["height"]),
            box_color,
            thickness=thickness,
        )

    return img


def create_reef_video(
    train,
    video_id,
    start_video_frame,
    end_video_frame,
    annotate=True,
    output_filename="./test.mp4",
    FPS=30,
    image_dir="../input/tensorflow-great-barrier-reef/train_images/",
):
    """Build an H.264 video from a range of frames of one training video.

    The frames are first written to a temporary ``mp4v`` file with OpenCV,
    which is then re-encoded with ffmpeg (libx264) into ``output_filename``.
    The temporary file is always removed.

    Args:
        train: DataFrame with the columns ``video_id``, ``video_frame`` and
            ``annotations``.
        video_id: identifier of the video to render.
        start_video_frame: first ``video_frame`` value (inclusive).
        end_video_frame: last ``video_frame`` value (inclusive).
        annotate: when true, draw the bounding boxes on every frame.
        output_filename: path of the resulting ``.mp4`` file; an existing
            file is overwritten.
        FPS: frame rate of the video.
        image_dir: folder containing the ``video_{id}/{frame}.jpg`` images.

    Returns:
        ``output_filename``.

    Raises:
        FileNotFoundError: if a frame image cannot be read.
        subprocess.CalledProcessError: if ffmpeg exits with an error.
    """
    # Local import: the notebook only imports subprocess in a later cell.
    import subprocess

    # Output frame size passed to the video writer.
    width = 1280
    height = 720

    # cv2 is imported at the top of the notebook, so going through it keeps
    # this function independent of the later "from cv2 import ..." cell.
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")

    temp_fn = output_filename.replace(".mp4", "") + "_temp.mp4"

    subset_df = train.query(
        "video_id == @video_id and video_frame >= @start_video_frame and video_frame <= @end_video_frame"
    ).reset_index(drop=True)

    try:
        video_file = cv2.VideoWriter(temp_fn, fourcc, float(FPS), (width, height))
        try:
            for _, example in tqdm(subset_df.iterrows(), total=len(subset_df)):
                video = example["video_id"]
                frame = example["video_frame"]
                image_fn = f"{image_dir}video_{video}/{frame}.jpg"
                img = cv2.imread(image_fn)
                if img is None:
                    # cv2.imread signals an unreadable file by returning None.
                    raise FileNotFoundError(f"Could not read frame image: {image_fn}")
                if annotate:
                    # literal_eval parses the text without executing it.
                    annotations = ast.literal_eval(example["annotations"])
                    img = add_annotations(img, annotations)
                video_file.write(img)
        finally:
            # Release the writer even when a frame fails.
            video_file.release()

        subprocess.run(
            [
                "ffmpeg",
                "-y",  # overwrite the output so re-running the cell works
                "-i",
                temp_fn,
                "-crf",
                "18",
                "-preset",
                "veryfast",
                "-vcodec",
                "libx264",
                output_filename,
                "-loglevel",
                "error",
            ],
            check=True,  # surface encoding failures instead of ignoring them
        )
    finally:
        if os.path.exists(temp_fn):
            os.remove(temp_fn)

    return output_filename


pep8(_ih)

In [None]:
# Load the competition tables: train, test and the example submission.
train = pd.read_csv("../input/tensorflow-great-barrier-reef/train.csv")
test = pd.read_csv("../input/tensorflow-great-barrier-reef/test.csv")
ss = pd.read_csv("../input/tensorflow-great-barrier-reef/example_sample_submission.csv")

# NOTE(review): this tuple is not the last expression of the cell, so it is
# presumably not displayed by the notebook — confirm.
train.shape, test.shape
pep8(_ih)

In [None]:
from cv2 import VideoWriter, VideoWriter_fourcc
import subprocess
from IPython.display import Video
pep8(_ih)

In [None]:
# Render frames 9090 to 9172 of video 1, with annotations, to example-1.mp4.
create_reef_video(
    train,
    output_filename="example-1.mp4",
    annotate=True,
    video_id=1,
    start_video_frame=9090,
    end_video_frame=9172,
)
# NOTE(review): this Video object is not the last expression of the cell, so
# it is presumably not rendered by the notebook — confirm.
Video("example-1.mp4", width=800)
pep8(_ih)

In [None]:
# Render frames 5600 to 5800 of video 2, with annotations, to example-2.mp4.
create_reef_video(
    train,
    output_filename="example-2.mp4",
    annotate=True,
    video_id=2,
    start_video_frame=5600,
    end_video_frame=5800,
)
# NOTE(review): this Video object is not the last expression of the cell, so
# it is presumably not rendered by the notebook — confirm.
Video("example-2.mp4", width=900)

pep8(_ih)

In [None]:
# Render frames 4500 to 4700 of video 0, with annotations, to example-3.mp4.
create_reef_video(
    train,
    output_filename="example-3.mp4",
    annotate=True,
    video_id=0,
    start_video_frame=4500,
    end_video_frame=4700,
)
# NOTE(review): this Video object is not the last expression of the cell, so
# it is presumably not rendered by the notebook — confirm.
Video("example-3.mp4", width=900)

pep8(_ih)

## F2-score

 #### La métrique de la compétition est le f2-score dont la formule est: $$F2 = 5 \cdot \frac{precision \cdot recall}{4\cdot precision + recall}$$
 #### L'objectif d'un tel score est de pénaliser davantage les faux négatifs que les faux positifs : dans ce problème, il est en effet plus grave de manquer un COTS (faux négatif) que de signaler un COTS qui n'en est pas un (faux positif).
 #### Le score F1 est la moyenne harmonique entre la précision et le rappel : $$F1 = 2 \cdot \frac{precision \cdot recall}{precision + recall}$$
 

In [None]:
''' Small examples to understand the F2-score and
why it is the better choice when false negatives
matter more than false positives,
with a comparison against the F0.5 and F1 scores'''
from sklearn.metrics import fbeta_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


def compare_f_score(y_true, y_pred):
    """Return ``[precision, recall, F0.5, F1, F2]`` for the predictions.

    Precision and recall are returned as computed; the three F-scores are
    rounded to 4 decimals.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
    """
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    rounded_f_scores = [
        round(score, 4)
        for score in (
            fbeta_score(y_true, y_pred, beta=0.5),
            f1_score(y_true, y_pred),
            fbeta_score(y_true, y_pred, beta=2.0),
        )
    ]
    return [precision, recall, *rounded_f_scores]


pep8(_ih)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion Matrix
def cf_matrix(y_true, y_pred):
    """Plot the confusion matrix next to a bar chart of the scores.

    The left panel is a heat map of the confusion matrix; the right panel
    shows the values returned by ``compare_f_score`` (precision, recall,
    F0.5, F1, F2).

    Args:
        y_true: true labels.
        y_pred: predicted labels.
    """
    # Local name distinct from the function name, to avoid shadowing it.
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(15, 6))
    plt.subplot(121)
    ax = sns.heatmap(matrix, annot=True)
    ax.set_xlabel("Predicted labels", color="g")
    ax.set_ylabel("True labels", color="navy")
    plt.title("Matrice de confusion", fontsize=18)

    plt.subplot(122)
    # Labels are defined here, in the order compare_f_score returns its
    # values, instead of relying on a global set in a later cell.
    labels = ['p', 'r', 'f05', 'f1', 'f2']
    scores = compare_f_score(y_true, y_pred)
    # Keyword arguments: recent seaborn versions reject positional x / y.
    sns.barplot(x=labels, y=scores)

    plt.show()

In [None]:
# Bar labels, in the same order as the values returned by compare_f_score.
x=['p','r','f05','f1','f2']

In [None]:
# Example 1: every sample is predicted positive (5 true positives,
# 5 false positives, no false negative).
y_true = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
cf_matrix(y_true, y_pred)

pep8(_ih)

In [None]:
# Example 2: 3 true positives, 1 false positive, 2 false negatives.
y_true = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [1, 0, 0, 0, 0, 0, 0, 1, 1, 1]
cf_matrix(y_true, y_pred)

pep8(_ih)

In [None]:
# Example 3: 3 true positives, 4 false positives, 2 false negatives.
y_true = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [1, 1, 1, 1, 0, 0, 0, 1, 1, 1]
cf_matrix(y_true, y_pred)

pep8(_ih)

In [None]:
# Example 4: 5 true positives, 4 false positives, no false negative.
y_true = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [1, 1, 1, 1, 0, 1, 1, 1, 1, 1]
cf_matrix(y_true, y_pred)

pep8(_ih)