# Import

In [None]:
import warnings
warnings.filterwarnings("ignore")
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import pandas as pd
import os
import numpy as np
import cv2
import ast

# Preprocessing

In [None]:
%%time

# Load the training annotations table.
train_df = pd.read_csv('../input/tensorflow-great-barrier-reef/train.csv')

# Keep only rows whose annotation string is longer than 2 characters
# (presumably this drops the empty-list literal "[]").
# .copy() makes the filtered frame independent, so the column assignments
# below are not chained assignments on a slice of the original frame.
train_df = train_df[train_df['annotations'].str.len() > 2].copy()

# Parse the annotation string into a list of dicts with keys x, y, width, height.
train_df['ann'] = train_df['annotations'].apply(ast.literal_eval)

# Number of annotations per row.
train_df['count'] = train_df['ann'].str.len()

# Boxes as [x_min, y_min, x_max, y_max] corner coordinates.
train_df['bboxes'] = train_df['ann'].apply(
    lambda anns: [
        [a['x'], a['y'], a['x'] + a['width'], a['y'] + a['height']]
        for a in anns
    ]
)

# Per-row box areas (width * height). Each cell is deliberately a one-element
# list wrapping the numpy array, so consumers of this column keep seeing the
# same nesting as before.
train_df['areas'] = train_df['ann'].apply(
    lambda anns: [np.array([a['width'] * a['height'] for a in anns])]
)

In [None]:
# Peek at five randomly chosen rows of the preprocessed frame.
train_df.sample(n=5)

# Filtering out annotations in the smallest 10% by area

In [None]:
# Flatten the per-row area arrays into one plain Python list holding the area
# of every annotated box. np.ravel strips the wrapping list around each array.
area_list = np.concatenate([np.ravel(cell) for cell in train_df['areas']]).tolist()

print("Total crops : " + str(len(area_list)))
print(
    "Mean ", np.mean(area_list),
    " Median ", np.median(area_list),
)

In [None]:
# Fraction of the smallest boxes (by area) to discard.
filter_per = 0.10 # bottom 10 %

# Areas in ascending order; everything before f_index is the discarded tail.
s_area_list = np.sort(area_list)
f_index = int(filter_per * len(area_list))
f_area_list = s_area_list[f_index:]

# Smallest area that survives the cut; used as the threshold by later cells.
min_area = f_area_list[0]

print(
    "Actual min area:", s_area_list[0],
    ' After 10% filtered min area : ', min_area,
)

In [None]:
# Two panels: area histogram on the left, box plots on the right.
fig = make_subplots(rows=1, cols=2)

# Left: distribution of all box areas, with the discarded range [0, min_area]
# shaded behind the bars.
fig.add_trace(go.Histogram(x=area_list, name='Area'), row=1, col=1)
fig.add_vrect(
    x0=0,
    x1=min_area,
    fillcolor="LightSalmon",
    opacity=0.5,
    layer="below",
    line_width=0,
    col=1,
)

# Right: box plots of the areas before and after filtering.
for values, label in ((area_list, 'Area'), (f_area_list, 'Filtered_Area')):
    fig.add_trace(go.Box(x=values, name=label), row=1, col=2)

fig.show()

In [None]:
%%time
# Per row, keep only the boxes whose area (computed from the corner
# coordinates) is at least min_area.
train_df['f_bboxes'] = train_df['bboxes'].apply(
    lambda boxes: [
        box for box in boxes
        if (box[2] - box[0]) * (box[3] - box[1]) >= min_area
    ]
)
train_df['f_count'] = train_df['f_bboxes'].str.len()

# Rows that still have at least one box after filtering.
f_train_df = train_df.loc[train_df['f_count'].ne(0)]

print(
    "Actual data frame len :", len(train_df),
    " after filtered : ", len(f_train_df),
)

In [None]:
# Rows worth inspecting visually: more than 3 boxes survive the filter AND the
# filter removed at least one box (filtered count differs from original count).
candidates = f_train_df[
    (f_train_df['f_count'] > 3) & (f_train_df['f_count'] != f_train_df['count'])
]

# Fixed random_state so a fresh Restart & Run All picks the same rows.
# n is capped at the pool size because DataFrame.sample raises ValueError when
# asked for more rows than exist (without replacement).
sample_df = candidates.sample(n=min(5, len(candidates)), random_state=42)
sample_df

# Visualizing the effect of the filter

In [None]:
# Root directory of the training images and the prefix of each per-video folder.
train_dir = '../input/tensorflow-great-barrier-reef/train_images'
video_pre = 'video_'


def get_img_path(v_id, img_id):
    """Build the path of one image: <train_dir>/<video_pre><v_id>/<img_id>.jpg."""
    video_folder = f"{video_pre}{v_id}"
    file_name = f"{img_id}.jpg"
    return os.path.join(train_dir, video_folder, file_name)

def get_cropped_images(v_id, image_id, bboxes):
    """Return one cropped image per box, in the order the boxes are given.

    The image is located with get_img_path(v_id, image_id), and each entry of
    `bboxes` is passed unchanged to Image.crop.
    """
    frame = Image.open(get_img_path(v_id, image_id))
    return [frame.crop(box) for box in bboxes]

def draw_bboxes(v_id, image_id, bboxes, outline='Red', width=10):
    """Open an image and draw a rectangle on it for every box.

    Args:
        v_id: video identifier, forwarded to get_img_path.
        image_id: image identifier, forwarded to get_img_path.
        bboxes: iterable of boxes; each is passed unchanged to
            ImageDraw.rectangle (the notebook uses [x0, y0, x1, y1] lists).
        outline: rectangle outline colour. Defaults to 'Red', the value that
            was previously hard-coded.
        width: outline thickness in pixels. Defaults to 10, the value that was
            previously hard-coded.

    Returns:
        The opened PIL image with the rectangles drawn onto it.
    """
    img = Image.open(get_img_path(v_id, image_id))
    # ImageDraw draws in place, so `img` itself carries the rectangles.
    draw = ImageDraw.Draw(img)
    for bbox in bboxes:
        draw.rectangle(bbox, outline=outline, width=width)
    return img

# One figure per sampled row: all original boxes on the left, only the boxes
# that survived the area filter on the right.
for _, row in sample_df.iterrows():
    plt.figure(figsize=(20, 15))

    for position, boxes in enumerate((row.bboxes, row.f_bboxes), start=1):
        plt.subplot(1, 2, position)
        plt.imshow(draw_bboxes(row.video_id, row.video_frame, boxes))

    plt.show()