# References
* Thanks to DIEGO GOMEZ & BAEK KYUN SHIN
* https://www.kaggle.com/diegoalejogm/great-barrier-reefs-eda-with-animations
* https://www.kaggle.com/werooring/basic-eda-starter-for-everyone

# Imports

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
from PIL import ImageDraw
from matplotlib import animation, rc
rc('animation', html='jshtml')

import ast

# Load data

In [None]:
# Read the competition CSV files; the paths are relative to the notebook's
# working directory (one level below the Kaggle `input` folder).
train = pd.read_csv('../input/tensorflow-great-barrier-reef/train.csv')
test = pd.read_csv('../input/tensorflow-great-barrier-reef/test.csv')
sample = pd.read_csv('../input/tensorflow-great-barrier-reef/example_sample_submission.csv')

# Analyze

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the last rows of the training table.
train.tail()

In [None]:
# Column dtypes, non-null counts and memory usage of the training table.
train.info()

### Checking Duplicates

In [None]:
# Number of rows that are exact duplicates (all columns equal) of an earlier row.
train.duplicated().sum()

In [None]:
# Number of distinct videos referenced by the training table.
train['video_id'].nunique()

### As you can see, there are 3 videos in total in the training dataset. Now let's look at the row count for each video

In [None]:
# Bar chart of how many rows of `train` belong to each video_id.
sns.set_theme(style="whitegrid")
ax = sns.countplot(x='video_id', data=train)

### Not all images contain Crown-of-Thorns Starfish (COTS); for images without any, the annotation is the empty list `[]`

In [None]:
# Per-video count of frames with and without COTS annotations.
# At this stage 'annotations' is still the raw CSV string, so a frame
# without any starfish is the literal string '[]'.
for i in range(3):
    # Restrict to the rows of the current video only.
    video_annotations = train.loc[train['video_id'] == i, 'annotations']
    has_boxes = video_annotations != '[]'

    print("Video " + str(i))
    print("Frames with Annotations : " + str(has_boxes.sum()))
    print("Frames without Annotations : " + str((~has_boxes).sum()))
    print("---------")

# Feature Engineering

In [None]:
# Look at the raw 'annotations' value of one row (positional index 16).
train.iloc[16].annotations
# Note: the result above is a string; it has to be converted to a list before use.

In [None]:
# Convert each annotation string into a Python list.
# ast.literal_eval only evaluates Python literals, so it is used here
# instead of eval().
train['annotations'] = train['annotations'].apply(ast.literal_eval)

### Let's create a feature that holds the number of annotations per image

In [None]:
# Number of bounding boxes in each frame = length of its annotation list.
train['num_bboxes'] = train['annotations'].map(len)

In [None]:
# Count the frames whose annotation list is empty.
rows_without_boxes = (train['num_bboxes'] == 0).sum()
print('Total rows without annotations : {}'.format(rows_without_boxes))

### Let's see the distribution of the number of COTS per image

In [None]:
# Distribution of boxes per frame, restricted to frames that have at least one box.
annotated_frames = train[train['num_bboxes'] > 0]
plt.figure(figsize=(15, 8))
sns.countplot(x='num_bboxes', data=annotated_frames)

In [None]:
# 'annotations' holds lists at this point (converted above), so .str.len()
# is the number of boxes and this selects frames with MORE THAN TWO boxes.
# NOTE(review): '> 2' looks like a leftover from when the column was a string
# ('[]' has length 2); if the intent is "any annotation", filter on
# train['num_bboxes'] > 0 instead - confirm.
train[train['annotations'].str.len() > 2]

# Validate Images

In [None]:
from os import listdir
from PIL import Image

def validate_images(video_id):
    """Check that every ``.jpg`` frame of one training video is a readable image.

    Each file in the video's frame directory is opened with PIL and passed
    through ``Image.verify()``. Files that fail are printed, and the final
    message reports whether any failures occurred.

    Args:
        video_id: Identifier used to build the ``video_{video_id}`` directory name.

    Returns:
        list[str]: Filenames that failed verification (empty if all are valid).
    """
    path = '/kaggle/input/tensorflow-great-barrier-reef/train_images/video_{}/'.format(video_id)

    print("Verifying that video {} frames are valid...".format(video_id))
    bad_files = []
    for filename in listdir(path):
        if not filename.endswith('.jpg'):
            continue
        try:
            # The context manager closes the file handle after verification,
            # so checking thousands of frames does not leak descriptors.
            with Image.open(path + filename) as img:
                img.verify()  # Verify it is in fact an image
        except (IOError, SyntaxError):
            print('Bad file:', filename)  # Print out the names of corrupt files
            bad_files.append(filename)

    # Only claim success when no corrupt file was found.
    if bad_files:
        print("Video {} has {} invalid image(s)".format(video_id, len(bad_files)))
    else:
        print("Verified! Video {} has all valid images".format(video_id))
    return bad_files

# Run the image check for each of the three training videos (ids 0, 1, 2).
for video_id in range(3):
    validate_images(video_id)

In [None]:
def fetch_image(df, video_id, frame_id):
    """Open one training frame and draw its annotated bounding boxes on it.

    Args:
        df: DataFrame with 'video_id', 'video_frame' and 'annotations' columns;
            each 'annotations' entry must be an iterable of dicts with the keys
            'x', 'y', 'width' and 'height'.
        video_id: Video to look up.
        frame_id: Value of 'video_frame' to look up; also the image file stem.

    Returns:
        The PIL image with one rectangle drawn per annotation.

    Raises:
        IndexError: If no row matches the given video/frame pair.
    """
    # Locate the metadata row first, so a missing frame fails before any file access.
    is_target = (df['video_id'] == video_id) & (df['video_frame'] == frame_id)
    row = df[is_target].iloc[0]

    img = Image.open('/kaggle/input/tensorflow-great-barrier-reef/' + f'train_images/video_{video_id}/{frame_id}.jpg')

    for box in row['annotations']:
        # Boxes are stored as top-left corner plus size; PIL wants two corners.
        left, top = box['x'], box['y']
        right, bottom = left + box['width'], top + box['height']
        ImageDraw.Draw(img).rectangle((left, top, right, bottom), outline=180, width=5)
    return img

def fetch_image_list(df, video_id, num_images, start_frame_idx):
    """Collect consecutive annotated frames of one video as NumPy arrays.

    Frames ``start_frame_idx`` through ``start_frame_idx + num_images - 1`` are
    fetched with ``fetch_image`` and each one is converted with ``np.array``.

    Returns:
        list: One array per frame, in frame order.
    """
    frames = []
    for offset in range(num_images):
        annotated = fetch_image(df, video_id, start_frame_idx + offset)
        frames.append(np.array(annotated))
    return frames

In [None]:
# Fetch 80 consecutive frames of video 0, starting at frame 25, with boxes drawn.
images = fetch_image_list(train, video_id=0, num_images=80, start_frame_idx=25)

print(f'Number of images: {len(images)}')

# Visualize COTS Animation

In [None]:
# Show every 5th of the first 40 fetched frames in a 4 x 2 grid.
grid = gridspec.GridSpec(4, 2)
plt.figure(figsize=(18, 20))

idx_list = list(range(0, 40, 5))

for i, idx in enumerate(idx_list):
    ax = plt.subplot(grid[i])
    ax.set_title(f'frame index {idx}')
    plt.imshow(images[idx], interpolation='nearest')
    plt.axis('off')

In [None]:
def create_animation(imgs, frame_interval=130):
    """Build a matplotlib animation that plays ``imgs`` in order.

    Args:
        imgs: Non-empty sequence of images accepted by ``plt.imshow``.
        frame_interval: Passed to ``FuncAnimation`` as ``interval``
            (delay between frames, in milliseconds).

    Returns:
        matplotlib.animation.FuncAnimation: The animation object. The
        underlying figure is closed in pyplot before returning, so a notebook
        shows only the animation and not an extra static copy of the first
        frame. The animation itself keeps a reference to the figure and can
        still be rendered (e.g. via ``to_jshtml``).
    """
    fig = plt.figure(figsize=(7, 4))
    plt.axis('off')
    img = plt.imshow(imgs[0])

    def animate(i):
        # Swap the pixel data of the existing artist instead of redrawing the axes.
        img.set_array(imgs[i])
        return [img]

    anim = animation.FuncAnimation(fig, animate, frames=len(imgs), interval=frame_interval)
    # Unregister the figure from pyplot so the inline backend does not also
    # display it as a static image at the end of the cell.
    plt.close(fig)
    return anim

In [None]:
# Delay between animation frames, forwarded to FuncAnimation's `interval`:
# use a smaller value for faster playback, a larger one for slower playback.
frame_interval = 130

# As the last expression of the cell, the returned animation is displayed by the notebook.
create_animation(images, frame_interval=frame_interval)