According to the Great Barrier Reef Foundation, COTS in reasonable numbers are an important part of a healthy coral reef ecosystem, since they promote the coral diversity of the reefs. However, when they appear en masse, they can negatively impact the life of the coral reef. 
This is why we would like to develop a computer vision model to detect these COTS and help protect the reef from outbreaks. 


Our EDA has highlighted several considerations for building our model. Thanks, [Diego Gomez](https://www.kaggle.com/diegoalejogm), for the [animations](https://www.kaggle.com/diegoalejogm/great-barrier-reefs-eda-with-animations). Thanks, [Prabhakaran D](https://www.kaggle.com/get2jawa), for the [basic and simple EDA](https://www.kaggle.com/get2jawa/great-barrier-reefs-basic-simple-eda). Thanks, [Shijin Yang](https://www.kaggle.com/sjyangkevin), for sharing your [EDA, Bouding Box Analysis & Annotated Videos](https://www.kaggle.com/sjyangkevin/eda-bouding-box-analysis-annotated-videos) notebook. 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
from matplotlib.image import imread
import ast
import os

In [None]:
#Load data
meta_df = pd.read_csv('../input/tensorflow-great-barrier-reef/train.csv')
meta_df_test = pd.read_csv('../input/tensorflow-great-barrier-reef/test.csv')

  **1. EDA of META-DATA**

In [None]:
meta_df.shape

In [None]:
meta_df.isnull().sum()

In [None]:
meta_df.head()

In [None]:
meta_df.describe(include='all')
# NOTE(review): NaN cells in this summary table mark statistics that do not
# apply to a column's dtype (e.g. mean of a string column); they are not
# missing data. Imputation would only be needed if isnull() reported gaps.

In [None]:
meta_df.info()

In [None]:
meta_df_cat = meta_df.select_dtypes(include = 'object').copy()
meta_df_cat.nunique()

In [None]:
meta_df.duplicated().sum()

In [None]:
meta_df.isnull().sum()

There are no null values.

In [None]:
from os import listdir
from PIL import Image

def validated_images(video_id):
    """Verify that every .jpg frame of one training video can be opened.

    Args:
        video_id: Integer id of the video; frames are read from the
            ``train_images/video_<video_id>`` directory.

    Prints the name of each file that fails verification, then a single
    summary line for the video. Returns nothing.
    """
    # Frames are stored in one sub-directory per video (video_0, video_1, ...).
    path = '../input/tensorflow-great-barrier-reef/train_images/video_{}'.format(video_id)
    print ('Verifying that video {} frames are valid...'.format(video_id))
    bad_files = []
    for filename in listdir(path):
        if not filename.endswith('.jpg'):
            continue
        try:
            # The context manager releases the file handle after verification.
            with Image.open(os.path.join(path, filename)) as image:
                image.verify()
        except (IOError, SyntaxError):
            print ('Bad file:', filename)
            bad_files.append(filename)
    # Report once per video, and only claim success when nothing failed.
    if bad_files:
        print ('Video {} has {} invalid images'.format(video_id, len(bad_files)))
    else:
        print ('Verified! Video {} has all valid images'.format(video_id))


for video_id in range(3):
    validated_images(video_id)

In [None]:
# Count how many frames each sequence contains, shortest sequences first.
sequence_counts = meta_df['sequence'].value_counts().sort_values().reset_index()
# Use a flat list of labels: a nested list would turn the columns into a
# MultiIndex, so sequence_counts['num_frames'] would not be a plain column.
sequence_counts.columns = ['sequence', 'num_frames']
print ("number of sequences:", len(sequence_counts))
sequence_counts.head()

In [None]:
num_obj_wt_frame = meta_df[meta_df.annotations =='[]']['annotations'].count()
print ("Number of frames without objects:" , num_obj_wt_frame)

In [None]:
num_obj_with_frame = meta_df[meta_df.annotations != '[]']['annotations'].count()
print ("Number of frames with objects:",num_obj_with_frame)

In [None]:
# Preview frames 0.jpg .. 8.jpg of video_0 laid out on a 3x3 grid.
folder = '../input/tensorflow-great-barrier-reef/train_images/video_0/'
for i in range(9):
    # Subplot positions are 1-based, hence the +1.
    plt.subplot(3, 3, i + 1)
    filename = '{}{}.jpg'.format(folder, i)
    image = imread(filename)
    plt.imshow(image)
plt.show()

In [None]:
meta_df[meta_df.annotations != '[]'].head()

In [None]:
num_obj_wt_frame = meta_df[meta_df.annotations =='[]']['annotations'].count()
print ("Number of frames without objects:" , num_obj_wt_frame)

In [None]:
num_obj_with_frame = meta_df[meta_df.annotations != '[]']['annotations'].count()
print ("Number of frames with objects:",num_obj_with_frame)

In [None]:
print('ratio of frames with objects:', num_obj_with_frame / len(meta_df))

fig, axes = plt.subplots(1,1, figsize=(12, 6))

sns.barplot(ax=axes, x=['Number of Frames with Objects', 'Number of Frames with No Objects'], y=[num_obj_with_frame, num_obj_wt_frame])
axes.set_title("Distribution of Frames with/without Objects")
axes.set_xlabel("Frame Types")
axes.set_ylabel("Count")

plt.show()

Most frames do not have objects in them.

In [None]:
frame_counts = meta_df['video_id'].value_counts().sort_values().to_frame()
frame_counts.head()

In [None]:
meta_df[meta_df.annotations.str.len()>2]

In [None]:
meta_df.annotations.map(len).value_counts()

In [None]:
meta_df ['video_id'].value_counts()

In [None]:
meta_df['sequence'].value_counts()

In [None]:
sns.set_theme(style='whitegrid')
ax=sns.countplot(x='video_id',data=meta_df)

In [None]:
meta_df_test.head()

In [None]:
meta_df_test.shape

**2. VISUALISATION OF IMAGES**

In [None]:

from os import listdir
from PIL import Image
def validate_images(video_id):
    """Verify that every .jpg frame of one training video can be opened.

    Args:
        video_id: Integer id of the video; frames are read from the
            ``train_images/video_<video_id>`` directory.

    Prints the name of each file that fails verification, then a single
    summary line for the video. Returns nothing.
    """
    # Frames are stored in one sub-directory per video (video_0, video_1, ...).
    path = '../input/tensorflow-great-barrier-reef/train_images/video_{}'.format(video_id)
    print ('Verifying that video {} frames are valid...'.format(video_id))
    bad_files = []
    for filename in listdir(path):
        if not filename.endswith('.jpg'):
            continue
        try:
            # The context manager releases the file handle after verification.
            with Image.open(os.path.join(path, filename)) as image:
                image.verify()
        except (IOError, SyntaxError):
            print ('Bad file:', filename)
            bad_files.append(filename)
    # Only claim success when no file failed verification.
    if bad_files:
        print('Video {} has {} invalid images'.format(video_id, len(bad_files)))
    else:
        print('Verified! Video {} has all valid images'.format(video_id))


for video_id in range(3):
    validate_images(video_id)

In [None]:
# Per-video count of frames with and without COTS annotations.
# A frame without any object carries the string '[]' in `annotations`.

for i in range(3):
    # Restrict to the current video first, then split on empty/non-empty.
    video_annotations = meta_df[meta_df['video_id'] == i]['annotations']
    has_boxes = video_annotations != '[]'
    print ("Video" + str(i))
    print ("Frames with annotations:" + str(has_boxes.sum()))
    print ("Frames without annotations: " + str((~has_boxes).sum()))
    print("-------")

In [None]:
#load sequence of images with annotations
from PIL import Image, ImageDraw
import numpy as np

def fetch_image_list(df_tmp, video_id, num_images, start_frame_idx):
    '''
    Load a run of consecutive frames from one video with their boxes drawn.

    Args:
        df_tmp: DataFrame with `video_id`, `video_frame` and `annotations`
            columns. `annotations` may hold either the raw string or an
            already-parsed list of box dicts.
        video_id: Id of the video whose frames are loaded.
        num_images: Number of consecutive frames to load.
        start_frame_idx: `video_frame` value of the first frame to load.

    Returns:
        A list of `num_images` numpy arrays, one per frame, in frame order.

    Raises:
        IndexError: if a requested frame has no matching row in `df_tmp`.
    '''
    path_base = '../input/tensorflow-great-barrier-reef/train_images/video_{}/{}.jpg'
    # Select this video's rows once instead of re-filtering the whole
    # DataFrame for every frame.
    df_video = df_tmp[df_tmp.video_id == video_id]

    def fetch_image(frame_id):
        # The context manager closes the file handle once the pixels are copied.
        with Image.open(path_base.format(video_id, frame_id)) as raw_image:
            row_frame = df_video[df_video.video_frame == frame_id].iloc[0]
            bounding_boxes = row_frame.annotations
            # Annotations are strings when read from the CSV, but may already
            # be lists if the parsing cell has been run.
            if isinstance(bounding_boxes, str):
                bounding_boxes = ast.literal_eval(bounding_boxes)

            # One drawing context per image is enough for all of its boxes.
            draw = ImageDraw.Draw(raw_image)
            for box in bounding_boxes:
                x0, y0 = box['x'], box['y']
                x1, y1 = x0 + box['width'], y0 + box['height']
                draw.rectangle((x0, y0, x1, y1), outline=180, width=3)
            # np.array copies the pixel data, so it outlives the closed image.
            return np.array(raw_image)

    return [fetch_image(start_frame_idx + index) for index in range(num_images)]

images = fetch_image_list(meta_df, video_id = 0, num_images = 80, start_frame_idx = 25)

print("Num images: ", len(images))
plt.imshow(images[0], interpolation='nearest')
plt.axis('off')
plt.show()

In [None]:
from matplotlib import animation, rc
rc('animation',html = 'jshtml')

def create_animation(ims):
    """Build a matplotlib animation that steps through the frames in *ims*.

    Args:
        ims: Indexable collection of images accepted by ``plt.imshow``;
            the first element is used as the initial frame.

    Returns:
        The ``animation.FuncAnimation`` object, with one animation frame per
        element of *ims*.
    """
    fig = plt.figure(figsize=(9, 9))
    plt.axis('off')
    displayed = plt.imshow(ims[0])

    def _show_frame(frame_idx):
        # Swap the pixel data of the existing artist instead of redrawing.
        displayed.set_array(ims[frame_idx])
        return [displayed]

    # Delay between frames in milliseconds (about 12 frames per second).
    frame_delay_ms = 1000/12
    return animation.FuncAnimation(
        fig,
        _show_frame,
        frames=len(ims),
        interval=frame_delay_ms,
    )

create_animation(images)

In [None]:
#Check the meta-data against image data 

In [None]:
# List the file names found in each video's frame directory
# (names only; no image data is read here).
video_0 = os.listdir('../input/tensorflow-great-barrier-reef/train_images/video_0')
video_1 = os.listdir('../input/tensorflow-great-barrier-reef/train_images/video_1')
video_2 = os.listdir('../input/tensorflow-great-barrier-reef/train_images/video_2')

In [None]:
#Creating the image paths
meta_df['image_path'] = "video_"+ meta_df['video_id'].astype(str)+'/'+ meta_df['video_frame'].astype(str)+".jpg"
meta_df.head()

In [None]:
def show_image(meta_df, idx):
    """Open the training image referenced by the row at position *idx*.

    Args:
        meta_df: DataFrame with an `image_path` column holding paths
            relative to the `train_images` directory.
        idx: Positional (iloc) index of the row to display.

    Returns:
        The opened PIL image.
    """
    # Imported locally so the function does not depend on an `import PIL`
    # that is only executed in a later cell.
    from PIL import Image

    f_name = meta_df.iloc[idx]['image_path']
    return Image.open("../input/tensorflow-great-barrier-reef/train_images/"+f_name)

In [None]:
# Finding out whether all the image paths are existing regular files 
meta_df['image_path'].apply(lambda f_name:os.path.isfile("../input/tensorflow-great-barrier-reef/train_images/"+f_name)).all()

In [None]:
import PIL
show_image(meta_df,0).resize((400,256))

In [None]:
#Check the npy files

In [None]:
pixels = np.load("../input/tensorflow-great-barrier-reef/example_test.npy")

In [None]:
pixels.shape

In [None]:
PIL.Image.fromarray(pixels[0, :]).resize((400, 256))

In [None]:
PIL.Image.fromarray(pixels[2, :]).resize((400, 256))

In [None]:
#Evaluate annotation strings containing literals
ast.literal_eval(meta_df.iloc[16].annotations)

In [None]:
# Parse each annotation string into a Python list.
# ast.literal_eval accepts only Python literals, so unlike eval it cannot
# execute arbitrary code embedded in the CSV. Values that are already parsed
# are passed through unchanged so this cell can be re-run safely.
meta_df['annotations'] = meta_df['annotations'].apply(
    lambda ann: ast.literal_eval(ann) if isinstance(ann, str) else ann
)
meta_df[meta_df['annotations'].str.len()>1].iloc[0]['annotations']

In [None]:
#Distribution of boxes in frames
meta_df['n_objects'] = meta_df['annotations'].str.len()
meta_df.value_counts('n_objects').plot.bar(figsize=(10,5),alpha=0.5,rot=0,title='Distribution of boxes in frames');

The hope for this competition is that we can find a better way to tame COTS outbreaks ethically and effectively. Maybe in the future we will not have to kill a massive number of COTS.

Please feel free to ask any questions or give us any feedback on our work. Thank you. 