In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import ast # safe parsing of Python literals stored as text

import numpy as np # numeric library
import pandas as pd # data structure library

import matplotlib.pyplot as plt # plot library

In [None]:
# load the train/test metadata tables of the competition
path = "../input/tensorflow-great-barrier-reef/"
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")

# one row per image, so the row count is the image count
n_train, n_test = train.shape[0], test.shape[0]
print('Train Images: {}'.format(n_train))
print('Test Images: {}'.format(n_test))

# at this point 'annotations' is still raw text: '[]' marks an image without starfish
n_without = int((train['annotations'] == '[]').sum())
n_with = int((train['annotations'] != '[]').sum())
print('Train Images without Starfish: {} and Train Images with Starfish: {}'.format(n_without, n_with))
print('Percentage of Images with Starfish: {:.2f}%'.format(n_with / n_train * 100))

In [None]:
# Add images path to dataframe: <train_images>/video_<video_id>/<frame>.jpg,
# where <frame> is the part of image_id after the first "-"
train["image_path"] = (
    "../input/tensorflow-great-barrier-reef/train_images/video_"
    + train["video_id"].astype(str)
    + "/"
    + train["image_id"].str.split("-").str[1]
    + ".jpg"
)

# Parse the annotation text into Python lists of dicts.
# ast.literal_eval only accepts literals, so (unlike eval) it cannot execute code
# that might be embedded in the CSV. The isinstance guard keeps this cell
# idempotent: re-running it leaves already-parsed lists untouched.
train["annotations"] = train["annotations"].apply(
    lambda a: ast.literal_eval(a) if isinstance(a, str) else a
)

# viewing some sample rows of the dataframe
train.head()

In [None]:
from collections import OrderedDict
def draw_bar_graph(ax, x, y, div_factor, *args):
    """Draw a bar chart of ``y`` against ``x`` on ``ax`` with a value label above each bar.

    Parameters
    ----------
    ax : object exposing the matplotlib Axes methods used below
        (``bar``, ``set_xticks``, ``set_xticklabels``, ``text``, ``set_title``,
        ``set_xlabel``, ``set_ylabel``).
    x, y : iterables of equal length
        Bar positions and bar heights. Bars are drawn in ascending order of
        ``x``; if an ``x`` value repeats, its last ``y`` wins.
    div_factor : number
        Each value label is shifted left of the bar centre by ``width / div_factor``.
    *args : optional strings ``(xlabel, ylabel, title_prefix)``
        ``xlabel`` / ``ylabel`` are applied when given. When exactly three are
        given, the title is ``"<title_prefix>: <sum of x*y>"``.

    Returns
    -------
    The same ``ax`` that was passed in.
    """
    # pair every x with its y, then order the pairs by x
    d = dict(sorted(dict(zip(x, y)).items()))
    keys = list(d.keys())
    values = list(d.values())

    width = 0.75 # the width of the bars

    ax.bar(keys, values, width=width, align='center', edgecolor='darkblue', facecolor='lightblue')
    # set ticks and labels in two calls: the two-argument set_xticks(ticks, labels)
    # form is only understood by matplotlib >= 3.5
    ax.set_xticks(keys)
    ax.set_xticklabels(keys)

    # this draws the value labels slightly above each bar
    for i, v in d.items():
        ax.text(i-width/div_factor, v+10, str(v), color='black', fontweight='bold')

    if len(args) == 3:
        # dot product of heights and positions = sum over bars of (x * y)
        ax.set_title('{}: {}'.format(args[2], np.dot(values, keys)))

    # axis labels are optional; skip the ones that were not supplied
    if len(args) >= 1:
        ax.set_xlabel(args[0])
    if len(args) >= 2:
        ax.set_ylabel(args[1])
    return ax

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12.5, 5))

# left panel: number of images in each video
image_count = train['video_id'].value_counts()
axes[0] = draw_bar_graph(axes[0], image_count.index, image_count.values, 10, 'Video Id', 'Number of Images')

# right panel: per video, share of images with/without starfish
# (relies on 'annotations' already being parsed into lists, so length 0 == no starfish)
has_starfish = train['annotations'].str.len() != 0
temp_series1 = train[~has_starfish].groupby('video_id').size()
temp_series2 = train[has_starfish].groupby('video_id').size()

# creating a dataframe; a video absent from one of the two groups is aligned as NaN,
# which really means "zero images", so fill it to keep the percentages finite
temp_df = pd.DataFrame({ 'Starfish (without)': temp_series1, 'Starfish (with)': temp_series2 }).fillna(0)

# take the video ids from the data instead of assuming they are exactly 0, 1, 2
video_id = temp_df.index.tolist()

# From raw value to percentage
totals = temp_df.sum(axis=1)
greenBars = (temp_df['Starfish (without)'] / totals * 100).tolist()
orangeBars = (temp_df['Starfish (with)'] / totals * 100).tolist()

# plot
width = 0.75
# Create green Bars
axes[1].bar(video_id, greenBars, color='#119999', edgecolor='white', width=width, label='Starfish (without)')
# Create orange Bars, stacked on top of the green ones
axes[1].bar(video_id, orangeBars, bottom=greenBars, color='#f9bc86', edgecolor='white', width=width, label='Starfish (with)')

# percentage labels: x comes from the video id, the bar values from the list position
for pos, vid in enumerate(video_id):
    axes[1].text(vid-width/5, greenBars[pos]-50, '{:.2f}%'.format(greenBars[pos]), color='black', fontweight='bold')
    axes[1].text(vid-width/5, greenBars[pos]+3, '{:.2f}%'.format(orangeBars[pos]), color='black', fontweight='bold')

# Custom x axis (two calls so it also works on matplotlib < 3.5)
axes[1].set_xticks(video_id)
axes[1].set_xticklabels(video_id)
axes[1].set_xlabel("Video id")
axes[1].set_ylabel("Percentage of Images")

# Add a legend
axes[1].legend(loc='lower right')

plt.tight_layout()
plt.show()
plt.close() # doesn't hold the image in memory anymore

In [None]:
fig, ax = plt.subplots(figsize=(10, 7.5))

# how many images contain 1, 2, 3, ... starfish (images with none are left out)
boxes_per_image = train['annotations'].str.len()
starfish_count = boxes_per_image[boxes_per_image != 0].value_counts()

draw_bar_graph(
    ax,
    starfish_count.index,
    starfish_count.values,
    3,
    'Number of Starfish per Image',
    'Number of Images',
    'Distribution of starfish numbers across images\nTotal starfish in all images',
)

plt.tight_layout()
plt.show()
plt.close() # doesn't hold the image in memory anymore

In [None]:
# creating a new dataframe temp_df holding just the bounding box information
# (one row per box); it feeds the bounding box size / aspect ratio histograms below
#
# explode() turns every list element into its own row while keeping the row label
# of the image in `train`; images with an empty list become NaN and are dropped.
# Building the table with a single constructor call avoids creating and
# concatenating one small DataFrame per image.
boxes = train['annotations'].explode().dropna()
temp_df = pd.DataFrame(boxes.tolist(), index=boxes.index)

# temp_df will contain all the 11898 starfish bounding box details
temp_df['area'] = temp_df['height'] * temp_df['width']
temp_df['aspect_ratio'] = temp_df['width'] / temp_df['height']
print('Number of bounding boxes (which is also the number of starfish): {}'.format(temp_df.shape[0]))
temp_df.head()

In [None]:
fig, axes = plt.subplots(2,2, figsize=(15, 10))

# one histogram per bounding box area range (in pixel^2):
# gives an idea how big/small starfish objects are within the images
area = temp_df['area']
area_panels = [
    (axes[0, 0], area <= 1600),                      # area <= 1600
    (axes[0, 1], (area > 1600) & (area <= 5000)),    # 1600 < area <= 5000
    (axes[1, 0], (area > 5000) & (area <= 10000)),   # 5000 < area <= 10000
    (axes[1, 1], area > 10000),                      # area > 10000
]

for panel, in_range in area_panels:
    area[in_range].hist(ax=panel)
    # same x and y labels on every panel
    panel.set_xlabel('Bounding box area, (pixel)$^2$')
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()
plt.close()

In [None]:
# distribution of bounding box aspect ratios (width / height), split at 1.0:
# left panel = boxes taller than wide, right panel = boxes at least as wide as tall
fig, axes = plt.subplots(1,2, sharey=True, figsize=(12.5, 5))

ratios = temp_df['aspect_ratio']
ratios[ratios < 1.0].hist(ax=axes[0])
ratios[ratios >= 1.0].hist(ax=axes[1])

for panel in axes:
    panel.set_xlabel('Aspect Ratio')
# the y axis is shared, so only the left panel gets a label
axes[0].set_ylabel('Frequency')

plt.tight_layout()
plt.show()
plt.close()

In [None]:
# temp_df is indexed by the row label of the image in `train`, so idxmax gives that label
largest_box_row = temp_df['area'].idxmax()
print('The image ID with the largest bounding box area: {}'.format(largest_box_row))

# we have seen an image can contain upto 18 starfish(es): finding one such image id
# (idxmax returns the first row that reaches the maximum)
most_starfish_row = train['annotations'].str.len().idxmax()
print('The image ID with the highest number of starfish: {}'.format(most_starfish_row))

In [None]:
from PIL import Image, ImageDraw

# draw an image with bounding boxes: argument is the dataframe and image id
def img_viz(df, id):
    """Open the image of row ``id`` and outline each of its annotated boxes in red.

    ``df`` must provide an ``image_path`` column (file to open) and an
    ``annotations`` column (iterable of dicts with ``x``, ``y``, ``width``
    and ``height``); ``id`` is a row label of ``df``.
    Returns the PIL image with the rectangles drawn onto it.
    """
    img = Image.open(df.loc[id, 'image_path'])

    for box in df.loc[id, 'annotations']:
        # rectangle corners: top-left (x, y) and bottom-right (x + width, y + height)
        left, top = box['x'], box['y']
        right, bottom = left + box['width'], top + box['height']
        ImageDraw.Draw(img).rectangle([left, top, right, bottom], outline ="red", width=3)

    return img

# a few sample images to get an idea of the image quality, surroundings, shapes of starfish inside them, etc.
fig, axes = plt.subplots(2,2, figsize=(15, 10))

# (grid position, row label in train, panel title)
sample_panels = [
    ((0, 0), 5474, 'An image with starfish'),
    ((0, 1), 7336, 'The image with the largest bounding box containing starfish'),
    ((1, 0), 12679, 'One of three images with the highest number (18) of starfish'),
    ((1, 1), 1, 'An image with no starfish'),
]

for (row, col), sample_id, caption in sample_panels:
    panel = axes[row, col]
    panel.set_title(caption)
    panel.imshow(img_viz(train, id=sample_id))

plt.tight_layout()
plt.show()
plt.close() # doesn't hold the image in memory anymore

In [None]:
# a larger, single-panel view of one annotated image (row 7336 of train)
fig, axes = plt.subplots(figsize=(15, 10))

img1 = img_viz(train, id=7336)
axes.imshow(img1)
axes.set_title('An image with starfish')

plt.tight_layout()
plt.show()
plt.close()

<h2>Summary</h2>
<ul>
    <li>The images vary in quality (e.g., clarity). The background colour seems to be affected as well (not always sea blue?). </li>
    <li>The surroundings should definitely be considered when choosing the object (starfish) detection algorithm - this also holds when applying 'transfer learning' (a pre-built model for object detection).</li>
    <li>There are different bounding box sizes within the image. Some of them are very hard to identify with the naked eye. Is there a need to apply 'image augmentation' techniques especially for starfish bounding boxes (the larger ones?) that are not abundant in quantity?</li>
    <li>Semantic segmentation algorithms could be used to learn only the 'semantic' information of the starfish. In other words, there is no need to differentiate between multiple starfish. As long as a starfish object can be detected and localised within the image correctly, the task will be achieved.</li>
    <li>The projection angle of the scene seems to be important too, e.g., the bigger starfish will be easier to identify than the distant ones. Is there a way to differentiate between different projections, e.g., does the presence of predominant blue give a notion of whether the image is taken from farther away (or from a difficult angle) compared with images where blue is not the predominant colour?</li>
    <li>The initial choice of algorithm: Mask R-CNN algorithm?</li>
</ul>