In [None]:
# Standard libraries
import ast
import datetime
import glob
import json
import os
import random

# Third-party libraries
import PIL.Image
import PIL.ImageDraw
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Jupyter widgets
from IPython.core.display import display, HTML

# Configurations
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Part 1: Images
Create the dataframe with the JPEG images available on disk.

In [None]:
# Location of dataset
DATASET_PATH = '../input/airbus-oil-storage-detection-dataset'

# List all images in the folder.
# The image id is the file name stripped of its directory and of its final
# extension only, so that `image_id + '.jpg'` always points back to the file
# (a name such as "a.b.jpg" keeps the id "a.b" instead of being cut to "a").
# os.path is used so the split does not depend on the path separator.
# The listing is sorted because glob returns files in arbitrary order.
image_paths = sorted(glob.glob(DATASET_PATH + "/images/*.jpg"))
image_list = [os.path.splitext(os.path.basename(path))[0] for path in image_paths]

# Build the frame from a dict so the 'image_id' column exists even when the
# folder is empty (renaming column 0 of an empty frame would leave no column).
image_ids = pd.DataFrame({"image_id": image_list})
print("Number of images in folder: {}".format(len(image_ids)))
image_ids.head()

## Part 2: Annotations
Now we want to add the bounding box information to the dataframe. A bounding box is a rectangle drawn around a detected object. The coordinates of two points are enough to describe it, for example the top-left and bottom-right corners.
![](assets/bbox_image.jpg)

In [None]:
# Turn the textual form of a record back into the Python object it spells out
def f(x):
    """Parse a Python literal stored as text.

    Trailing carriage returns / line feeds are removed first, then the text
    is evaluated with ``ast.literal_eval`` (literals only, no code execution).

    Parameters
    ----------
    x : str
        Text holding a Python literal, e.g. ``"(1, 2, 3, 4)"``.

    Returns
    -------
    The object described by the literal.
    """
    without_line_ending = x.rstrip('\r\n')
    return ast.literal_eval(without_line_ending)

# Load the annotations; the 'bounds' column is stored as text and is parsed
# into a Python object by `f` while the file is being read
annotations_csv = DATASET_PATH + '/annotations.csv'
bounds_converters = {'bounds': f}
labels = pd.read_csv(annotations_csv, converters=bounds_converters)

# Optional sanity step (disabled): left-join the annotations onto the image list
#labels = image_ids.merge(right=labels, how='left')

# Preview the first rows
labels.head()

In [None]:
# Number of distinct images that carry at least one annotation row
n_annotated_images = len(labels['image_id'].unique())
print("Total number of unique 'image_ids' in annotations: {}".format(n_annotated_images))

In [None]:
# One row per annotation, so the row count is the annotation count
n_annotations = len(labels)
print("Total number of annotations: {}".format(n_annotations))

In [None]:
# Distinct values found in the 'class' column, in order of first appearance
class_names = labels['class'].unique().tolist()
print("Available classes: {}".format(class_names))

## Part 3: Compute some statistics
Compute the number of objects in each image and store it in a pandas Series named `histo` (indexed by `image_id`).

In [None]:
# Annotation rows per image; value_counts() orders the result from the most
# to the least annotated image
histo = labels['image_id'].value_counts()
print("Images with more than 400 oil storage tanks")
histo.loc[histo > 400]

In [None]:
# Add number of storage tanks per tile in the dataframe and sort it.
# `histo` is indexed by image_id, so mapping every row's image_id through it
# gives that image's annotation count in a single vectorized lookup.
# (`.at` only addresses a single cell and must not be used with a slice to
# create a whole column.)
labels['records'] = labels['image_id'].map(histo)
labels = labels.sort_values(by=['records'], ascending=False)
labels.head(10)

In [None]:
# Bar chart of the number of annotation rows per image
fig, ax = plt.subplots(figsize=(25, 15))
ax.set_title("Number of Oil Storage Tanks per image")
ax.grid(which='both')
g = sns.countplot(x='image_id', data=labels, ax=ax)
ax.set_xlabel("image_id")
ax.set_ylabel("n row per image_id")

# Image ids are long: turn the tick labels vertical
g.set_xticklabels(labels=g.get_xticklabels(), rotation=90);
# Alternative: hide them altogether
# g.set_xticklabels(labels=[None]);

# Keep a copy of the figure on disk
fig.savefig("objects-per-image.png")

## Part 4: Plot some images

In [None]:
# Build the closed outline of a bounding box
def create_polygon_from_bounds(bbox):
    """Return the corner points of a box as a closed ring.

    Parameters
    ----------
    bbox : sequence of four values ``(xmin, ymin, xmax, ymax)``.

    Returns
    -------
    list of five ``(x, y)`` tuples: the four corners followed by the first
    corner again, so the outline is closed.
    """
    left, top, right, bottom = bbox
    return [
        (left, top),
        (left, bottom),
        (right, bottom),
        (right, top),
        (left, top),
    ]

def overlay_image(image_id, bbox_df):
    """Open a dataset image and outline its annotated boxes in red.

    Parameters
    ----------
    image_id : str
        File name (without the '.jpg' extension) under DATASET_PATH/images.
    bbox_df : pandas.DataFrame
        Annotations with an 'image_id' column and a 'bounds' column; only the
        rows whose 'image_id' equals `image_id` are drawn.

    Returns
    -------
    The opened PIL image with the box outlines drawn onto it.
    """
    image_path = DATASET_PATH + "/images/" + image_id + '.jpg'
    img = PIL.Image.open(image_path)
    draw = PIL.ImageDraw.Draw(img)

    belongs_to_image = bbox_df['image_id'] == image_id
    for _, annotation in bbox_df[belongs_to_image].iterrows():
        outline_points = create_polygon_from_bounds(annotation['bounds'])
        draw.polygon(outline_points, outline=(255,0,0))
        # Optional: also write the class name at the first corner
        #draw.text(outline_points[0], annotation['class'], fill=(255,0,0))

    return img

# Choose the image to render: a random one, or one of the most annotated
#pickone = random.choice(image_ids.to_numpy().tolist())[0]
#pickone = "9892f3a0-f541-43b8-bc62-d640701841f7" # 540 annotations
pickone = "1fcb9fee-da89-43f8-83d9-b5d17575f5e6" # 893 annotations

# Draw the boxes, keep a copy on disk and show the result inline
filename = "oil-storage-sample.jpg"
img = overlay_image(pickone, labels)
img.save(filename)
display(img)

## Part 5: More Exploratory Data Analysis
### Compute width and height of each storage tank
Add the width and the height in pixels of each storage tank to the dataframe.
Then display some useful statistics and plot a histogram.

In [None]:
def getWidth(bounds):
    """Return the horizontal extent of a bounding box.

    Parameters
    ----------
    bounds : sequence of four numbers ``(xmin, ymin, xmax, ymax)``.

    Returns
    -------
    ``abs(xmax - xmin)``, or ``np.nan`` when `bounds` cannot be unpacked into
    four values or its x coordinates cannot be subtracted.
    """
    try:
        (xmin, ymin, xmax, ymax) = bounds
        return np.abs(xmax - xmin)
    except (TypeError, ValueError):
        # Malformed record (not iterable, wrong length, non-numeric values).
        # Only these are turned into NaN: a bare `except` would also swallow
        # KeyboardInterrupt / SystemExit.
        return np.nan

def getHeight(bounds):
    """Return the vertical extent of a bounding box.

    Parameters
    ----------
    bounds : sequence of four numbers ``(xmin, ymin, xmax, ymax)``.

    Returns
    -------
    ``abs(ymax - ymin)``, or ``np.nan`` when `bounds` cannot be unpacked into
    four values or its y coordinates cannot be subtracted.
    """
    try:
        (xmin, ymin, xmax, ymax) = bounds
        return np.abs(ymax - ymin)
    except (TypeError, ValueError):
        # Malformed record (not iterable, wrong length, non-numeric values).
        # Only these are turned into NaN: a bare `except` would also swallow
        # KeyboardInterrupt / SystemExit.
        return np.nan
# Derive the pixel width and height of every box from its bounds
for column, measure in (('width', getWidth), ('height', getHeight)):
    labels[column] = labels['bounds'].apply(measure)

# Preview the result
labels.head()

In [None]:
# Summary statistics of the numeric columns of the annotations
labels.describe()

#### What conclusions can we draw from these statistics?
Are there any annotations that we can discard?

In [None]:
# Distribution of box widths, ignoring rows whose width could not be computed
fig, ax = plt.subplots(figsize=(25, 15))
known_widths = labels['width'].dropna()
sns.distplot(known_widths, ax=ax)
# Zoom on the 0-150 px range
ax.set_xlim(0, 150)
plt.show()

In [None]:
# Distribution of box heights.
# Rows are dropped on the *height* column itself (the plotted one), not on
# 'width', so no missing height can reach the plot.
plt.figure(figsize=(25, 15))
sns.distplot(labels['height'].dropna())
# Zoom on the 0-150 px range
plt.xlim(0, 150)
plt.show()

### Compute the aspect ratio of each storage tank
Clean the results in order to remove NaN and infinite elements.
Then display some useful statistics and plot a histogram.

In [None]:
# Aspect ratio = longer side / shorter side, so it is always >= 1 for a valid
# box. A zero-length shorter side gives a non-finite value; those rows are
# filtered out when `safe_labels` is built.
# (`.at` only addresses a single cell and must not be used with a slice to
# create a whole column.)
box_sides = labels[['height', 'width']]
labels['aspect_ratio'] = box_sides.max(axis=1) / box_sides.min(axis=1)
labels['aspect_ratio'].describe()

In [None]:
# Keep only the rows whose aspect ratio is a usable number (neither infinite nor missing)
ratio = labels['aspect_ratio']
has_usable_ratio = np.isfinite(ratio) & ratio.notnull()
safe_labels = labels[has_usable_ratio]
safe_labels['aspect_ratio'].describe()

In [None]:
# Distribution of the cleaned aspect ratios, using 100 bins
fig, ax = plt.subplots(figsize=(25, 15))
sns.distplot(safe_labels['aspect_ratio'], bins = 100, ax=ax)
plt.show()

#### Based on this information, can we discard some annotations?
Display the annotations whose bounding box has an unusual aspect ratio.

In [None]:
# Select the records whose aspect ratio exceeds 2.5 and show them
is_elongated = safe_labels['aspect_ratio'] > 2.5
strange_labels = safe_labels[is_elongated]
strange_labels

In [None]:
# Draw one unusual annotation at random and outline only that box on its image
pickone = strange_labels.sample()
print(pickone)
sampled_image_id = pickone['image_id'].tolist()[0]
img = overlay_image(sampled_image_id, pickone)
display(img)

### Cleaning by aspect ratio
Very small objects (typically under 5 pixels) will not be handled correctly by YOLO and might not be oil storage tanks anyway. Aspect ratios over 2.5 seem very unusual as well. Remove both kinds of annotations from the training dataset.

In [None]:
# Thresholds: boxes narrower / shorter than these (in pixels) or more
# elongated than this ratio are dropped
keep_tags_wt_width_over_px = 5
keep_tags_wt_height_over_px = 5
bb_aspect_ratio_upper_limit =  2.5

# A box is too small when its width OR its height is under its own threshold
# (the height is compared with the height threshold, not the width one)
filter_too_small = (
    (safe_labels['width'] < keep_tags_wt_width_over_px)
    | (safe_labels['height'] < keep_tags_wt_height_over_px)
)
print(filter_too_small.sum(), "records too small")
filter_ratio_too_high = safe_labels['aspect_ratio'] > bb_aspect_ratio_upper_limit
print(filter_ratio_too_high.sum(), "records with too high aspect ratio ")

# Keep the rows flagged by neither filter
cleaned_labels = safe_labels[~(filter_too_small | filter_ratio_too_high)]