In [None]:
import gc #garbage collector
import ast # operate with string representation of list
import os

import pandas as pd
import numpy as np
import cv2
from scipy.stats import shapiro # for normal distribution checks 

import plotly.express as px
import plotly.graph_objects as go
import plotly
plotly.offline.init_notebook_mode(connected = True)

import matplotlib.pyplot as plt

# Global Wheat Detection. EDA

This notebook is dedicated to exploratory data analysis for the [Global Wheat Detection](https://www.kaggle.com/c/global-wheat-detection) competition. The main focus is on visualizing the differences between datasets from different sources and selecting key components for further data augmentation.


### Utilities

In [None]:
def convert_coords(bbox):
    '''
    Turn a dataframe-style box into the two corner points cv2.rectangle expects.

    bbox: iterable of exactly four values (x, y, width, height).
    Returns a pair of tuples: ((x, y), (x + width, y + height)).
    '''
    left, top, box_width, box_height = bbox
    return (left, top), (left + box_width, top + box_height)

def plot_samples(df, img_ids=None, threshold=6, title=''):
    '''
    Plot a grid of training images with their boundary boxes drawn on top.
    https://stackoverflow.com/questions/46615554/how-to-display-multiple-images-in-one-figure-correctly/46616645

    df: dataframe with an 'image_id_ext' column (image file name) and a 'bbox'
        column holding '[x, y, width, height]' strings.
    img_ids: file names to plot; defaults to the first `threshold` unique
        values of df['image_id_ext'].
    threshold: how many images to take when img_ids is not given.
    title: figure title.

    Images are read from TRAIN_DIR_PATH. Raises FileNotFoundError when an
    image cannot be read.
    '''
    if img_ids is None:
        img_ids = df['image_id_ext'].unique()[:threshold]
    cols = 3
    # Ceiling division (at least one row): a full last row must not be
    # followed by an extra empty one.
    rows = max(1, -(-len(img_ids) // cols))
    fig = plt.figure(figsize=(15, 5 * rows))
    color = (255, 0, 0)  # red; the image is converted to RGB below
    thickness = 2
    for i, img_id in enumerate(img_ids):
        img_path = os.path.join(TRAIN_DIR_PATH, img_id)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising
            raise FileNotFoundError('Cannot read image: {}'.format(img_path))
        # cv2 loads BGR while matplotlib displays RGB; convert so colours are right
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        bboxes_list = df.loc[df['image_id_ext'] == img_id, 'bbox'].to_list()
        for item in bboxes_list:
            bbox = list(map(int, ast.literal_eval(item)))
            start_point, end_point = convert_coords(bbox)
            img = cv2.rectangle(img, start_point, end_point, color, thickness)
        fig.add_subplot(rows, cols, i + 1)
        plt.imshow(img)
    plt.suptitle(title, fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)
    plt.show()
    
# Took ideas of these functions from https://www.kaggle.com/aleksandradeis/globalwheatdetection-eda

def get_image_brightness(image_id_ext):
    '''
    Return the mean grayscale intensity of one training image.

    image_id_ext: image file name, resolved against TRAIN_DIR_PATH.
    '''
    image_path = os.path.join(TRAIN_DIR_PATH, image_id_ext)
    # load the picture and collapse it to a single grayscale channel
    grayscale = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2GRAY)
    # average over every pixel
    return np.mean(np.array(grayscale))

def get_percentage_of_green_pixels(image_id_ext):
    '''
    Return the share of pixels (a fraction between 0 and 1) whose HSV value
    falls inside the green range.

    image_id_ext: image file name, resolved against TRAIN_DIR_PATH.
    '''
    img = cv2.imread(os.path.join(TRAIN_DIR_PATH, image_id_ext))
    # convert to HSV
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # get the green mask (non-zero where the pixel is inside the range)
    hsv_lower = (40, 40, 40)
    hsv_higher = (70, 255, 255)
    green_mask = cv2.inRange(hsv, hsv_lower, hsv_higher)

    # Divide by the real number of pixels instead of a hard-coded 1024 * 1024,
    # so the result stays correct for images of any size.
    return float(np.count_nonzero(green_mask)) / green_mask.size

def get_percentage_of_yellow_pixels(image_id_ext):
    '''
    Return the share of pixels (a fraction between 0 and 1) whose HSV value
    falls inside the yellow range.

    image_id_ext: image file name, resolved against TRAIN_DIR_PATH.
    '''
    img = cv2.imread(os.path.join(TRAIN_DIR_PATH, image_id_ext))
    # convert to HSV
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # get the yellow mask (non-zero where the pixel is inside the range)
    hsv_lower = (25, 40, 40)
    hsv_higher = (35, 255, 255)
    yellow_mask = cv2.inRange(hsv, hsv_lower, hsv_higher)

    # Divide by the real number of pixels instead of a hard-coded 1024 * 1024,
    # so the result stays correct for images of any size.
    return float(np.count_nonzero(yellow_mask)) / yellow_mask.size

### Constants

In [None]:
# Root of the competition data; the train/test image folders live directly under it.
MAIN_PATH = '/kaggle/input/global-wheat-detection/'
TRAIN_DIR_PATH = MAIN_PATH + 'train/'
TEST_DIR_PATH = MAIN_PATH + 'test/'

# Boundary boxes analysis

In [None]:
# Count the files in each image folder
train_images_dir = os.path.join(MAIN_PATH, 'train')
print('Train images amount', len(os.listdir(train_images_dir)))
test_images_dir = os.path.join(MAIN_PATH, 'test')
print('Test images amount', len(os.listdir(test_images_dir)))

Note that the test dataset is extremely small. The train dataset is quite small as well, so I will have to use data augmentation.

In [None]:
train_df = pd.read_csv(os.path.join(MAIN_PATH, 'train.csv'))
sample_submission = pd.read_csv(os.path.join(MAIN_PATH, 'sample_submission.csv'))

In [None]:
train_df.head().T

In [None]:
sample_submission.head().T

Add an additional column to the train dataset with the ```.jpg``` extension

In [None]:
train_df['image_id_ext'] = train_df['image_id'] + '.jpg'

Let's look at how much the number of boundary boxes varies from image to image.

In [None]:
# Every row of train_df is one bbox, so counting the occurrences of each
# image_id gives the number of boxes on that picture.
ser = train_df['image_id']

fig = px.histogram(ser, title = 'Amount of boundary boxes on each picture')
# `labels` is keyed by column name and the Series has no 'x'/'y' columns,
# so the axis titles are set explicitly instead.
fig.update_layout(xaxis_title_text='image id', yaxis_title_text='bbox amount')
fig.update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
# Count bboxes per image: give every row a weight of 1 and sum it per image_id.
temp_df = train_df.copy()
temp_df['dummy_column'] = 1
# Select the counter column BEFORE aggregating, so the string columns
# (bbox, source, ...) are not summed as well.
ser = temp_df.groupby('image_id')['dummy_column'].sum()

In [None]:
# The Series is plotted in wide form, where the plotted values are exposed
# under the 'value' key; a 'y' key matches nothing and is ignored.
fig = px.histogram(ser, title = 'Sum distribution of bboxes amount', 
                   labels={'value':'bbox amount'})
fig.show()

Pictures usually have ~35 bboxes. The histogram also looks roughly like a normal distribution.<br>
We can use these values to validate our results: the distribution on the validation dataset should be the same.

In [None]:
# Shapiro-Wilk normality test on the per-image bbox counts
shapiro_statistic, shapiro_p_value = shapiro(ser.to_list())
print('Shapiro-Wilk test for normality result\n statistic:{:.3f}, p-value:{:.3E}'.format(shapiro_statistic, shapiro_p_value))

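Yep, normal distribution — provided the p-value printed above is larger than the chosen significance level (e.g. 0.05); a smaller p-value would mean that the Shapiro-Wilk test rejects normality.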

In [None]:
# temp_df was only a working copy for counting boxes; drop the reference and
# run the garbage collector.
del temp_df
gc.collect()

Let's check how many pictures from train set don't have any bboxes at all

In [None]:
# files on disk minus the images that have at least one row in train.csv
train_files_total = len(os.listdir(TRAIN_DIR_PATH))
annotated_images_total = len(train_df['image_id'].unique())
print('How many no-bbox images does exist in train dataset?', 
      train_files_total - annotated_images_total)

Save their ids into additional set

In [None]:
# file names present in the train folder but absent from train.csv
annotated_file_names = train_df['image_id_ext'].unique().tolist()
no_bbox_img_ids = set(os.listdir(TRAIN_DIR_PATH)).difference(annotated_file_names)

Checking the size distribution of boundary boxes. 

In [None]:
%%time
# convert column '[x, y, width, height]' to the separate pandas dataframe
splitted_data = train_df['bbox'].str.split(r'[^\d.]+')
bbox_df = pd.DataFrame.from_dict(dict(zip(splitted_data.index,splitted_data))).T
bbox_df.drop(columns = [0, 5], inplace = True) #drop empty columns
bbox_df.columns = ['x', 'y', 'bwidth', 'bheight']
bbox_df = bbox_df.astype(float)
bbox_df['size'] = bbox_df['bwidth'] * bbox_df['bheight']

In [None]:
# Attach the parsed bbox columns (aligned on the row index). Columns left over
# from a previous run of this cell are dropped first, so re-executing the cell
# does not fail on overlapping column names.
train_df = train_df.drop(columns=bbox_df.columns, errors='ignore').join(bbox_df)
train_df.head()

In [None]:
train_df['size'].describe()

We definitely have some anomalies here

In [None]:
# Largest bbox area per image. The column is selected before aggregating so
# only 'size' is reduced, not every column of the frame.
max_bbox_size = train_df.groupby('image_id')['size'].max().to_frame()

# In wide-form input the plotted values are exposed under the 'value' key.
fig = px.histogram(max_bbox_size, title = 'Max distribution of bboxes area size', 
                   labels={'value':'bbox area size'})
fig.show()

In [None]:
# bboxes with an area above this value are treated as anomalously big
MAX_ANOMALY_THRESHOLD = 120000
too_big_mask = train_df['size'] > MAX_ANOMALY_THRESHOLD
print('Size of anomaly frame: ', train_df[too_big_mask].shape)

In [None]:
plot_samples(train_df[train_df['size'] > MAX_ANOMALY_THRESHOLD], 
             threshold=15, 
             title='Images with anomaly big bboxes')

In [None]:
# Smallest bbox area per image. The column is selected before aggregating so
# only 'size' is reduced, not every column of the frame.
min_bbox_size = train_df.groupby('image_id')['size'].min().to_frame()

# In wide-form input the plotted values are exposed under the 'value' key.
fig = px.histogram(min_bbox_size, title = 'Min distribution of bboxes area size', 
                   labels={'value':'bbox area size'})
fig.show()

In [None]:
# bboxes with an area below this value are treated as anomalously small
MIN_ANOMALY_THRESHOLD = 1000
too_small_mask = train_df['size'] < MIN_ANOMALY_THRESHOLD
print('Size of anomaly frame: ', train_df[too_small_mask].shape)

In [None]:
plot_samples(train_df[train_df['size'] < MIN_ANOMALY_THRESHOLD], 
             threshold=9, 
             title = 'Images with anomaly small bboxes')

Thus, only bboxes of excessive size are dangerous anomalies in the dataset. There are few of them and they can be removed without damaging the generality of the model.<br>
Ultra-small boundary boxes can be explained either by very careful tracing of plants or by zooming the photos during the creation of the dataset. Either way, they will not hurt the model quality. <br>
This is a signal for us that we should use this crop trick during data augmentation as well.

In [None]:
# Share of each image covered by boxes: total bbox area divided by the
# 1024 x 1024 image area (overlapping boxes are counted more than once).
# 'size' is selected before summing so the string columns are not aggregated.
sum_bbox_areas = train_df.groupby('image_id')['size'].sum() / (1024 * 1024)

# In wide-form input the plotted values are exposed under the 'value' key.
fig = px.histogram(sum_bbox_areas, title = 'Percentage distribution of image box coverage', 
                   labels={'value':'bbox area size'})
fig.show()

And again, we got something similar to a normal distribution, as with the distribution of the number of bboxes.
We can expect a similar distribution on the validation and test datasets (if we find additional data).

In [None]:
# Shapiro-Wilk normality test on the per-image coverage values
coverage_statistic, coverage_p_value = shapiro(sum_bbox_areas.to_list())
print('Shapiro-Wilk test for normality result \n statistic:{:.3f}, p-value:{:.3E}'.format(coverage_statistic, coverage_p_value))

# Source distribution

According to the description of the competition and the transcription of acronyms from the dataset, the sources of the image are: 
* ```arvalis``` - ARVALIS, Institut du vegetal, France. 
* ```ethz``` - ETHZ, Swiss Federal Institute of Technology in Zurich, Switzerland
* ```rres``` - Rothamsted Research Institute, UK
* ```usask``` - University of Saskatchewan, Canada
* ```inrae``` - INRAE, National Research Institute for Agriculture, Food and Environment, France

In [None]:
# One row per (image, source) pair, so every image is counted once.
# drop_duplicates replaces the former groupby(...).mean(): averaging is not
# defined for the string columns, and newer pandas versions raise on it
# instead of silently skipping them.
image_sources = train_df[['image_id', 'source']].drop_duplicates()['source']
fig = px.histogram(image_sources, title = 'Image sources distribution')
# `labels` is keyed by column name and there are no 'x'/'y' columns here,
# so the axis titles are set explicitly.
fig.update_layout(xaxis_title_text='source name', yaxis_title_text='images amount')
fig.update_xaxes(categoryorder='total descending')
fig.show()

We will review the image samples from the each source below.

# Dataset samples

**Samples with the maximum amount of boundary boxes.**

In [None]:
# value_counts() sorts by frequency (descending), so the head holds the
# images with the most boxes
bbox_counts_per_image = train_df['image_id_ext'].value_counts()
max_bbox_ids = bbox_counts_per_image.head(6).index.to_list()
plot_samples(train_df, img_ids=max_bbox_ids, title='Samples with the maximum amount of boundary boxes')

Boundary boxes look correct, but I will separately check their areas later. Some boxes and wheat heads overlap, which can cause false positive errors. 

**Samples with the minimum amount of boundary boxes.**

In [None]:
min_bbox_ids = train_df['image_id_ext'].value_counts(ascending=True).index.to_list()
plot_samples(train_df, img_ids=min_bbox_ids[:6], title='Samples with the minimum amount of boundary boxes')

Looks like the second and the third photos belong to the same series of shots. <br>
You can also see that the unmarked photos are pictures of the ground that has been trampled. Let's dive a little deeper into this problem: maybe a significant part of the dataset is affected?

In [None]:
# Which sources do the 100 images with the fewest boxes come from?
# (fixes the "photost" typo in the printed message)
print('Sources of top 100 photos with the least amount of bboxes: ', 
      train_df[train_df['image_id_ext'].isin(min_bbox_ids[:100])]['source'].unique())

In [None]:
# Mean bbox area per image for the arvalis_3 source. 'size' is selected before
# averaging: a mean over the whole frame would also hit the string columns,
# which newer pandas versions reject instead of silently skipping.
# NOTE(review): the plotted value is the mean box area in pixels, not a
# coverage share as the title suggests — confirm which one is intended.
fig = px.histogram(train_df[train_df['source'] == 'arvalis_3'].groupby('image_id')['size'].mean(), 
                   title = 'Percentage distribution of image box coverage in arvalis_3 dataset')
fig.show()

Yes, there are some problem images, but overall this source behaves the same way as the whole dataset.

**Samples from different sources**

In [None]:
sources_list = list(train_df['source'].unique())
print('Complete list of sources: \n', sources_list)

In [None]:
plot_samples(train_df[train_df['source'] == 'arvalis_1'], 
             title='Source: ARVALIS (Institut du vegetal is an applied agricultural research)')

In [None]:
# Give the figure a title so it can be told apart from the other ARVALIS subsets
plot_samples(train_df[train_df['source'] == 'arvalis_2'],
             title='Source: ARVALIS (subset arvalis_2)')

In [None]:
# Give the figure a title so it can be told apart from the other ARVALIS subsets
plot_samples(train_df[train_df['source'] == 'arvalis_3'],
             title='Source: ARVALIS (subset arvalis_3)')

In [None]:
plot_samples(train_df[train_df['source'] == 'inrae_1'],
             title='Source: INRAE (National Research Institute for Agriculture, Food and Environment)')

In [None]:
plot_samples(train_df[train_df['source'] == 'ethz_1'],
             title='Source: ETHZ (Swiss Federal Institute of Technology in Zurich)')


In [None]:
# Samples from the rres_1 source (title typo "Soure" fixed)
plot_samples(train_df[train_df['source'] == 'rres_1'],
             title='Source: Rothamsted Research Institute')

In [None]:
plot_samples(train_df[train_df['source'] == 'usask_1'],
             title='Source: University of Saskatchewan')

It can be seen that different plant species are typical for different sources. Moreover, the photos were taken at different times of the day, from different angles and at different stages of harvest ripening.<br>
Perhaps you should add a separate classifier to the model to determine the type of plants and train the appropriate model for it.

**Empty images without boundary boxes**

In [None]:
# A set has no stable order, so slicing list(set) can show different images on
# every run; sorting first makes the selection reproducible.
plot_samples(train_df, img_ids=sorted(no_bbox_img_ids)[:6], 
             title='Examples of empty images without boundary boxes')

All of these images (all 49, I mean) are very dark and can be characterized as empty ground. 

# Color histograms

In view of the large difference in the quality of images of the dataset, consider the distribution of their brightness, perhaps there will be anomalies or patterns.

In [None]:
%%time
#Calculate mean brightness
ser = train_df.groupby(['image_id_ext']).mean().reset_index()['image_id_ext']
mean_brightness = ser.apply(get_image_brightness)
# Add results to train_df
bright_df = pd.DataFrame({'image_id_ext': ser, 'mean brightness': mean_brightness})
train_df = train_df.merge(bright_df, on='image_id_ext')

In [None]:
# Brightness (min - max)
fig = px.histogram(mean_brightness, title = 'Mean brightness distribution')
fig.show()

Well, we can see a bimodal distribution. There are 2 large groups of images with similar brightness in the dataset.

In [None]:
# File names of the images inside each of the two brightness bands
# (both bounds inclusive)
brightness_values = train_df['mean brightness']
first_group_id = train_df.loc[brightness_values.between(76, 78), 'image_id_ext'].unique()
second_group_id = train_df.loc[brightness_values.between(106, 108), 'image_id_ext'].unique()

In [None]:
plot_samples(train_df, img_ids=first_group_id[:6], 
             title='Examples of images with brightness [76 - 78]')

In [None]:
plot_samples(train_df, img_ids=second_group_id[:6], 
             title='Examples of images with brightness [106 - 108]')


In [None]:
# File names ordered from the brightest to the darkest image. The column is
# selected before averaging: a mean over the whole frame would also hit the
# string columns, which newer pandas versions reject instead of silently skipping.
sorted_bright_images = train_df.groupby('image_id_ext')['mean brightness'].mean()\
                       .sort_values(ascending=False).index.to_list()

In [None]:
plot_samples(train_df, img_ids=sorted_bright_images[:6], 
             title='The most bright images')

In [None]:
plot_samples(train_df, img_ids=sorted_bright_images[:-7:-1], 
             title='The most dark images')

Different brightness corresponds to different datasets and different plant species.

The idea to look at color distributions was taken from [here](https://www.kaggle.com/aleksandradeis/globalwheatdetection-eda).<br>
The point is that plants at different maturity levels have different amounts of saturated green pixels. In this case we may find a link between the color and the size of wheat heads. On the other hand, empty pictures would have predominantly gray pixels. 

In [None]:
%%time
#Calculate color percentage 
ser = train_df.groupby(['image_id_ext']).mean().reset_index()['image_id_ext']
green_percentage = ser.apply(get_percentage_of_green_pixels)
yellow_percentage = ser.apply(get_percentage_of_yellow_pixels)
# Add results to train_df
colors_df = pd.DataFrame({'image_id_ext': ser, 'green %': green_percentage, 'yellow %': yellow_percentage})
train_df = train_df.merge(colors_df, on='image_id_ext')

In [None]:
gc.collect()

In [None]:
# Per-image colour shares. Each column is selected before averaging: a mean
# over the whole frame would also hit the string columns, which newer pandas
# versions reject instead of silently skipping.
yellow_means = train_df.groupby('image_id')['yellow %'].mean()
green_means = train_df.groupby('image_id')['green %'].mean()

In [None]:
# Two semi-transparent histograms (yellow and green shares) on one figure;
# barmode is left at its default.
fig = go.Figure(data=[
    go.Histogram(x=yellow_means, marker_color='#eeff00', opacity=0.7),
    go.Histogram(x=green_means, marker_color='#55ff00', opacity=0.7),
])
fig.update_layout(
    title_text='Yellow and green percentage distribution',
    xaxis_title_text='% of colored pixels',
    yaxis_title_text='Count',
)

fig.show()

In [None]:
# File names ordered from the highest to the lowest colour share. Each column
# is selected before averaging: a mean over the whole frame would also hit the
# string columns, which newer pandas versions reject instead of silently skipping.
sorted_yellow_images = train_df.groupby('image_id_ext')['yellow %'].mean()\
                       .sort_values(ascending=False).index.to_list()
sorted_green_images = train_df.groupby('image_id_ext')['green %'].mean()\
                       .sort_values(ascending=False).index.to_list()


In [None]:
plot_samples(train_df, img_ids=sorted_yellow_images[:3], 
             title='The most yellow images')

In [None]:
plot_samples(train_df, img_ids=sorted_yellow_images[:-4:-1], 
             title='The least yellow images')

In [None]:
plot_samples(train_df, img_ids=sorted_green_images[:3], 
             title='The most green images')

In [None]:
plot_samples(train_df, img_ids=sorted_green_images[:-4:-1], 
             title='The least green images')

# Conclusions

*Dataset*

* The images of plants in the dataset vary greatly in brightness and number of boxes. What's more, the biological species differ just as much.
* There is a large number of targets, and boundary boxes overlap each other. This increases the risk of false negative errors.
* The distributions of the number of targets and of the box area relative to the image look close to normal. This can be used for model quality control during validation.
* The training dataset is relatively small for this competition, so data augmentation will be a critical part.


*Data Augmentation*

What might work:
* Flipping images horizontally and vertically
* Crop-resize
* Gamma, contrast and brightness tuning

### Acknowledgments
Notebooks that were useful for this research:
* [GlobalWheatDetection EDA](https://www.kaggle.com/aleksandradeis/globalwheatdetection-eda)
* [GWD: EDA + Starter Code](https://www.kaggle.com/yashchoudhary/gwd-eda-and-starter-code-beginner-friendly)