In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from tqdm import tqdm
import ast
import cv2
from PIL import Image
import matplotlib.patches as patches
import albumentations as albu
from albumentations.pytorch.transforms import ToTensor
from albumentations.core.transforms_interface import DualTransform
from albumentations.augmentations.bbox_utils import denormalize_bbox, normalize_bbox

In [None]:
# Directories that hold the training and test images
TRAIN_DIR = '../input/global-wheat-detection/train/'
TEST_DIR = '../input/global-wheat-detection/test/'
# List every entry found directly inside each image directory
train_fns = glob(f'{TRAIN_DIR}*')
test_fns = glob(f'{TEST_DIR}*')

In [None]:
# Load the bounding-box annotation table for the training images.
# NOTE(review): this is an absolute Kaggle path while TRAIN_DIR/TEST_DIR are
# relative ('../input/...') — confirm both resolve to the same dataset copy.
train_data = pd.read_csv("/kaggle/input/global-wheat-detection/train.csv")

In [None]:
train_data.sample(5)

In [None]:
train_data.shape

In [None]:
# How many unique images?
len(train_data["image_id"].unique())

In [None]:
# Total number of images in the training directory
print('Number of train images is {}'.format(len(train_fns)))

In [None]:
print('Number of images without heads are: {}'.format(len(train_fns)- len(train_data["image_id"].unique())))

This means that 3422 - 3373 = 49 images do not have any annotations.

In [None]:
# stats about data
train_data.describe()

In [None]:
# Are there any rows whose image width or height differs from 1024?
size_filters = [
    train_data['width'] > 1024,
    train_data['width'] < 1024,
    train_data['height'] > 1024,
    train_data['height'] < 1024,
]
for position, size_filter in enumerate(size_filters):
    # A separator line goes between consecutive tables, not before the first one.
    if position > 0:
        print("--------------")
    print(train_data[size_filter])

In [None]:
def count_missing_data(data_df):
    """Summarise missing values per column of `data_df`.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (number of null cells) and 'Percent' (null cells as a
    percentage of all rows), each sorted in descending order.
    """
    null_mask = data_df.isnull()
    missing_total = null_mask.sum().sort_values(ascending=False)
    missing_percent = (
        null_mask.sum()
        .div(null_mask.count())
        .mul(100)
        .sort_values(ascending=False)
    )
    return pd.concat({'Total': missing_total, 'Percent': missing_percent}, axis=1)

In [None]:
count_missing_data(train_data)

In [None]:
# Distinct values of the `source` column
sources = train_data['source'].unique()
print(f"There are {len(sources)} unique sources of data: {sources}")

In [None]:
# How many images from each sources
train_data['source'].value_counts()

In [None]:
# Bar chart of the number of annotation rows per source, labelled with share and count
fig, ax = plt.subplots(1, 1, figsize=(11, 5))
# Name the column explicitly: seaborn >= 0.12 treats the first positional
# argument as `data`, so passing the Series positionally is not portable.
sns.countplot(x='source', data=train_data,
              order=train_data['source'].value_counts().index,
              palette='Set3', ax=ax)

total = float(len(train_data))
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 3,
            # Bar heights come back as floats and '{:d}' rejects floats,
            # so the count is cast to int before formatting.
            '{:1.2f}% ({:d})'.format(100*height/total, int(height)),
            ha="center")
plt.show()

In [None]:
# Create a dataframe with one row per file in the training directory.
# The image id is the file name without its extension; rsplit('.', 1) strips
# whatever extension is present instead of assuming it is exactly 4 characters.
# Building the frame from a dict keeps the 'image_id' column even when the
# directory listing is empty.
train_images_df = pd.DataFrame({
    'image_id': [fns.split('/')[-1].rsplit('.', 1)[0] for fns in glob(TRAIN_DIR + '*')]
})

# Left-merge with the bounding-box dataframe so images without annotations are kept
train_images_df = train_images_df.merge(train_data, on='image_id', how='left')

# Images without annotations get a placeholder all-zero box
train_images_df['bbox'] = train_images_df['bbox'].fillna('[0,0,0,0]')

In [None]:

# Parse every bbox string once, then split the parsed lists into one list per coordinate
parsed_bboxes = [ast.literal_eval(bbox_text) for bbox_text in tqdm(train_images_df["bbox"])]
bbox_xmin = [coordinates[0] for coordinates in parsed_bboxes]
bbox_ymin = [coordinates[1] for coordinates in parsed_bboxes]
bbox_width = [coordinates[2] for coordinates in parsed_bboxes]
bbox_height = [coordinates[3] for coordinates in parsed_bboxes]

In [None]:
len(bbox_xmin), len(bbox_ymin), len(bbox_width), len(bbox_height)

In [None]:

# Attach the parsed coordinates as separate columns
parsed_columns = {
    "bbox_xmin": bbox_xmin,
    "bbox_ymin": bbox_ymin,
    "bbox_width": bbox_width,
    "bbox_height": bbox_height,
}
for column_name, column_values in parsed_columns.items():
    train_images_df[column_name] = column_values
train_images_df.head()

In [None]:
# Locating the wheat heads: centre of every bounding box

# centre = top-left corner + half of the box extent along that axis.
# (The box's own `bbox_height` is used here, not the image-level `height` column.)
train_images_df['x_center'] = train_images_df['bbox_xmin'] + train_images_df['bbox_width'] / 2
train_images_df['y_center'] = train_images_df['bbox_ymin'] + train_images_df['bbox_height'] / 2
# Plot x and y centers; the columns are named by keyword because newer seaborn
# releases do not accept them positionally.
sns.jointplot(x="x_center", y="y_center", kind="kde", data=train_images_df, height=9, alpha=0.5)
plt.suptitle('Wheat location')

In [None]:
## Aspect ratio and area of the bounding boxes in the sample
# Derive both per-box features first; the plots below only read them.
box_width = train_images_df['bbox_width']
box_height = train_images_df['bbox_height']
train_images_df['aspect_ratio'] = box_width.div(box_height)
train_images_df['bbox_area'] = box_width.mul(box_height)

# Distribution of width / height
aspect_ratio_values = train_images_df['aspect_ratio'].dropna()
sns.distplot(aspect_ratio_values, norm_hist=True)
plt.title('Distribution plot: Aspect ratio of bounding boxes of images in the sample')
plt.show()

# Distribution of width * height
bbox_area_values = train_images_df['bbox_area'].dropna()
sns.distplot(bbox_area_values, norm_hist=True)
plt.title('Distribution plot: Area of bounding boxes of images in the sample')
plt.show()

# Aspect ratio plotted against area
sns.relplot(x='bbox_area', y='aspect_ratio', data=train_images_df, height=5, alpha=0.7, aspect=1.4)
plt.title('Aspect ratio and area of bounding boxes of images in the sample')
plt.show()

## Sample Images from the Dataset

In [None]:
# Show ten randomly drawn annotated training images in a 2 x 5 grid

sample_indices = np.random.choice(np.unique(train_data["image_id"].tolist()), 10)

fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))

# ravel() walks the grid row by row, matching the order of `sample_indices`
for position, axis in enumerate(ax.ravel()):
    img = plt.imread(TRAIN_DIR + sample_indices[position] + ".jpg")
    axis.grid(False)
    axis.set_xticks([])
    axis.set_yticks([])
    axis.imshow(img)
plt.show()

## Images with the Bounding Box 

In [None]:
def get_bbox(image_id, df, col, color='white'):
    """Draw every bounding box of `image_id` onto the axes `col`.

    `df` must provide the columns image_id, bbox_xmin, bbox_ymin,
    bbox_width and bbox_height. Each matching row becomes one unfilled
    rectangle outlined in `color`.
    """
    image_rows = df[df['image_id'] == image_id]
    box_columns = image_rows[['bbox_xmin', 'bbox_ymin', 'bbox_width', 'bbox_height']]

    for xmin, ymin, width, height in box_columns.itertuples(index=False, name=None):
        outline = patches.Rectangle(
            (xmin, ymin),
            width,
            height,
            linewidth=2,
            edgecolor=color,
            facecolor='none')
        col.add_patch(outline)

In [None]:
# Same ten sample images as above, now with their bounding boxes drawn in red
fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
for position, axis in enumerate(ax.ravel()):
    sample_id = sample_indices[position]
    img = plt.imread(TRAIN_DIR + sample_id + ".jpg")
    axis.grid(False)
    axis.set_xticks([])
    axis.set_yticks([])
    get_bbox(sample_id, train_images_df, axis, color='red')
    axis.imshow(img)
plt.show()

## Images with no bounding boxes

In [None]:
# Images with bounding box
images_with_bbox = train_data["image_id"].unique()


In [None]:
# `train_fns` holds file paths while `images_with_bbox` holds bare image ids, so a
# plain set difference between them removes nothing. Compare on the id extracted
# from each path (file name without extension) and keep the paths, since the
# plotting cell below reads the files directly.
annotated_ids = set(images_with_bbox)
images_without_bbox = [
    fn for fn in train_fns
    if fn.split('/')[-1].rsplit('.', 1)[0] not in annotated_ids
]

In [None]:
# Show the first ten entries of `images_without_bbox` in a 2 x 5 grid

fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))

for position, axis in enumerate(ax.ravel()):
    img = plt.imread(images_without_bbox[position])
    axis.grid(False)
    axis.set_xticks([])
    axis.set_yticks([])
    axis.imshow(img)
plt.show()

In [None]:
train_images_df.sample(5)

In [None]:
# This function will take dataframe and image_id for which we want bounding boxes and return list of x, y, w, h

def get_all_bboxes(dataframe, image_id):
  """Return the boxes of `image_id` as a list of (x, y, w, h) tuples.

  `dataframe` must provide the columns image_id, bbox_xmin, bbox_ymin,
  bbox_width and bbox_height. An id with no rows yields an empty list.
  """
  box_columns = ['bbox_xmin', 'bbox_ymin', 'bbox_width', 'bbox_height']
  matching_rows = dataframe.loc[dataframe.image_id == image_id, box_columns]
  return list(matching_rows.itertuples(index=False, name=None))
        
    
    

In [None]:
def plot_image_examples(dataframe, rows = 2, cols = 5, title = 'Image examples', size = (20, 10)):
  """Plot a rows x cols grid of randomly chosen images with their boxes in red.

  Each grid cell picks one random row of `dataframe`, loads that row's image
  from TRAIN_DIR and draws every box `get_all_bboxes` returns for the image.
  Because rows are sampled, the same image can appear more than once.

  Args:
    dataframe: frame with an `image_id` column plus the bbox columns read by
      `get_all_bboxes`.
    rows, cols: grid dimensions.
    title: figure-level title.
    size: figure size passed to matplotlib.
  """
  # Nothing to sample from: report it instead of failing inside randint
  # after an empty figure has already been created.
  if len(dataframe) == 0:
    print('No images to plot for: {}'.format(title))
    return

  # squeeze=False keeps `axs` two-dimensional even when rows == 1 or cols == 1
  fig, axs = plt.subplots(rows, cols, figsize=size, squeeze=False)
  for row in range(rows):
    for col in range(cols):
      axis = axs[row, col]
      idx = np.random.randint(len(dataframe), size = 1)[0]
      img_id = dataframe.iloc[idx].image_id

      img = Image.open(TRAIN_DIR + img_id + '.jpg')
      axis.imshow(img)

      for bbox in get_all_bboxes(dataframe, img_id):
        rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth=1, edgecolor='r', facecolor='none')
        axis.add_patch(rect)

      # Hide the axis frame once per cell, whether or not boxes were drawn
      axis.axis('off')
  plt.suptitle(title)

In [None]:
# Number of bounding boxes per train image.
# Rows that came from the annotation file have a non-null `width`; the rows
# added for unannotated images do not, so they count as 0.
train_images_df['count'] = train_images_df['width'].notna().astype(int)
# numeric_only=True keeps the aggregation to numeric columns. Since pandas 2.0
# the default would also try to "sum" the string columns (bbox, source).
train_images_df_count = train_images_df.groupby('image_id').sum(numeric_only=True).reset_index()

In [None]:
# Bottom-right corner of each box = top-left corner + box extent
train_images_df['bbox_xmax'] = train_images_df['bbox_xmin'].add(train_images_df['bbox_width'])
train_images_df['bbox_ymax'] = train_images_df['bbox_ymin'].add(train_images_df['bbox_height'])

In [None]:
# Images with fewer than 10 boxes
less_spikes_ids = train_images_df_count.loc[train_images_df_count['count'] < 10, 'image_id']
less_spikes_rows = train_images_df[train_images_df["image_id"].isin(less_spikes_ids)]
plot_image_examples(less_spikes_rows, title='Example images with small number of spikes')

In [None]:
# Images with more than 100 boxes
more_spikes_ids = train_images_df_count.loc[train_images_df_count['count'] > 100, 'image_id']
more_spikes_rows = train_images_df[train_images_df["image_id"].isin(more_spikes_ids)]
plot_image_examples(more_spikes_rows, title='Example images with more number of spikes')

In [None]:
# Example images whose source is usask_1
usask_1_images = train_images_df.loc[train_images_df['source'] == 'usask_1', 'image_id']
usask_1_rows = train_images_df[train_images_df['image_id'].isin(usask_1_images)]
plot_image_examples(usask_1_rows, title='Images with source usask_1')

In [None]:
# Example images whose source is arvalis_1
arvalis_1_images = train_images_df.loc[train_images_df['source'] == 'arvalis_1', 'image_id']
arvalis_1_rows = train_images_df[train_images_df['image_id'].isin(arvalis_1_images)]
plot_image_examples(arvalis_1_rows, title='Images with source arvalis_1')

In [None]:
# Example images whose source is inrae_1
inrae_1_images = train_images_df.loc[train_images_df['source'] == 'inrae_1', 'image_id']
inrae_1_rows = train_images_df[train_images_df['image_id'].isin(inrae_1_images)]
plot_image_examples(inrae_1_rows, title='Images with source inrae_1')

In [None]:
# Example images whose source is arvalis_3
arvalis_3_images = train_images_df.loc[train_images_df['source'] == 'arvalis_3', 'image_id']
arvalis_3_rows = train_images_df[train_images_df['image_id'].isin(arvalis_3_images)]
plot_image_examples(arvalis_3_rows, title='Images with source arvalis_3')


In [None]:
# Example images whose source is rres_1
rres_1_images = train_images_df.loc[train_images_df['source'] == 'rres_1', 'image_id']
rres_1_rows = train_images_df[train_images_df['image_id'].isin(rres_1_images)]
plot_image_examples(rres_1_rows, title='Images with source rres_1')


In [None]:
# Example images whose source is arvalis_2
arvalis_2_images = train_images_df.loc[train_images_df['source'] == 'arvalis_2', 'image_id']
arvalis_2_rows = train_images_df[train_images_df['image_id'].isin(arvalis_2_images)]
plot_image_examples(arvalis_2_rows, title='Images with source arvalis_2')

In [None]:
# Example images whose source is ethz_1
ethz_1_images = train_images_df.loc[train_images_df['source'] == 'ethz_1', 'image_id']
ethz_1_rows = train_images_df[train_images_df['image_id'].isin(ethz_1_images)]
plot_image_examples(ethz_1_rows, title='Images with source ethz_1')

In [None]:
train_images_df["bbox_area"].max()

In [None]:
# Images that contain at least one box with an area above 200000
large_boxes_ids = train_images_df.loc[train_images_df['bbox_area'] > 200000, 'image_id']
large_boxes_rows = train_images_df[train_images_df['image_id'].isin(large_boxes_ids)]
plot_image_examples(large_boxes_rows, title='Example images with large bbox area')

In [None]:
# Smallest strictly positive box area (the all-zero placeholder boxes are excluded)
positive_areas = train_images_df.loc[train_images_df['bbox_area'] > 0, 'bbox_area']
min_area = positive_areas.min()
print('The smallest bouding box area is {}'.format(min_area))

In [None]:
# Images that contain at least one box with an area strictly between 0 and 25
box_area = train_images_df['bbox_area']
is_tiny_box = (box_area < 25) & (box_area > 0)
small_boxes_ids = train_images_df.loc[is_tiny_box, 'image_id']
small_boxes_rows = train_images_df[train_images_df['image_id'].isin(small_boxes_ids)]
plot_image_examples(small_boxes_rows, title='Example images with small bbox area')

In [None]:
# Compute the total bounding-box area per image.
# numeric_only=True restricts the sum to numeric columns. Since pandas 2.0 the
# default would also try to "sum" the string columns (bbox, source).
area_per_image = train_images_df.groupby(by='image_id').sum(numeric_only=True).reset_index()

# Express the summed box area as a percentage of a 1024 x 1024 image.
# Overlapping boxes are counted more than once, so the value can exceed 100.
area_per_image_percentage = area_per_image.copy()
area_per_image_percentage['bbox_area'] = area_per_image_percentage['bbox_area'] / (1024*1024) * 100

In [None]:
# Images whose boxes sum to less than 7% of the image area
is_sparse = area_per_image_percentage['bbox_area'] < 7
small_area_perc_ids = area_per_image_percentage.loc[is_sparse, 'image_id']
small_area_rows = train_images_df[train_images_df['image_id'].isin(small_area_perc_ids)]
plot_image_examples(small_area_rows, title='Example images with small percentage of area covered by bounding boxes')

In [None]:
# Images whose boxes sum to more than 95% of the image area
is_dense = area_per_image_percentage['bbox_area'] > 95
large_area_perc_ids = area_per_image_percentage.loc[is_dense, 'image_id']
large_area_rows = train_images_df[train_images_df['image_id'].isin(large_area_perc_ids)]
plot_image_examples(large_area_rows, title='Example images with large percentage of area covered by bounding boxes')

In [None]:
def get_image_brightness(image):
    """Return the mean grayscale intensity of `image`.

    The image is converted with cv2.COLOR_BGR2GRAY, i.e. it is treated as
    being in BGR channel order.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return np.asarray(grayscale).mean()

def add_brightness(df):
    """Compute the brightness of every image listed in `df`.

    Args:
        df: frame with an `image_id` column; each id is read from
            TRAIN_DIR + image_id + '.jpg'.

    Returns:
        A new DataFrame with columns ['image_id', 'brightness'], one row per
        row of `df`, in the same order and with a fresh 0..n-1 index.
    """
    brightness = [
        get_image_brightness(cv2.imread(TRAIN_DIR + img_id + '.jpg'))
        for img_id in df['image_id']
    ]

    # Build the result from positional values rather than concatenating on the
    # index, so a non-default index or extra columns in `df` cannot misalign
    # ids and values or break the column naming.
    return pd.DataFrame({'image_id': df['image_id'].to_numpy(),
                         'brightness': brightness})

In [None]:
# One row per distinct image id
images_df = pd.DataFrame(train_images_df['image_id'].unique(), columns=['image_id'])

In [None]:
# Add brightness to the dataframe.
# `images_df` (one row per distinct image_id) is built in the preceding cell.
brightness_df = add_brightness(images_df)

# Drop any `brightness` column left by an earlier run of this cell so that
# re-running it does not create brightness_x / brightness_y columns.
train_images_df = (
    train_images_df
    .drop(columns=['brightness'], errors='ignore')
    .merge(brightness_df, on='image_id')
)

In [None]:
# Images with a mean brightness below 30
dark_ids = train_images_df.loc[train_images_df['brightness'] < 30, 'image_id']
dark_rows = train_images_df[train_images_df['image_id'].isin(dark_ids)]
plot_image_examples(dark_rows, title="darkest images")

In [None]:
# Images with a mean brightness above 130
bright_ids = train_images_df.loc[train_images_df['brightness'] > 130, 'image_id']
bright_rows = train_images_df[train_images_df['image_id'].isin(bright_ids)]
plot_image_examples(bright_rows, title='Brightest images')

In [None]:
def get_percentage_of_green_pixels(image):
  """Return the share of pixels in `image` that fall inside the green HSV range.

  Despite the name, the result is a fraction between 0.0 and 1.0, not a value
  out of 100. `image` is converted with cv2.COLOR_BGR2HSV, i.e. it is treated
  as being in BGR channel order.
  """
  # convert to HSV
  hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

  # get the green mask
  hsv_lower = (40, 40, 40)
  hsv_higher = (70, 255, 255)
  green_mask = cv2.inRange(hsv, hsv_lower, hsv_higher)

  # Divide by the mask's own pixel count rather than a hard-coded 1024 * 1024,
  # so the fraction is also right for images of another size.
  return float(np.count_nonzero(green_mask)) / green_mask.size

In [None]:
def get_percentage_of_yellow_pixels(image):
  """Return the share of pixels in `image` that fall inside the yellow HSV range.

  Despite the name, the result is a fraction between 0.0 and 1.0, not a value
  out of 100. `image` is converted with cv2.COLOR_BGR2HSV, i.e. it is treated
  as being in BGR channel order.
  """
  # convert to HSV
  hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

  # get the yellow mask
  hsv_lower = (25, 40, 40)
  hsv_higher = (35, 255, 255)
  yellow_mask = cv2.inRange(hsv, hsv_lower, hsv_higher)

  # Divide by the mask's own pixel count rather than a hard-coded 1024 * 1024,
  # so the fraction is also right for images of another size.
  return float(np.count_nonzero(yellow_mask)) / yellow_mask.size

In [None]:
def add_green_pixels_percentage(df):
  """Compute the green-pixel share of every image listed in `df`.

  Args:
    df: frame with an `image_id` column; each id is read from
      TRAIN_DIR + image_id + '.jpg'.

  Returns:
    A new DataFrame with columns ['image_id', 'green_pixels'], one row per
    row of `df`, in the same order and with a fresh 0..n-1 index.
  """
  green = [
      get_percentage_of_green_pixels(cv2.imread(TRAIN_DIR + img_id + '.jpg'))
      for img_id in df['image_id']
  ]

  # Build the result from positional values rather than concatenating on the
  # index, so a non-default index or extra columns in `df` cannot misalign
  # ids and values or break the column naming.
  return pd.DataFrame({'image_id': df['image_id'].to_numpy(),
                       'green_pixels': green})

In [None]:
def add_yellow_pixels_percentage(df):
  """Compute the yellow-pixel share of every image listed in `df`.

  Args:
    df: frame with an `image_id` column; each id is read from
      TRAIN_DIR + image_id + '.jpg'.

  Returns:
    A new DataFrame with columns ['image_id', 'yellow_pixels'], one row per
    row of `df`, in the same order and with a fresh 0..n-1 index.
  """
  yellow = [
      get_percentage_of_yellow_pixels(cv2.imread(TRAIN_DIR + img_id + '.jpg'))
      for img_id in df['image_id']
  ]

  # Build the result from positional values rather than concatenating on the
  # index, so a non-default index or extra columns in `df` cannot misalign
  # ids and values or break the column naming.
  return pd.DataFrame({'image_id': df['image_id'].to_numpy(),
                       'yellow_pixels': yellow})

In [None]:
# Add a column with the share of green pixels per image
green_pixels_df = add_green_pixels_percentage(images_df)
# Drop any `green_pixels` column left by an earlier run of this cell so that
# re-running it does not create green_pixels_x / green_pixels_y columns.
train_images_df = (
    train_images_df
    .drop(columns=['green_pixels'], errors='ignore')
    .merge(green_pixels_df, on='image_id')
)

In [None]:
# Images in which more than 0.55 of the pixels fall in the green range
green_ids = train_images_df.loc[train_images_df['green_pixels'] > 0.55, 'image_id']
green_rows = train_images_df[train_images_df['image_id'].isin(green_ids)]
plot_image_examples(green_rows, title='The most green images')

In [None]:
# Add a column with the share of yellow pixels per image
yellow_pixels_df = add_yellow_pixels_percentage(images_df)
# Drop any `yellow_pixels` column left by an earlier run of this cell so that
# re-running it does not create yellow_pixels_x / yellow_pixels_y columns.
train_images_df = (
    train_images_df
    .drop(columns=['yellow_pixels'], errors='ignore')
    .merge(yellow_pixels_df, on='image_id')
)

In [None]:
# Images in which more than 0.55 of the pixels fall in the yellow range
yellow_ids = train_images_df.loc[train_images_df['yellow_pixels'] > 0.55, 'image_id']
yellow_rows = train_images_df[train_images_df['image_id'].isin(yellow_ids)]
plot_image_examples(yellow_rows, title='The most yellow images')

In [None]:
# Example augmentation pipeline: bbox-safe crop to 512 x 512, random flips,
# one random photometric change, then CLAHE.
example_transforms = albu.Compose([
    albu.RandomSizedBBoxSafeCrop(512, 512, erosion_rate=0.0, interpolation=1, p=1.0),
    albu.HorizontalFlip(p=0.5),
    albu.VerticalFlip(p=0.5),
    albu.OneOf([albu.RandomContrast(),
                albu.RandomGamma(),
                albu.RandomBrightness()], p=1.0),
    albu.CLAHE(p=1.0)],
    # The boxes passed to this pipeline are [x_min, y_min, width, height] in
    # pixels (COCO layout), with their labels under the 'category_id' key.
    # Without bbox_params the pipeline is not told the box format.
    bbox_params=albu.BboxParams(format='coco', label_fields=['category_id']),
    p=1.0)

In [None]:
def _draw_coco_boxes(axis, bboxes):
  """Draw each [x, y, width, height] box in `bboxes` on `axis` as a red outline."""
  for bbox in bboxes:
    rect = patches.Rectangle((bbox[0],bbox[1]),bbox[2],bbox[3],linewidth=1,edgecolor='r',facecolor='none')
    axis.add_patch(rect)


def apply_transforms(transforms, df, n_transforms=3):
  """Show one random image next to `n_transforms` augmented versions of it.

  Args:
    transforms: callable invoked as transforms(image=..., bboxes=...,
      category_id=...) that returns a mapping with 'image' and 'bboxes'.
    df: frame with image_id, bbox_xmin, bbox_ymin, bbox_width and bbox_height
      columns; one random row selects the image.
    n_transforms: number of augmented panels drawn after the original.
  """
  idx = np.random.randint(len(df), size=1)[0]

  image_id = df.iloc[idx].image_id
  # Skip zero-sized boxes: they are the all-zero placeholders of unannotated
  # images, not real annotations, and must not reach the augmenter.
  bboxes = [
      [row.bbox_xmin, row.bbox_ymin, row.bbox_width, row.bbox_height]
      for _, row in df[df.image_id == image_id].iterrows()
      if row.bbox_width > 0 and row.bbox_height > 0
  ]

  image = Image.open(TRAIN_DIR + image_id + '.jpg')
  # The pixel array is the same for every augmentation pass, so build it once
  image_array = np.asarray(image)

  # squeeze=False keeps the axes indexable even when n_transforms == 0
  fig, axs = plt.subplots(1, n_transforms+1, figsize=(15,7), squeeze=False)
  axs = axs[0]

  # plot the original image
  axs[0].imshow(image)
  axs[0].set_title('original')
  _draw_coco_boxes(axs[0], bboxes)

  # apply transforms n_transforms times
  for i in range(n_transforms):
    params = {'image': image_array,
              'bboxes': bboxes,
              'category_id': [1 for j in range(len(bboxes))]}
    augmented_boxes = transforms(**params)
    bboxes_aug = augmented_boxes['bboxes']
    image_aug = augmented_boxes['image']

    # plot the augmented image and augmented bounding boxes
    axs[i+1].imshow(image_aug)
    axs[i+1].set_title('augmented_' + str(i+1))
    _draw_coco_boxes(axs[i+1], bboxes_aug)
  plt.show()

In [None]:
apply_transforms(example_transforms, train_images_df, n_transforms=3)