# Global Wheat Competitions EDA


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk every file under the Kaggle input directory. The joined path is built
# but neither printed nor stored, so this loop produces no visible output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        os.path.join(dirname, filename)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# General Dataset Information
The dataset can help farmers understand how their crops are growing and how close they are to harvest.


Step 1: Importing Libraries and Datasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
import os
from PIL import Image
from matplotlib import patches


from bokeh.models import ColumnDataSource, HoverTool, Panel
from bokeh.models.widgets import Tabs
from bokeh.plotting import figure
from bokeh.io import output_notebook, show, output_file
%matplotlib inline
import cv2
from bokeh.resources import INLINE
import bokeh.io
bokeh.io.output_notebook(INLINE) 

In [None]:
# Dataset locations. Note that train_dir is an absolute path while test_dir and
# train_csv_path are relative to the current working directory.
# train_dir and test_dir end with '/' because later code appends file names
# and glob patterns to them by plain string concatenation.
train_dir ='/kaggle/input/global-wheat-detection/train/'
test_dir = '../input/global-wheat-detection/test/'
train_csv_path = '../input/global-wheat-detection/train.csv' 

In [None]:
from bokeh.resources import INLINE
import bokeh.io

bokeh.io.output_notebook(INLINE)

In [None]:
# Load the bounding-box annotations.
train = pd.read_csv(train_csv_path)
train.head()

# List every file in the train and test image folders.
train_images, test_images = (glob(folder + '*') for folder in (train_dir, test_dir))
print("The images in train images are ",len(train_images))
print("The images in test images are ",len(test_images))

There are very few images for training and testing, so we will need data augmentation.

In [None]:
# Preview the first rows of the annotation table read from train.csv.
train.head()

Step 2: The dataframe holds one record per patch (bounding box), so a single image is spread over many rows. We have to bring all the patches of an image together.

In [None]:
# Derive each image id from its file name (last path component with the final
# 4 characters, i.e. the extension, removed), then attach the annotations.
# The left join keeps image files that have no rows in `train`.
all_train_images = pd.DataFrame([path.split('/')[-1][:-4] for path in train_images])
all_train_images.columns = ['image_id']
all_train_images = pd.merge(all_train_images, train, on='image_id', how='left')

In [None]:
# Preview the merged table (one row per image file / bounding box pair).
all_train_images.head()

In [None]:
# Rows without an annotation get an all-zero box so they parse like the rest.
all_train_images['bbox'] = all_train_images['bbox'].fillna('[0,0,0,0]')

# Split the "[xmin, ymin, width, height]" text into its four pieces and turn
# each piece into a float column, stripping the listed characters first.
bbox_items = all_train_images['bbox'].str.split(',',expand = True)
for _piece, (_target, _strip_chars) in enumerate([
    ('bbox_xmin', '['),
    ('bbox_ymin', ' '),
    ('bbox_width', ' '),
    ('bbox_height', ']'),
]):
    all_train_images[_target] = bbox_items[_piece].str.strip(_strip_chars).astype(float)


The DataFrame now contains the image id along with the grain patch x and y co-ordinate along with its width and height

In [None]:
# Display the table, now including the parsed bbox_* columns.
all_train_images

In [None]:
# Rows added by the left merge relative to train.csv, reported as the number
# of image files that have no annotated wheat heads.
print("Images without heads is",len(all_train_images)-len(train))

Step 3: Plot the images along with the patches to find the amount of grain in it.


We create two functions: one to get the coordinates of the patches, and one to draw the wheat image together with its grain patches.

In [None]:
def get_all_boxes(df,image_id):
    """Return the boxes recorded for one image.

    Selects the rows of `df` whose `image_id` column equals `image_id` and
    returns a list of (bbox_xmin, bbox_ymin, bbox_width, bbox_height) tuples,
    one per row, in row order. The list is empty when nothing matches.
    """
    matching = df[df.image_id == image_id]
    return [
        (record.bbox_xmin, record.bbox_ymin, record.bbox_width, record.bbox_height)
        for _, record in matching.iterrows()
    ]

def plot_image_examples(df,rows= 3,columns=3,title ='Image Examples'):
    """Plot a rows x columns grid of randomly chosen images with their boxes.

    For every grid cell a random row of `df` is drawn, the matching
    `<train_dir><image_id>.jpg` file is shown, and every box returned by
    `get_all_boxes(df, image_id)` is outlined in red.

    Parameters
    ----------
    df : DataFrame with `image_id` and the `bbox_*` columns.
    rows, columns : grid dimensions.
    title : figure title.

    Raises
    ------
    ValueError
        If `df` has no rows (there is nothing to sample from).
    """
    if len(df) == 0:
        raise ValueError('cannot plot image examples from an empty DataFrame')

    # squeeze=False keeps axs two-dimensional even when rows or columns is 1.
    fig,axs = plt.subplots(rows,columns,figsize=(10,10),squeeze=False)

    # axs.flat walks the grid row by row, like the nested row/column loops.
    for ax in axs.flat:
        idx = np.random.randint(len(df),size=1)[0]
        img_id = df.iloc[idx].image_id

        # The context manager closes the file once the image has been drawn.
        with Image.open(train_dir + img_id + '.jpg') as img:
            ax.imshow(img)

        for bbox in get_all_boxes(df,img_id):
            # bbox is (xmin, ymin, width, height): the rectangle height must be
            # bbox[3], not the width repeated.
            rect = patches.Rectangle((bbox[0],bbox[1]),bbox[2],bbox[3],edgecolor='r',linewidth=1,facecolor='none')
            ax.add_patch(rect)

        ax.axis('off')

    plt.suptitle(title)
            


In [None]:
# Show a grid of random training images with their boxes (default 3x3 grid).
plot_image_examples(all_train_images)

We can see images taken under different lighting conditions and at different maturity stages.


In [None]:
# Tabulate how often each value of the `width` column occurs.
all_train_images['width'].value_counts()

Step 4: We explore different features of the dataset through visualization

Count numbers of bounding boxes

In [None]:
# A row counts as one box when its `width` is finite; rows that came only from
# the left merge (images without annotations) have NaN there and count as 0.
# Vectorized replacement for the former row-by-row apply.
all_train_images['counts'] = np.isfinite(all_train_images['width']).astype(int)

# numeric_only=True sums only the numeric columns. Without it, pandas >= 2.0
# also "sums" the text columns, i.e. concatenates every string per image.
train_images_count = all_train_images.groupby('image_id').sum(numeric_only=True).reset_index()

In [None]:
# Display the per-image sums; `counts` is the number of boxes per image.
train_images_count

In [None]:
# See this article on how to plot bar charts with Bokeh:
# https://towardsdatascience.com/interactive-histograms-with-bokeh-202b522265f3
def hist_hover(dataframe, column, colors=("#94c8d8", "#ea5e51"), bins=30, title=''):
    """Show an interactive Bokeh histogram of one DataFrame column.

    Parameters
    ----------
    dataframe : DataFrame holding the data.
    column : name of the column to histogram.
    colors : two-item sequence; colors[0] fills the bars and colors[1] is the
        fill used while a bar is hovered.
    bins : passed to `np.histogram`.
    title : plot title.

    The hover tooltip shows the interval of the bar and its count.
    """
    hist, edges = np.histogram(dataframe[column], bins = bins)

    hist_df = pd.DataFrame({column: hist,
                             "left": edges[:-1],
                             "right": edges[1:]})

    # Whole-number labels only make sense when every bin is at least one unit
    # wide; for fractional data (e.g. values in [0, 1]) "%d" would collapse
    # every label to "0 to 0", so switch to a short float format there.
    if np.min(np.diff(edges)) >= 1:
        label_format = "%d to %d"
    else:
        label_format = "%.3g to %.3g"
    hist_df["interval"] = [label_format % (left, right) for left,
                           right in zip(hist_df["left"], hist_df["right"])]

    src = ColumnDataSource(hist_df)
    plot = figure(plot_height = 400, plot_width = 600,
          title = title,
          x_axis_label = column,
          y_axis_label = "Count")
    plot.quad(bottom = 0, top = column,left = "left",
        right = "right", source = src, fill_color = colors[0],
        line_color = "#35838d", fill_alpha = 0.7,
        hover_fill_alpha = 0.7, hover_fill_color = colors[1])

    # The braces make the field reference valid for column names containing
    # spaces or other special characters as well.
    hover = HoverTool(tooltips = [('Interval', '@interval'),
                              ('Count', "@{" + column + "}")])
    plot.add_tools(hover)

    output_notebook()
    show(plot)

In [None]:
# The title must be passed by keyword: the third positional parameter of
# hist_hover is `colors`, so a positional string would be used as the colors.
hist_hover(train_images_count,'counts',title='Number of wheat spikes per image')

Looking at the plot, we find that most of the counts are in the range 20-65.

In [None]:
#Lets plot some image with less number of count
# Ids of the images that have fewer than 10 boxes.
less_spikes = train_images_count.loc[train_images_count['counts'] < 10, 'image_id']

In [None]:
# Show random examples restricted to the images listed in less_spikes.
plot_image_examples(all_train_images[all_train_images.image_id.isin(less_spikes)],title = 'Images with less spikes')

Observations:

Most of the examples show more ground than wheat

Most of them are zoomed in too much

In [None]:
#Plotting the images with highest spikes
# Ids of the images that have more than 100 boxes.
more_spikes = train_images_count.loc[train_images_count['counts'] > 100, 'image_id']

In [None]:
# Show random examples restricted to the images listed in more_spikes.
plot_image_examples(all_train_images[all_train_images.image_id.isin(more_spikes)],title= 'High number of Spikes')

As you can observe, the number of spikes is much higher

Now we will calculate the area of bounding boxes

In [None]:
# Area of each box: width times height, element-wise per row.
all_train_images['bbox_area'] = all_train_images['bbox_width'].mul(all_train_images['bbox_height'])

In [None]:
# Histogram of the per-box areas.
hist_hover(all_train_images,'bbox_area',title ='Area of a single bounding box')

In [None]:
#The max area of bounding box
# Largest single box area, found with Python's builtin max over the column.
max(all_train_images['bbox_area'])

The bbox areas are spread over a very wide range. Let's look at the largest bbox areas

In [None]:
# Image ids of every box whose area exceeds 200000 (an id repeats once per such box).
large_area = all_train_images.loc[all_train_images['bbox_area'] > 200000, 'image_id']

In [None]:
# Show random examples restricted to the images listed in large_area.
plot_image_examples(all_train_images[all_train_images.image_id.isin(large_area)],title = 'Large bbox area in a image')

As you can see, there are many anomalous outlier boxes in these images. They will cause problems during training, so it is better to remove them

Lets also check the images with small bbox area

In [None]:
# Image ids of every box whose area is below 50, then random examples of them.
small_area = all_train_images.loc[all_train_images['bbox_area'] < 50, 'image_id']
plot_image_examples(
    all_train_images[all_train_images['image_id'].isin(small_area)],
    title='Small bbox area in images',
)

In [None]:
# Per-image sums of the numeric columns; `bbox_area` becomes the total box
# area of the image. numeric_only=True keeps pandas >= 2.0 from also
# concatenating the text columns for every image.
area_per_image = all_train_images.groupby("image_id").sum(numeric_only=True).reset_index()

In [None]:
# Same table, with bbox_area expressed as a percentage of a 1024x1024 image.
# assign() returns a new frame, so area_per_image itself is left untouched.
area_per_image_percentage = area_per_image.assign(
    bbox_area=area_per_image['bbox_area']/(1024*1024)*100
)

In [None]:
# Preview the per-image sums (bbox_area in square pixels).
area_per_image.head()

In [None]:
# Preview the same table with bbox_area converted to a percentage.
area_per_image_percentage.head()

This shows the plot of how much percentage of image area is covered by bbox

In [None]:
# Histogram of the per-image coverage percentage.
hist_hover(area_per_image_percentage,'bbox_area',title ='Percentage of image covered by bbox')

As per the plot, most of the percentages lie between 18% and 36%.
We should also check the images with the lowest and the highest percentage covered by bboxes

In [None]:
# Images whose boxes cover less than 8 percent, then random examples of them.
small_percentage = area_per_image_percentage.loc[
    area_per_image_percentage['bbox_area'] < 8, 'image_id'
]
plot_image_examples(
    all_train_images[all_train_images['image_id'].isin(small_percentage)],
    title='low area covered by bbox',
)

In [None]:
# Images whose boxes cover more than 50 percent, then random examples of them.
high_percentage = area_per_image_percentage.loc[
    area_per_image_percentage['bbox_area'] > 50, 'image_id'
]
plot_image_examples(
    all_train_images[all_train_images['image_id'].isin(high_percentage)],
    title='high area covered by bbox',
)

Lets plot w.r.t to brightness

In [None]:
def get_brightness(image):
    """Return the mean grey level of `image` after a BGR-to-grayscale conversion."""
    grayscale = np.array(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
    return grayscale.mean()
    
def add_brightness(df):
    """Measure the brightness of every image listed in `df`.

    Parameters
    ----------
    df : DataFrame with an `image_id` column; each id is read from
        `<train_dir><image_id>.jpg`.

    Returns
    -------
    DataFrame with columns `image_id` and `brightness`, one row per row of
    `df`, in the same order and with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read.
    """
    brightness = []
    for img_id in df['image_id']:
        path = train_dir+img_id+'.jpg'
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None; fail
            # here with the path instead of later inside cvtColor.
            raise FileNotFoundError('could not read image: ' + path)
        brightness.append(get_brightness(image))

    # Build the result by position rather than via an index-aligned concat, so
    # a filtered or re-indexed `df` (or an empty one) still pairs each id with
    # its own value.
    return pd.DataFrame({'image_id': df['image_id'].to_numpy(),
                         'brightness': brightness})




    


In [None]:
# One row per distinct image id (the single column is still unnamed here).
image_df = pd.DataFrame(all_train_images.image_id.unique())

In [None]:
# Name the single column so it can be read as `image_id` and used as a merge key.
image_df.columns = ['image_id']

In [None]:
# Measure brightness once per distinct image, then copy the value onto every
# row of that image.
brightness_df = add_brightness(image_df)

all_train_images = pd.merge(all_train_images, brightness_df, on='image_id')

In [None]:
# Histogram of brightness over all rows (each image contributes one value per row it has).
hist_hover(all_train_images,'brightness',title ='Brightness in images')

Although there is a clear peak, the brightness varies widely, with values ranging up to about 116. Let's check out the outliers.

In [None]:
# Images with mean brightness below 25, then random examples of them.
dark_ids = all_train_images.loc[all_train_images['brightness'] < 25, 'image_id']
plot_image_examples(
    all_train_images[all_train_images['image_id'].isin(dark_ids)],
    title='The image with low brightness',
)

As you can see, the wheat heads in these images are hard to detect even for humans.

In [None]:
# Images with mean brightness above 130, then random examples of them.
bright_ids = all_train_images.loc[all_train_images['brightness'] > 130, 'image_id']
plot_image_examples(
    all_train_images[all_train_images['image_id'].isin(bright_ids)],
    title='The image with high brightness',
)

They are very different from the dark images

Color is also an important feature, because it shows how far the crop is from harvest. If the image is green, the crop needs more time; if it is brown, the image mostly shows ground; and if it is yellow, the crop is ready to be harvested.

In [None]:
def green_pixels(image):
    """Return the fraction of pixels of a BGR `image` that fall in the green HSV range.

    The result is a float between 0 and 1. It is computed relative to the
    actual size of the image, so it is also correct for images that are not
    1024x1024 (for 1024x1024 input the value is unchanged).
    """
    img = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)

    #Get the green mask. I got from "https://stackoverflow.com/questions/47483951/how-to-define-a-threshold-value-to-detect-only-green-colour-objects-in-an-image"
    low =(40,40,40)
    high = (70,255,255)
    green_mask = cv2.inRange(img,low,high)

    # The mask is non-zero exactly where the pixel lies inside [low, high].
    return float(np.count_nonzero(green_mask)) / green_mask.size

def yellow_pixels(image):
    """Return the fraction of pixels of a BGR `image` that fall in the yellow HSV range.

    The result is a float between 0 and 1. It is computed relative to the
    actual size of the image, so it is also correct for images that are not
    1024x1024 (for 1024x1024 input the value is unchanged).
    """
    img = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)
    low= (25,40,40)
    high = (35,255,255)
    yellow_mask = cv2.inRange(img,low,high)

    # The mask is non-zero exactly where the pixel lies inside [low, high].
    return float(np.count_nonzero(yellow_mask)) / yellow_mask.size


def add_green(df):
    """Measure the green-pixel fraction of every image listed in `df`.

    Parameters
    ----------
    df : DataFrame with an `image_id` column; each id is read from
        `<train_dir><image_id>.jpg`.

    Returns
    -------
    DataFrame with columns `image_id` and `green_bright`, one row per row of
    `df`, in the same order and with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read.
    """
    green_fraction = []
    for img_id in df['image_id']:
        path = train_dir+img_id+'.jpg'
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None; fail
            # here with the path instead of later inside cvtColor.
            raise FileNotFoundError('could not read image: ' + path)
        green_fraction.append(green_pixels(image))

    # Build the result by position rather than via an index-aligned concat, so
    # a filtered or re-indexed `df` (or an empty one) still pairs each id with
    # its own value.
    return pd.DataFrame({'image_id': df['image_id'].to_numpy(),
                         'green_bright': green_fraction})

def add_yellow(df):
    """Measure the yellow-pixel fraction of every image listed in `df`.

    Parameters
    ----------
    df : DataFrame with an `image_id` column; each id is read from
        `<train_dir><image_id>.jpg`.

    Returns
    -------
    DataFrame with columns `image_id` and `yellow_bright`, one row per row of
    `df`, in the same order and with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read.
    """
    yellow_fraction = []
    for img_id in df['image_id']:
        path = train_dir+img_id+'.jpg'
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports an unreadable file by returning None; fail
            # here with the path instead of later inside cvtColor.
            raise FileNotFoundError('could not read image: ' + path)
        yellow_fraction.append(yellow_pixels(image))

    # Build the result by position rather than via an index-aligned concat, so
    # a filtered or re-indexed `df` (or an empty one) still pairs each id with
    # its own value.
    return pd.DataFrame({'image_id': df['image_id'].to_numpy(),
                         'yellow_bright': yellow_fraction})


    

In [None]:
# Measure the green fraction once per distinct image, then copy it onto every
# row of that image.
green_pixels_df = add_green(image_df)
all_train_images = pd.merge(all_train_images, green_pixels_df, on='image_id')

Now lets plot on green color pixels


In [None]:
# Histogram of the green-pixel fraction over all rows.
hist_hover(all_train_images,'green_bright',title ='Green Colors in images')

In [None]:
# Images whose green fraction exceeds 0.4, then random examples of them.
green_ids = all_train_images.loc[all_train_images['green_bright'] > 0.4, 'image_id']
plot_image_examples(
    all_train_images[all_train_images['image_id'].isin(green_ids)],
    title='The image with high green color',
)

The green color suggests that the plants are still growing and do not have many spikes yet

In [None]:
# Measure the yellow fraction once per distinct image, then copy it onto every
# row of that image.
yellow_pixels_df = add_yellow(image_df)
all_train_images = pd.merge(all_train_images, yellow_pixels_df, on='image_id')

In [None]:
# Histogram of the yellow-pixel fraction over all rows.
hist_hover(all_train_images,'yellow_bright',title ='yellow Colors in images')

Lets see the images with high yellow color

In [None]:
# Images whose yellow fraction exceeds 0.55, then random examples of them.
yellow_ids = all_train_images.loc[all_train_images['yellow_bright'] > 0.55, 'image_id']
plot_image_examples(
    all_train_images[all_train_images['image_id'].isin(yellow_ids)],
    title='The image with high yellow color',
)

As you can see, the images with many yellow pixels show wheat that is ready to be harvested

# Why Data Augmentation is Important

As we have seen, there are very few training images, so a model trained on them alone is likely to overfit. That is why we will use albumentations to create new images by augmenting the existing ones

In [None]:
import albumentations as al
# Example augmentation pipeline. The transforms are applied in the listed
# order; `p` is the probability that a step is applied.
example = al.Compose([
    # Always applied (p=1.0). Presumably a bbox-preserving random crop resized
    # to 512x512 - confirm the argument order against the albumentations docs.
    al.RandomSizedBBoxSafeCrop(512,512,erosion_rate=0.0,interpolation=1,p=1.0),
    # Each flip is applied half of the time.
    al.HorizontalFlip(p=0.5),
    al.VerticalFlip(p=0.5),
    # Exactly one of contrast / gamma / brightness is applied every time.
    al.OneOf([al.RandomContrast(),
             al.RandomGamma(),
             al.RandomBrightness()],p=1.0),
    # Boxes are given in 'coco' format; their labels travel in the
    # `category_id` field passed alongside the boxes.
    al.CLAHE(p=0.1)], p=1.0, bbox_params=al.BboxParams(format='coco', label_fields=['category_id']))

In [None]:
def apply_transform(transforms,df,n_transforms=3):
    """Show one random image next to `n_transforms` augmented versions of it.

    A random row of `df` selects the image. Its boxes are collected as
    [xmin, ymin, width, height] lists, the original image is drawn in the
    first panel, and each following panel shows the result of one independent
    call of `transforms` on the original image and boxes. Boxes are outlined
    in red in every panel.
    """
    chosen = np.random.randint(len(df),size=1)[0]
    image_id = df.iloc[chosen].image_id
    bboxes = [
        [record.bbox_xmin, record.bbox_ymin, record.bbox_width, record.bbox_height]
        for _, record in df[df.image_id == image_id].iterrows()
    ]

    img = Image.open(train_dir + image_id + '.jpg')

    _, axs = plt.subplots(1, n_transforms + 1, figsize=(15, 7))

    def outline_boxes(ax, boxes):
        # Draw each box as an unfilled red rectangle on the given axes.
        for box in boxes:
            ax.add_patch(patches.Rectangle((box[0], box[1]), box[2], box[3],
                                           linewidth=1, edgecolor='r', facecolor='none'))

    axs[0].imshow(img)
    axs[0].set_title("Original")
    outline_boxes(axs[0], bboxes)

    # Every pass starts again from the original image and boxes.
    for panel in range(1, n_transforms + 1):
        augmented = transforms(image=np.asarray(img),
                               bboxes=bboxes,
                               category_id=[1 for _ in range(len(bboxes))])

        axs[panel].imshow(augmented['image'])
        axs[panel].set_title('augmented_' + str(panel))
        outline_boxes(axs[panel], augmented['bboxes'])
    plt.show()
            


In [None]:
# Show a random image with three augmented versions produced by `example`.
apply_transform(example,all_train_images,n_transforms=3)

In [None]:
# Repeat the demo; a new random image is drawn on every call.
apply_transform(example,all_train_images,n_transforms=3)

Thus we conclude that these points are important for this competition. I will update my notebook if I get any more ideas. If you like it, please upvote :=>