# Plant Pathology 2021 🍃🍃 EDA 📊📊
![](https://s3.eu-west-2.amazonaws.com/growinginteractive/blog/apple-spartan-2x.jpg)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import gc
import cv2
import matplotlib.pyplot as plt
from torchvision import transforms,datasets,models
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.preprocessing import StandardScaler
import time
import datetime
from PIL import Image
import warnings
from tqdm.notebook import tqdm
tqdm.pandas()
import random

In [None]:
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook mode before any plotly figure is shown.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
init_notebook_mode(connected=True)

In [None]:
# Locations of the competition images and CSV files, relative to the
# notebook's working directory. All four live under one shared directory.
_competition_dir = '../input/plant-pathology-2021-fgvc8'

train_image_path = _competition_dir + '/train_images'
test_image_path = _competition_dir + '/test_images'
train_df_path = _competition_dir + '/train.csv'
test_df_path = _competition_dir + '/sample_submission.csv'

### Test and Train data frame 

#### Train data frame

In [None]:
# Read the training table from the CSV at `train_df_path`.
train_df = pd.read_csv(train_df_path)

# Distinct values of the `labels` column (indexed by the plotting cells below).
classes = train_df["labels"].unique()
# Number of rows per label value; reused by the bar plot further down.
x = train_df["labels"].value_counts()

# Preview the first ten rows.
train_df.head(n=10)

#### Test data frame

In [None]:
# Read the test-side table (the sample submission file) and preview it.
test_df = pd.read_csv(test_df_path)
test_df.head(n=5)

In [None]:
# Number of training rows for each distinct value of the `labels` column.
train_df.labels.value_counts()

In [None]:
# Display the distinct label values collected from train_df.labels.unique().
classes

In [None]:
# Bar chart of how many training rows carry each label value.
plt.figure(figsize=(20,12))
# x/y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError for `sns.barplot(x.index, x)`.
labels = sns.barplot(x=x.index, y=x.values)
# Label strings are long, so tilt them to keep the tick labels readable.
for item in labels.get_xticklabels():
    item.set_rotation(45)
labels.set_ylabel('samples')
labels.set_title('Number of training images per label')

The dataset contains 12 distinct labels (healthy leaves plus single and combined diseases), compared with only 4 in the 2020 dataset. The classes are heavily imbalanced: **scab** has the most images (4826), while **Powdery Mildew Complex** has the fewest (87).  
Suggestion: Use **Stratified K-Fold** to divide the dataset in n folds  
Link: [Stratified K-Fold](https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2)

In [None]:
def load_image(image_id, image_path=train_image_path):
    """Read one image from disk and return it in RGB channel order.

    Parameters
    ----------
    image_id : str
        File name of the image, joined onto ``image_path``.
    image_path : str, optional
        Directory containing the image; defaults to ``train_image_path``.

    Returns
    -------
    The array produced by ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)``.

    Raises
    ------
    FileNotFoundError
        If OpenCV could not read the file (missing or undecodable).
    """
    file_path = os.path.join(image_path, image_id)
    image = cv2.imread(file_path)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque assertion error inside cvtColor.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {file_path}")
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Load only the first 200 training images; progress_apply shows a progress bar.
first_image_ids = train_df["image"][:200]
train_images = first_image_ids.progress_apply(load_image)

In [None]:
# Per-image mean of each colour channel. load_image returns RGB order, so
# channel 0 is red, 1 is green and 2 is blue.
red_values, green_values, blue_values = (
    [np.mean(img[:, :, channel]) for img in train_images]
    for channel in range(3)
)
# Per-image mean over every pixel and channel together.
values = [np.mean(img) for img in train_images]

In [None]:
# Distribution of the per-image mean taken over all channels together.
group_labels = ['RGB Values']
colors = ['magenta']

fig = ff.create_distplot(
    [values],
    group_labels,
    bin_size=.5,
    curve_type='normal',  # fit a normal curve instead of the default 'kde'
    colors=colors,
)

# Title and font styling, collected in one place.
layout_options = dict(
    title='Distribution of Channel Values',
    font_family="Courier New",
    font_color="magenta",
    title_font_family="Times New Roman",
    title_font_color="magenta",
    legend_title_font_color="magenta",
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
# Distribution of the per-image mean of the red channel.
group_labels = ['Red Values']
colors = ['red']

fig = ff.create_distplot(
    [red_values],
    group_labels,
    bin_size=.5,
    curve_type='normal',  # fit a normal curve instead of the default 'kde'
    colors=colors,
)

# Title and font styling, collected in one place.
layout_options = dict(
    title='Distribution of Red Channel Values',
    font_family="Courier New",
    font_color="red",
    title_font_family="Times New Roman",
    title_font_color="red",
    legend_title_font_color="red",
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
# Distribution of the per-image mean of the green channel.
group_labels = ['Green Values']
colors = ['green']
fig = ff.create_distplot([green_values], group_labels, bin_size=.5,
                         curve_type='normal', # override default 'kde'
                         colors=colors)
fig.update_layout(
    # Title previously read 'Greeb'; corrected so the rendered figure is labelled properly.
    title = 'Distribution of Green Channel Values',
    font_family="Courier New",
    font_color="green",
    title_font_family="Times New Roman",
    title_font_color="green",
    legend_title_font_color="green"
)
fig.show()

In [None]:
# Distribution of the per-image mean of the blue channel.
group_labels = ['Blue Values']
colors = ['blue']

fig = ff.create_distplot(
    [blue_values],
    group_labels,
    bin_size=.5,
    curve_type='normal',  # fit a normal curve instead of the default 'kde'
    colors=colors,
)

# Title and font styling, collected in one place.
layout_options = dict(
    title='Distribution of Blue Channel Values',
    font_family="Courier New",
    font_color="blue",
    title_font_family="Times New Roman",
    title_font_color="blue",
    legend_title_font_color="blue",
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
# Overlay the per-image channel means of all three channels on one figure.
group_labels = ['Red Values', 'Green Values', 'Blue Values']
colors = ['red', 'green', 'blue']

fig = ff.create_distplot(
    [red_values, green_values, blue_values],
    group_labels,
    bin_size=.5,
    curve_type='normal',  # fit a normal curve instead of the default 'kde'
    colors=colors,
)

# Title and font styling, collected in one place.
layout_options = dict(
    title='Combined Distribution of RGB Channel Values',
    font_family="Courier New",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
    legend_title_font_color="black",
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
def visualize_leaves(condition, is_cond=True):
    """Plot a grid (three columns, at most three rows) of leaf images.

    Parameters
    ----------
    condition :
        Value compared against ``train_df['labels']`` to select images, and
        used as the figure title. Ignored when ``is_cond`` is False.
    is_cond : bool, optional
        When True (default), the first matching images of ``train_df`` are
        read from ``train_image_path``. When False, the last entries of the
        preloaded ``train_images`` series are shown instead.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If fewer than three images are available, i.e. not even one full
        row can be drawn.
    """
    cols = 3

    def _make_grid(n_available):
        # Only complete rows are drawn, capped at three rows.
        rows = min([3, n_available // cols])
        if rows == 0:
            raise ValueError(
                f"Need at least {cols} images to draw a grid, got {n_available}"
            )
        # squeeze=False keeps `axes` two-dimensional even for a single row,
        # so axes[row, col] indexing below always works.
        fig, axes = plt.subplots(nrows=rows, ncols=cols,
                                 figsize=(30, rows*20/3), squeeze=False)
        return rows, fig, axes

    if not is_cond:
        rows, fig, axes = _make_grid(len(train_images))
        for row in range(rows):
            for col in range(cols):
                # Negative position: walk backwards from the end of train_images.
                position = -row*3-col-1
                axes[row, col].imshow(train_images.loc[train_images.index[position]])
        plt.show()
        return None

    # File names of every training row whose label equals `condition`.
    images = train_df.loc[train_df['labels'] == condition, 'image']
    rows, fig, axes = _make_grid(len(images))
    for row in range(rows):
        for col in range(cols):
            image_id = images.loc[images.index[row*3+col]]
            # Reuse the notebook's loader so reading and BGR->RGB conversion
            # live in one place.
            axes[row, col].imshow(load_image(image_id, image_path=train_image_path))
    # plt.title would only label the last subplot; suptitle labels the whole figure.
    fig.suptitle(condition, fontsize=24)
    plt.show()

## Visualizing Leaves

### Healthy Leaves

In [None]:
# Sample grid for the label stored at position 0 of `classes`.
visualize_leaves(classes[0])

### Scab Frog-Eye-Leaf-Spot Complex

In [None]:
# Sample grid for the label stored at position 1 of `classes`.
visualize_leaves(classes[1])

### Scab

In [None]:
# Sample grid for the label stored at position 2 of `classes`.
visualize_leaves(classes[2])

### Complex

In [None]:
# Sample grid for the label stored at position 3 of `classes`.
visualize_leaves(classes[3])

### Rust

In [None]:
# Sample grid for the label stored at position 4 of `classes`.
visualize_leaves(classes[4])

### Frog-Eyed-Leaf-Spot

In [None]:
# Sample grid for the label stored at position 5 of `classes`.
visualize_leaves(classes[5])

### Powdery Mildew

In [None]:
# Sample grid for the label stored at position 6 of `classes`.
visualize_leaves(classes[6])

### Scab Frog-Eyed-Leaf-Spot

In [None]:
# Sample grid for the label stored at position 7 of `classes`.
visualize_leaves(classes[7])

### Frog-Eyed-Leaf-Spot Complex

In [None]:
# Sample grid for the label stored at position 8 of `classes`.
visualize_leaves(classes[8])

### Rust Frog-Eyed-Leaf-Spot

In [None]:
# Sample grid for the label stored at position 9 of `classes`.
visualize_leaves(classes[9])

### Powdery Mildew Complex

In [None]:
# Sample grid for the label stored at position 10 of `classes`.
visualize_leaves(classes[10])

### Rust Complex

In [None]:
# Sample grid for the label stored at position 11 of `classes`.
visualize_leaves(classes[11])

# Do upvote the notebook if you find it useful.