# Loading libraries

In [None]:
import os
import json
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Loading data

In [None]:
# Load the training metadata CSV into a DataFrame; later cells read its 'image' and 'labels' columns.
df_data = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv')

In [None]:
# Show the first row to check which columns were loaded.
df_data.head(1)

### Cleaning data

As we can see, there are two columns in the dataset: `image` and `labels`. The `image` column holds the image file name (the image id plus the `.jpg` extension) and `labels` holds the disease(s) present on the leaf. Let's add new columns for the image id and the path of the image file.

In [None]:
# Strip the trailing ".jpg" extension and store the bare id in a separate column.
# An anchored regex is used instead of str.rstrip('.jpg'): rstrip removes any
# trailing run of the characters '.', 'j', 'p', 'g' rather than the literal
# suffix, so it can eat the tail of an id (e.g. "leafpg.jpg" -> "leaf").
# The .str accessor also propagates missing file names as NaN instead of raising.
df_data['image_id'] = df_data['image'].str.replace(r'\.jpg$', '', regex=True)
df_data.head(1)

In [None]:
# Verify the derived id column: the distinct values of the missing-value mask
# are shown, so a lone False means no id is missing.
df_data['image_id'].isna().unique()

In [None]:
# Build the location of every training image by prefixing its file name
# with the training-image directory.
df_data['file_path'] = df_data['image'].radd("../input/plant-pathology-2021-fgvc8/train_images/")
df_data.head(1)

In [None]:
# The raw 'image' column is no longer needed once the id and path are derived from it.
df_data = df_data.drop(columns=['image'])
# Fix the column order: id first, then label, then file location.
df_data = df_data.loc[:, ['image_id', 'labels', 'file_path']]
df_data.head(1)

In [None]:
# Drop only the rows in which every column is missing (how='all').
df_data = df_data.dropna(axis=0, how='all')
# Count the missing values that remain in each column.
df_data.isna().sum()

### EDA

In [None]:
# (number of rows, number of columns) of the cleaned frame.
df_data.shape

In [None]:
# Per-column summary statistics of the cleaned frame.
df_data.describe()

In [None]:
# Print column dtypes, non-null counts and memory usage.
df_data.info()

In [None]:
# Number of distinct label strings (a missing label, if any, counts as one value).
df_data['labels'].nunique(dropna=False)

In [None]:
# List the distinct label strings in order of first appearance.
pd.unique(df_data['labels'])

There are a total of 18632 images and 12 distinct disease labels. On further inspection we see that there are five individual diseases, namely:
- frog_eye_leaf_spot 
- complex
- rust
- scab
- powdery_mildew

Six other labels are combinations of two or more of these diseases. The one remaining label is `healthy`, a leaf without any disease.

In [None]:
# Bar chart of how many images carry each label.
plt.figure(figsize=(15, 10))
df_data['labels'].value_counts().plot(kind='bar')

In [None]:
# Pie chart of each label's share of the images, annotated with percentages.
plt.figure(figsize=(15, 15))
df_data['labels'].value_counts().plot(kind='pie', autopct='%.2f')

From the plots we can see that most of the images are concentrated in the labels for the five individual diseases. Labels that combine multiple diseases have only a small number of images. This class imbalance might skew the model's detection results.

In [None]:
# Distinct labels, in order of first appearance
n = df_data['labels'].unique().tolist()
# For every label, count the rows whose label string matches it exactly
number = [int((df_data['labels'] == label_name).sum()) for label_name in n]

# Assemble a label / image-count table, largest class first, and print it as markdown
table = (
    pd.DataFrame(df_data['labels'].unique(), columns=['labels'])
    .assign(Number_of_images_in_label=number)
    .sort_values(by='Number_of_images_in_label', ascending=False)
)
print(table.to_markdown())

# Visualize

In [None]:
# Plot a single image, looked up by its image id.
def visualize(image_id):
    """Display the image registered under ``image_id`` with its label as the title.

    Parameters
    ----------
    image_id : str
        Value matched against the ``image_id`` column of the notebook-level
        ``df_data`` frame. If several rows match, the first one is shown.

    Raises
    ------
    IndexError
        If no row of ``df_data`` has this ``image_id``.
    FileNotFoundError
        If OpenCV cannot read the file at the row's ``file_path``.
    """
    # Look the row up once and take both values from it.
    matches = df_data.loc[df_data['image_id'] == image_id, ['file_path', 'labels']]
    if matches.empty:
        raise IndexError(f"image_id {image_id!r} not found in df_data")
    first_match = matches.iloc[0]
    path = first_match['file_path']
    label = first_match['labels']

    # cv2.imread signals failure by returning None instead of raising, so check
    # it before cvtColor turns that into a cryptic error. Reading first also
    # avoids leaving an empty figure behind when the file is unreadable.
    image = cv2.imread(path)
    if image is None:
        raise FileNotFoundError(f"Could not read image file: {path}")
    # OpenCV loads pixels as BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.title(f"Label: {label}", fontsize=10)
    plt.show()

In [None]:
visualize('800113bb65efe69e')

In [None]:
# Plot a grid of randomly chosen images that share one label (15 images by default).
def visualize_label(label, n_images=15, n_cols=5):
    """Show a grid of random images whose ``labels`` value equals ``label``.

    Parameters
    ----------
    label : str
        Exact label string to match in the ``labels`` column of the
        notebook-level ``df_data`` frame.
    n_images : int, optional
        Maximum number of images to draw (default 15). When the label has fewer
        images than this, all of them are shown instead of failing.
    n_cols : int, optional
        Number of grid columns (default 5); the row count is derived from it.

    Raises
    ------
    ValueError
        If ``n_images`` or ``n_cols`` is smaller than 1, or no row carries ``label``.
    FileNotFoundError
        If OpenCV cannot read one of the selected image files.
    """
    if n_images < 1 or n_cols < 1:
        raise ValueError("n_images and n_cols must both be at least 1")

    df = df_data.loc[df_data['labels'] == label]
    if df.empty:
        raise ValueError(f"No images found for label {label!r}")

    # Never ask for more images than the label has; sampling without
    # replacement would raise otherwise.
    n_shown = min(n_images, len(df))
    # Sample row positions so each picked row's id, label and path stay together
    # without re-scanning the frame for every image.
    picked = df.iloc[np.random.choice(len(df), n_shown, replace=False)]
    n_rows = -(-n_shown // n_cols)  # ceiling division

    plt.figure(figsize=(18, 18))
    for slot, (imageid, labels, path) in enumerate(
        zip(picked['image_id'], picked['labels'], picked['file_path']), start=1
    ):
        # cv2.imread returns None on failure instead of raising.
        image = cv2.imread(path)
        if image is None:
            raise FileNotFoundError(f"Could not read image file: {path}")
        # OpenCV loads pixels as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.subplot(n_rows, n_cols, slot)
        plt.imshow(image)
        plt.title(f" Label: {labels}\n Image_id:{imageid}", fontsize=9)
        plt.axis("off")

    plt.show()


#### Healthy leaf

In [None]:
visualize_label('healthy')

#### Scab

In [None]:
visualize_label('scab')

#### Complex

In [None]:
visualize_label('complex')

#### Rust

In [None]:
visualize_label('rust')

#### Frog_eye_leaf_spot

In [None]:
visualize_label('frog_eye_leaf_spot')

#### Powdery_mildew

In [None]:
visualize_label('powdery_mildew')

#### Creating test dataset