### **EDA**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import cv2
from tqdm import tqdm
import os

import plotly.express as px
import imagehash

In [None]:
# Display the kernel's current working directory; the relative '../input/...' paths
# used in this notebook are resolved against it.
os.getcwd()

### 1. Preparing the ground

In [None]:
# Locations of the competition files, relative to the notebook's working directory.
path_dict = {'test_images' : '../input/plant-pathology-2021-fgvc8/test_images',
            'train_images': '../input/plant-pathology-2021-fgvc8/train_images',
            'train_csv'   : '../input/plant-pathology-2021-fgvc8/train.csv'
            }


train = pd.read_csv(path_dict['train_csv'])
str_label = train['labels']

# Give every distinct label string an integer id, numbered in order of first appearance.
label_dict = {label: i for i, label in enumerate(str_label.unique())}

# Alternative hand-written mapping, kept for reference only (not used):
# label_dict = {'healthy': 0,
#  'scab frog_eye_leaf_spot complex': 4,
#  'scab': 1,
#  'complex': 4,
#  'rust': 2,
#  'frog_eye_leaf_spot': 3,
#  'powdery_mildew': 4,
#  'scab frog_eye_leaf_spot': 4,
#  'frog_eye_leaf_spot complex': 4,
#  'rust frog_eye_leaf_spot': 4,
#  'powdery_mildew complex': 4,
#  'rust complex': 4}

# Integer coding: look each label string up in label_dict.
train['int_encoder'] = train['labels'].map(label_dict)

# One-hot encode the integer ids. The width follows the number of distinct labels
# actually found in the CSV instead of a hard-coded 12.
train_y = tf.keras.utils.to_categorical(
    train['int_encoder'].to_numpy(),
    num_classes=len(label_dict),
)

In [None]:
# Display the label table next to its one-hot matrix, column-wise (the result is only shown, not stored).
pd.concat([train,pd.DataFrame(train_y)], axis = 1)


### Classifying the 12 disease labels into 4 classes in final_label_dict and finding the percentage of each disease in the complete data

In [None]:
# Number of images per label string; groupby returns the labels in sorted order.
df = train.groupby('labels', as_index=False)['image'].count()

# Share of the whole training set taken by each label, in percent.
df['percentage'] = df['image'] / len(train) * 100

fig, ax = plt.subplots(figsize=(25, 10))
sns.barplot(x='labels', y='percentage', data=df, ax=ax)
# Rotate the existing tick labels in place; re-setting them through set_xticklabels
# triggers a fixed-locator warning on recent matplotlib versions.
plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
ax.set_title('Percentage of Disease in data')
ax.set_xlabel('Types of Disease')
ax.set_ylabel('Percentage of images')
plt.show()


## **Seaborn Introduction** 

### Related to Categorical Plots (categorical vs integer variable)
1. Box Plot
2. Bar Plot
3. Violin Plot
4. Count Plot
5. Strip Plot and Swarm Plot
6. factorplot (renamed `catplot` in seaborn 0.9) - a general-purpose plot whose `kind` parameter can be set to bar, box, or violin.


### Related to Distribution Plots (for examining univariate and bivariate distributions)
1. Distplot  -  takes a univariate set of observations and visualizes it through a histogram.
2. Jointplot -  draws a plot of two variables with bivariate and univariate graphs.
3. Pair Plot -  plots each pair of variables against each other.
4. Rug Plot  -  dash (tick) plot for a single column.


### Related to Regression Plots
1. Simple linear plot

### Visualisation in a pie chart

In [None]:
# Turn off Jedi for IPython's tab completer.
%config Completer.use_jedi = False

In [None]:
# Pie chart of the per-label percentages held in `df`, one wedge per label string.
pie_fig, pie_ax = plt.subplots(figsize=(10, 10))
pie_ax.pie(
    x=df['percentage'],
    labels=df['labels'],
    autopct='%1.1f%%',
    labeldistance=1.2,
    radius=0.9,
    pctdistance=0.7,
)
pie_ax.legend(loc='upper right')

#### **Conclusion**

1. Data set is imbalanced

### Plotting Sample Images

In [None]:
def visualize_batch(path, image_id, labels, n_cols=4):
    """Show a grid of images, each titled with its class label.

    Parameters
    ----------
    path : str
        Directory that contains the image files.
    image_id : iterable of str
        Image file names, relative to ``path``.
    labels : iterable
        Label shown in the title of the image at the same position.
    n_cols : int, default 4
        Number of columns in the grid.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read one of the image files.

    Notes
    -----
    With the default ``n_cols`` and up to 20 images the layout is the original
    5x4 grid on a 16x12 inch figure; larger batches add rows and grow the
    figure height proportionally instead of failing.
    """
    import math

    pairs = list(zip(image_id, labels))
    n_rows = max(5, math.ceil(len(pairs) / n_cols))
    plt.figure(figsize=(16, 12 * n_rows / 5))

    for ind, (file_name, label) in enumerate(pairs):
        plt.subplot(n_rows, n_cols, ind + 1)
        file_path = os.path.join(path, file_name)
        image = cv2.imread(file_path)
        # cv2.imread signals an unreadable file by returning None rather than raising.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {file_path}")
        # OpenCV loads BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)
        plt.title(f"Class: {label}", fontsize=12)
        plt.axis("off")
    plt.show()
    
# Draw up to 20 random training rows; the fixed random_state makes the same
# sample (and therefore the same figure) come out on every run.
sample_df = train.sample(n=min(20, len(train)), random_state=42)
image_id = sample_df["image"].values
labels = sample_df["labels"].values
visualize_batch(path_dict['train_images'], image_id, labels)

In [None]:
# Read one specific training image from disk with OpenCV.
arr = cv2.imread(path_dict['train_images'] + '/'+ 'acc99659863d9f0a.jpg')
# Reorder the channels from OpenCV's BGR to RGB.
arr = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)

### Distinct List of labels

* healthy
* complex
* rust
* frog_eye_leaf_spot
* powdery_mildew
* scab

### Splitting training data ['labels'] on whitespace

In [None]:
# Individual disease tags (the ones listed under "Distinct List of labels").
distinct_labels = [
    'healthy',
    'complex',
    'rust',
    'frog_eye_leaf_spot',
    'powdery_mildew',
    'scab',
]

# Break each label string into its whitespace-separated tags.
train['label_list'] = train['labels'].str.split()

# Add one column per tag, filled with 0.
for x in distinct_labels:
    train[x] = 0


def overlapping_category(label_list, coln):
    """Return 1 when ``coln`` is contained in ``label_list``, otherwise 0."""
    return 1 if coln in label_list else 0

# Build the element-wise wrapper once instead of on every iteration. Declaring
# otypes fixes the result dtype and lets the call succeed on an empty frame,
# where np.vectorize cannot infer the output type.
myfunc = np.vectorize(overlapping_category, otypes=[int])

# Fill each tag column with 1 where the row's tag list contains that tag, else 0.
for x in distinct_labels:
    train[x] = myfunc(train['label_list'], x)

In [None]:
# Parallel-categories diagram with one dimension per column named in distinct_labels,
# coloured by the value of the 'healthy' column.
px.parallel_categories(train , dimensions= distinct_labels, color= 'healthy', color_continuous_scale="sunset")

### Removing Duplicate Images

We are going to use the imagehash library to detect duplicate images so that they can be removed.

In [None]:
from PIL import Image

threshold = .9     # similarity cut-off used by the duplicate search (compared against the fraction of equal hash bits)
img_height = 8     # thumbnail height in pixels
img_width = 9      # thumbnail width in pixels
seed = 42

root = path_dict['train_images']
list_of_images = os.listdir(root)
df = pd.read_csv(path_dict['train_csv'], index_col='image')

# Write a tiny img_height x img_width copy of every training image into the
# current working directory, under the same file name as the original.
for file_name in tqdm(list_of_images):
    raw_bytes = tf.io.read_file(os.path.join(root, file_name))
    # The training images are .jpg files, so decode them with the JPEG decoder.
    thumbnail = tf.image.decode_jpeg(raw_bytes, channels=3)
    thumbnail = tf.image.resize(thumbnail, [img_height, img_width])
    # resize returns floats; convert back to 8-bit pixels before saving.
    thumbnail = tf.cast(thumbnail, tf.uint8).numpy()
    plt.imsave(file_name, thumbnail)

In [None]:
import PIL

# Four different perceptual hashes are computed per image and concatenated.
hash_functions = [
    imagehash.average_hash,
    imagehash.phash,
    imagehash.dhash,
    imagehash.whash]

image_ids = []
hashes = []

# Every .jpg in the current working directory is hashed.
list_of_images_2 = tf.io.gfile.glob('./*.jpg')

for thumb_path in tqdm(list_of_images_2):
    # The context manager releases the file handle as soon as the hashes are computed.
    with PIL.Image.open(thumb_path) as image:
        # Flatten the bit arrays of all hash functions into one 1-D vector.
        bits = np.array([hash_fn(image).hash for hash_fn in hash_functions]).reshape(-1,)

    hashes.append(bits)
    image_ids.append(os.path.basename(thumb_path))

hashes = np.array(hashes)
image_ids = np.array(image_ids)

In [None]:
duplicate_ids = []      # To store all duplicate ids
id_similar_id = {}      # Maps an image id to the list of image ids it is similar to
seen_duplicates = set() # Same content as duplicate_ids, for constant-time membership tests

# The loop variables deliberately avoid the names `id` and `hash`, which would
# overwrite the Python builtins for every later cell of the notebook.
for image_name, image_hash in tqdm(zip(image_ids, hashes), total=len(hashes)):
    # An image already flagged as a duplicate of an earlier one is not searched again.
    if image_name in seen_duplicates:
        continue

    # Fraction of hash bits this image shares with every image (itself included).
    similarity = (image_hash == hashes).mean(axis=1)
    similar_to = list(image_ids[similarity > threshold])
    # Drop the image's own entry from its match list.
    similar_to.remove(image_name)

    if similar_to:
        id_similar_id[image_name] = similar_to

    duplicate_ids.extend(similar_to)
    seen_duplicates.update(similar_to)

In [None]:
# One row per entry of id_similar_id: the image on the left, its first match on the right.
n_pairs = len(id_similar_id)

if n_pairs:
    # Keep the original row height (100 inches for 27 rows) for any number of pairs,
    # so the grid neither overflows nor leaves empty rows.
    plt.figure(figsize=(20, 100 * n_pairs / 27))

    for i, (key, value) in enumerate(id_similar_id.items()):
        for col, file_name in enumerate((key, value[0])):
            plt.subplot(n_pairs, 2, 2 * i + col + 1)
            picture = cv2.imread(os.path.join(path_dict['train_images'], file_name))
            # OpenCV loads BGR; convert so matplotlib shows the true colours.
            plt.imshow(cv2.cvtColor(picture, cv2.COLOR_BGR2RGB))
            plt.title(file_name)
            plt.axis('off')

    plt.show()

    
    
    
