# About this Notebook

In my previous notebook [here](https://www.kaggle.com/tanulsingh077/how-to-become-leaf-doctor-with-deep-learning) I showed that images belonging to different labels can look very similar, and that some images are mislabelled. The idea explored in this notebook follows directly from those two observations.

In a normal case, where the images belonging to different labels are not related to each other, we could use a simple StratifiedKFold CV strategy without a second thought. Here, however, leaves with different labels look very similar and some labels are wrong, so is StratifiedKFold still a good idea?

If not StratifiedKFold, then what do you suggest?
In the same notebook I also showed that clustering the images in our dataset gives very interesting results and that the clusters are well formed. In this notebook I cluster the images into image groups and, based on those groups, suggest GroupStratifiedKFold as the CV strategy and compare it with the StratifiedKFold CV strategy.

<font color='red'> Note : It would be great if the community also shared their views on which CV strategy is better, and why, in the following discussion thread </font> :

https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/201699

In [None]:
# Essentials
from pathlib import Path
import json
from tqdm import tqdm
tqdm.pandas()
import random

# Visuals and CV2
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
from PIL import Image
%matplotlib inline

# Prelims
import pandas as pd
import numpy as np
from collections import Counter, defaultdict

# Clustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

#keras
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array 
from keras.applications.resnet50 import preprocess_input 

# models 
from keras.applications.resnet50 import ResNet50
from keras.models import Model

In [None]:
# Root folder of the competition data
BASE_DIR = Path('../input/cassava-leaf-disease-classification')

# Training table; its `image_id` and `label` columns are used throughout the notebook
train = pd.read_csv(BASE_DIR/'train.csv')

# Label id -> disease name lookup. JSON object keys are always strings,
# so they are cast to int to match the integer `label` column.
with open(BASE_DIR/'label_num_to_disease_map.json') as f:
    mapping = {int(k): v for k, v in json.load(f).items()}

print(mapping)

In [None]:
# Add a human-readable disease name next to every numeric label
train['label_names'] = train['label'].map(mapping)
# Preview the first rows of the table
train.head()

# Extracting Features for Clustering

In [None]:
def extract_features(image_id, model, target_size=(224, 224)):
    """Run one training image through `model` and return the prediction.

    Parameters
    ----------
    image_id : str
        File name of the image inside ``BASE_DIR/'train_images'``.
    model : keras model
        Network used as the feature extractor; the output of its
        ``predict`` method is returned unchanged.
    target_size : tuple of int, optional
        Size the image is resized to when it is loaded. Defaults to
        ``(224, 224)``, the value that used to be hard-coded.

    Returns
    -------
    The result of ``model.predict`` for a batch holding this single image.
    """
    file = BASE_DIR/'train_images'/image_id
    # load the image resized to `target_size`
    img = load_img(file, target_size=target_size)
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img)
    # add a leading batch axis: (num_of_samples, dim 1, dim 2, channels).
    # expand_dims works for any target_size, unlike a hard-coded reshape.
    batch = np.expand_dims(img, axis=0)
    # prepare image for model
    imgx = preprocess_input(batch)
    # get the feature vector.
    # `use_multiprocessing` is intentionally not passed: it only affects
    # generator / Sequence inputs, and newer Keras releases reject it.
    features = model.predict(imgx)

    return features

In [None]:
# Build ResNet50 with its default constructor arguments
model = ResNet50()
# Re-wire the network so that it outputs the activations of its second-to-last
# layer instead of the final layer's output; these activations are used as the
# image feature vector (presumably 2048 values per image, matching the
# reshape(-1,2048) used when the features are collected — confirm).
model = Model(inputs = model.inputs, outputs = model.layers[-2].output)

''' You can uncommnet the below to get the features but I have already done that and saved the features as a numpy 
file which you can load directly'''

# Feature extraction over the whole training table, with a progress bar (disabled by default)
#train['features'] = train['image_id'].progress_apply(lambda x:extract_features(x,model))

# Clustering

In [None]:
# Option 1: the features were extracted in this session and live in train['features'].
# To use it, move the statement below out of the string literal.
'''
features = np.array(train['features'].values.tolist()).reshape(-1,2048)
'''
# Option 2 (active): load the feature matrix that was saved earlier as a numpy file.
# NOTE(review): assumes the rows of features.npy are in the same order as `train` — confirm.
features = np.load('../input/cassava/features.npy')
# Image file names and numeric labels, taken from `train` in row order
image_ids = np.array(train['image_id'].values.tolist())
labels = train['label'].values.tolist()

In [None]:
# Clustering: partition the feature vectors into 10 clusters.
# random_state seeds KMeans so that repeated runs use the same initialisation.
kmeans = KMeans(n_clusters=10,random_state=22)
kmeans.fit(features)

In [None]:
# Store the cluster id assigned to every image.
# NOTE(review): relies on `features` rows lining up with `train` rows — confirm.
train['cluster'] = kmeans.labels_

# Visualizing Clusters

In [None]:
# PCA with three principal components.
# The estimator is only configured here; it is fitted in the following cell.
pca_3d = PCA(n_components=3)

In [None]:
# Fit the PCA on the feature matrix and keep the three principal components
# in a DataFrame; they are used to visualize the clusters in 3-D.
PCs_3d = pd.DataFrame(pca_3d.fit_transform(features))
PCs_3d.columns = ["PC1_3d", "PC2_3d", "PC3_3d"]

# Attach the components column-wise. Rows are matched on the index, and
# join='inner' keeps only index values present in both frames.
train = pd.concat([train,PCs_3d], axis=1, join='inner')
train.head()

In [None]:
# 3-D scatter plot of the three principal components, coloured by cluster id
sns.set(style = "darkgrid")

fig = plt.figure(figsize=(16,11))
ax = fig.add_subplot(111, projection = '3d')

# Coordinates of every image in principal-component space
x = train['PC1_3d']
y = train['PC2_3d']
z = train['PC3_3d']

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")

# c= maps each point's cluster id to a colour
ax.scatter(x, y, z,c=train['cluster'].values)

plt.show()

* We can see that some clusters are very nicely separated while others are intertwined, but all in all the image grouping looks reasonable

In [None]:
# Number of images assigned to each cluster
train['cluster'].value_counts()

* Let's now also visualize the images in the respective clusters, starting with the cluster that has the fewest images and moving towards the cluster that has the most

# Visualizing Images in Clusters

In [None]:
# Map every cluster id to the list of (image file name, label) pairs assigned to it
groups = {}
for file,label,cluster in zip(image_ids,labels,kmeans.labels_):
    # setdefault creates the empty list the first time a cluster id is seen
    groups.setdefault(cluster, []).append((file,label))

In [None]:
def view_cluster(cluster):
    """Plot up to 25 images of one cluster in a 5x5 grid, titled with file name and label.

    Reads the module-level ``groups`` mapping (cluster id -> list of
    ``(file name, label)`` pairs), so it must be called while ``groups``
    is still that dict (a later cell rebinds the name to an array).

    Parameters
    ----------
    cluster : int
        Key of the cluster in ``groups``.

    When the cluster holds more than 25 images, a random contiguous window
    of 25 of them is shown and the clipping plus the window start are printed.
    """
    max_images = 25  # capacity of the 5x5 subplot grid used below
    plt.figure(figsize = (25,25))
    # file names and labels of THIS cluster (both taken from the same entries,
    # so each title shows the label that belongs to the plotted image)
    members = groups[cluster]
    files = [ids for ids,_ in members]
    labels = [lab for _,lab in members]
    # the grid cannot hold more than max_images, so clip larger clusters
    if len(files) > max_images:
        print(f"Clipping cluster size from {len(files)} to {max_images}")
        # randint's upper bound is exclusive; capping the start here guarantees
        # the window never runs past the end and always contains max_images items
        start = np.random.randint(0, len(files) - max_images + 1)
        print(start)
        files = files[start:start+max_images]
        labels = labels[start:start+max_images]
    # plot each image in the cluster
    for index,(label,file) in enumerate(zip(labels,files)):
        plt.subplot(5,5,index+1)
        img = load_img(BASE_DIR/'train_images'/file)
        img = np.array(img)
        plt.imshow(img)
        plt.title(file+' '+"label: "+str(label))
        plt.axis('off')

In [None]:
# Show a sample of the images assigned to cluster 3
view_cluster(3)

In [None]:
# Show a sample of the images assigned to cluster 4
view_cluster(4)

In [None]:
# Show a sample of the images assigned to cluster 9
view_cluster(9)

In [None]:
# Show a sample of the images assigned to cluster 6
view_cluster(6)

* The clusters seem reasonable. Now let's see whether the clusters are well represented in every label or not

In [None]:
# Count the (non-null) image ids for every (label, cluster) combination that occurs,
# then turn the group keys back into ordinary columns
temp = train.groupby(['label','cluster']).count()['image_id'].reset_index()

In [None]:
# Print the per-cluster image counts separately for each of the labels 0 to 4
for label_id in (0, 1, 2, 3, 4):
    print(temp[temp['label']==label_id])

* The representations also seem to be fine, so let's now create the folds and analyze them

# GroupStratifiedKFold

The code for creating the GroupStratified-KFold split is taken from [here](https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation)

In [None]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield k (train_indices, test_indices) splits that keep groups intact.

    Every group is placed in exactly one fold; that fold's samples form the
    test set of one split and all other samples form its train set. Groups
    are assigned greedily so that each label's share per fold stays as even
    as possible.

    Parameters
    ----------
    X : unused; accepted only for signature compatibility.
    y : sequence of non-negative integer labels (used as array indices).
    groups : sequence of hashable group ids, aligned with `y`.
    k : number of folds.
    seed : passed to ``random.Random`` to shuffle the groups before the
        stable sort, which decides the order among equally ranked groups.

    Yields
    ------
    (train_indices, test_indices) : two lists of positions into `groups`.
    """
    n_labels = np.max(y) + 1

    # Label histogram of every group, plus the overall label totals
    per_group_counts = defaultdict(lambda: np.zeros(n_labels))
    label_totals = Counter()
    for lbl, grp in zip(y, groups):
        per_group_counts[grp][lbl] += 1
        label_totals[lbl] += 1

    per_fold_counts = defaultdict(lambda: np.zeros(n_labels))
    fold_members = defaultdict(set)

    def imbalance_if_added(counts, fold):
        # Temporarily add the group to `fold`, measure how unevenly each
        # label is spread over the k folds, then undo the addition.
        per_fold_counts[fold] += counts
        spreads = [
            np.std([per_fold_counts[f][lbl] / label_totals[lbl] for f in range(k)])
            for lbl in range(n_labels)
        ]
        per_fold_counts[fold] -= counts
        return np.mean(spreads)

    ordered = list(per_group_counts.items())
    random.Random(seed).shuffle(ordered)
    # Stable sort: groups with the most uneven label histogram are placed
    # first; the shuffled order is kept among equals.
    ordered.sort(key=lambda item: -np.std(item[1]))
    for grp, counts in ordered:
        # min() keeps the first fold among equal scores (lowest index wins);
        # default=None mirrors the k == 0 case where no fold is evaluated.
        chosen = min(range(k), key=lambda f: imbalance_if_added(counts, f), default=None)
        per_fold_counts[chosen] += counts
        fold_members[chosen].add(grp)

    every_group = set(groups)
    for fold in range(k):
        held_out = fold_members[fold]
        kept = every_group - held_out

        train_indices = [pos for pos, grp in enumerate(groups) if grp in kept]
        test_indices = [pos for pos, grp in enumerate(groups) if grp in held_out]

        yield train_indices, test_indices

In [None]:
# Arrays handed to the fold generator: image ids, numeric labels and the
# cluster id of every image (the cluster acts as the group)
train_x = train['image_id'].values
train_y = train.label.values
# NOTE: this rebinds `groups`, which earlier held the cluster -> images dict
# read by view_cluster; view_cluster no longer works after this line.
groups = np.array(train.cluster.values)

def get_distribution(y_vals):
    """Return each label's share of `y_vals` as a list of percentage strings.

    The list has one entry for every label id from 0 up to the largest
    label present; labels that never occur are reported as '0.00%'.
    """
    counts = Counter(y_vals)
    total = sum(counts.values())
    n_classes = np.max(y_vals) + 1
    shares = []
    for cls in range(n_classes):
        shares.append(f'{counts[cls] / total:.2%}')
    return shares

In [None]:
# -1 marks rows that have not been assigned to a validation fold yet
train['kfold'] = -1
# Label distributions to compare: the full training set first, then every fold
distrs = [get_distribution(train_y)]
index = ['training set']

# No seed is passed, so the group shuffle inside stratified_group_k_fold is
# not fixed and the folds may differ between runs.
for fold_ind, (dev_ind, val_ind) in enumerate(stratified_group_k_fold(train_x, train_y, groups, k=5)):
    dev_y, val_y = train_y[dev_ind], train_y[val_ind]
    dev_groups, val_groups = groups[dev_ind], groups[val_ind]
    # val_ind holds positions; .loc is label-based, so this presumably relies
    # on `train` having a default 0..n-1 index — confirm.
    train.loc[val_ind, 'kfold'] = fold_ind

    # A cluster must never appear on both sides of a split
    assert len(set(dev_groups) & set(val_groups)) == 0

    distrs.append(get_distribution(dev_y))
    index.append(f'development set - fold {fold_ind}')
    distrs.append(get_distribution(val_y))
    index.append(f'validation set - fold {fold_ind}')

display('Distribution per class:')
# One row per data set, one column per label, values as percentage strings
pd.DataFrame(distrs, index=index, columns=[f'Label {l}' for l in range(np.max(train_y) + 1)])

In [None]:
# Number of images per label inside every validation fold
train.groupby('kfold')['label'].value_counts()

In [None]:
# Save the fold assignment together with label and cluster information.
# `index=` is left at its default, so the DataFrame index is written as well.
train[['image_id','label','label_names','cluster','kfold']].to_csv('cassava_folds.csv')

# Conclusion

Here I have analyzed and created a StratifiedGroup 5-fold split based on clustering of the images. In my opinion, the overall distribution and the folds look better with this approach. It would be great to hear the community's thoughts on this.