In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.model_selection import StratifiedKFold

import cassava_utils as utils

# Exploratory Data Analysis (EDA)

In [None]:
# Root folder of the competition data; every other path is derived from it so
# the location only has to be changed in one place.
folderpath = "../input/cassava-leaf-disease-classification"
# JSON file shipped with the dataset (label number -> disease name, per its file name).
label2name_json = os.path.join(folderpath, "label_num_to_disease_map.json")

In [None]:
# List the image folders once and report how many entries each one holds.
train_dir = os.path.join(folderpath, "train_images")
test_dir = os.path.join(folderpath, "test_images")
train_files = os.listdir(train_dir)
test_files = os.listdir(test_dir)
print("images in training set:", len(train_files))
print("images in test set:", len(test_files))

There is only one sample in the test set. More test images will be added during the submission process.

In [None]:
# Verify that every file name in both folders carries the ".jpg" extension.
train_all_jpg = all(name.endswith('.jpg') for name in train_files)
test_all_jpg = all(name.endswith('.jpg') for name in test_files)
print("all training files end with '.jpg':", train_all_jpg)
print("all test files end with '.jpg':", test_all_jpg)

In [None]:
# A file name that occurs twice would shrink the set below the list length.
n_unique_train_files = len(set(train_files))
print("no duplicates in train_files:", n_unique_train_files == len(train_files))

In [None]:
# Load the training metadata table (image_id and label columns are used below).
train_csv_path = os.path.join(folderpath, "train.csv")
df = pd.read_csv(train_csv_path)

In [None]:
# Compare the files on disk with the image ids listed in train.csv, then show
# which distinct label values occur.
files_on_disk = sorted(train_files)
files_in_csv = sorted(df["image_id"])
print("All files are in df:", files_on_disk == files_in_csv)
print(df["label"].unique())

All files are JPGs, unique, and labeled with 0 to 4.

In [None]:
# Relative frequency of each label in the training data.
class_dist = df["label"].value_counts(normalize=True)
print(class_dist)

In [None]:
# Bar chart of the number of training samples per class; every bar is
# annotated with its share of the whole training set.
class_order = [0, 1, 2, 3, 4]
class_names = ["CBB", "CBSD", "CGM", "CMD", "healthy"]
text_offset = 800  # samples to move the annotation down from the bar top

base_color = sns.color_palette()[0]
# Passing `order` pins bar position i to class_order[i], so the annotation
# loop below no longer relies on tick locations happening to equal label values.
ax = sns.countplot(data=df, x="label", order=class_order, color=base_color)
ax.set_ylabel("samples")
ax.set_xlabel("")
ax.set_xticks(range(len(class_order)))
ax.set_xticklabels(class_names)

# get counts
n_points = df.shape[0]
counts = df.label.value_counts()
# annotate each bar: x is the bar position, the count is looked up by class label
for bar_pos, class_label in enumerate(class_order):
    count = counts.get(class_label, 0)  # a class absent from df counts as 0
    percentage = "{:0.1f}%".format(100*count/n_points)
    ax.text(bar_pos, count - text_offset, percentage, ha="center", color="w")
    

* label 3 (disease CMD) is overrepresented (61%)
* only 12% of the files show healthy plants (label 4)

In [None]:
# NOTE(review): this whole cell is commented out. When enabled it opens every
# file in train_images with PIL and prints the set of distinct `img.size`
# values — presumably disabled because of its runtime; TODO confirm.
# # check image sizes
# image_sizes = []

# for img_id in train_files:
#     img = Image.open(os.path.join(folderpath, "train_images", img_id))
#     image_sizes.append(img.size)
    
# print("Image sizes of train_files:", set(image_sizes))

All image files are of pixel size 800x600 (width x height).

### Display Images

In [None]:
# split image_ids according to their label
files_label_0 = df.loc[df.label==0].image_id.values
files_label_1 = df.loc[df.label==1].image_id.values
files_label_2 = df.loc[df.label==2].image_id.values
files_label_3 = df.loc[df.label==3].image_id.values
files_label_4 = df.loc[df.label==4].image_id.values

In [None]:
# Label 0: hand its image ids to the project display helper.
train_images_dir = os.path.join(folderpath, "train_images")
utils.display_images(
    files_label_0,
    df,
    train_images_dir,
    label2name_json,
)

In [None]:
# Label 1: hand its image ids to the project display helper.
train_images_dir = os.path.join(folderpath, "train_images")
utils.display_images(
    files_label_1,
    df,
    train_images_dir,
    label2name_json,
)

In [None]:
# Label 2: hand its image ids to the project display helper.
train_images_dir = os.path.join(folderpath, "train_images")
utils.display_images(
    files_label_2,
    df,
    train_images_dir,
    label2name_json,
)

In [None]:
# Label 3: hand its image ids to the project display helper.
train_images_dir = os.path.join(folderpath, "train_images")
utils.display_images(
    files_label_3,
    df,
    train_images_dir,
    label2name_json,
)

In [None]:
# Label 4: hand its image ids to the project display helper.
train_images_dir = os.path.join(folderpath, "train_images")
utils.display_images(
    files_label_4,
    df,
    train_images_dir,
    label2name_json,
)

# Data Preprocessing

## Train/Val Split

In [None]:
### Version 1: using just one train/val split with sklearn.train_test_split
# NOTE(review): disabled alternative to the k-fold split. Re-enabling it needs
# `train_test_split` imported from sklearn.model_selection and a defined
# `seed`; neither is visible in this notebook.

# train_df, val_df = train_test_split(df, test_size=0.2, 
#                                     random_state=seed)
# #save dataframes as csv
# train_df.to_csv("train_df.csv", index=False)
# val_df.to_csv("val_df.csv", index=False)

In [None]:
### Version 2: using k-Fold cross validation with stratified k-folds
# from "Approaching (Almost) Any Machine Learning Problem" by Abhishek Thakur

# `seed` was used below without being defined anywhere in the visible notebook,
# which raises NameError on a fresh kernel. Define it here so the shuffle is
# reproducible.
# NOTE(review): 42 is an arbitrary choice — replace it with the project-wide
# seed if one exists elsewhere.
seed = 42
n_folds = 5

# new column that will hold the validation-fold index of each row; -1 = unassigned
df["kfold"] = -1
# shuffle the rows reproducibly and renumber the index 0..n-1 so that the
# positional indices returned by kf.split() can be used as .loc labels
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
# stratification targets
y = df.label.values
# the splitter is not asked to shuffle again; the rows were shuffled above
kf = StratifiedKFold(n_splits=n_folds)
# each row gets the index of the fold in which it is part of the validation split
for fold, (_, val_idx) in enumerate(kf.split(X=df, y=y)):
    df.loc[val_idx, "kfold"] = fold
# every row must have been assigned to exactly one validation fold
assert (df["kfold"] >= 0).all(), "some rows were not assigned to a fold"
# save the new csv with kfold column
df.to_csv("train_folds.csv", index=False)