In [3]:
import os

In [22]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

%matplotlib inline

# Original Dataset

In [1]:
original_dataset_path = "D:\\Machine Learning Datasets\\Tomato Plant Disease Classification Dataset - Kanchana"

In [4]:
os.listdir(original_dataset_path)

['Tomato_Bacterial_Spot',
 'Tomato_Early_Blight',
 'Tomato_Healthy',
 'Tomato_Late_Blight',
 'Tomato_Leaf_Mold',
 'Tomato_Spider_Mite_Damage',
 'Tomato_Target_Spot',
 'Tomato_Yellow_Leaf_Curl_Virus']

In [11]:
for fldr in os.listdir(original_dataset_path):
    filenames = os.listdir(os.path.join(original_dataset_path,fldr))
    print("%s : %d"%(fldr,len(filenames)))

Tomato_Bacterial_Spot : 2128
Tomato_Early_Blight : 999
Tomato_Healthy : 1000
Tomato_Late_Blight : 917
Tomato_Leaf_Mold : 951
Tomato_Spider_Mite_Damage : 1568
Tomato_Target_Spot : 1404
Tomato_Yellow_Leaf_Curl_Virus : 2739


# Simplified Dataset

In [16]:
def mkdir(dirname):
    if not os.path.exists(dirname):
        os.makedirs(dirname)

In [12]:
new_dataset_path = "D:\\Machine Learning Datasets\\Tomato Plant Disease Classification Dataset - Simplified"
mkdir(new_dataset_path)

## Resizing Images

In [30]:
def resize_and_save(fin, fout):
    img = Image.open(fin, 'r')
    out = img.resize((224, 224))
    out.save(fout)

In [28]:
resized_path = os.path.join(new_dataset_path,'resized')
mkdir(resized_path)

In [31]:
for class_label in os.listdir(original_dataset_path):
    print(class_label)
    mkdir(os.path.join(resized_path,class_label))

    filenames = os.listdir(os.path.join(original_dataset_path,class_label))
    
    for fname1 in filenames:
        f1=os.path.join(original_dataset_path,class_label,fname1)
        f2=os.path.join(resized_path,class_label,fname1)
        resize_and_save(f1,f2)
    

Tomato_Bacterial_Spot
Tomato_Early_Blight
Tomato_Healthy
Tomato_Late_Blight
Tomato_Leaf_Mold


UnidentifiedImageError: cannot identify image file 'D:\\Machine Learning Datasets\\Tomato Plant Disease Classification Dataset - Kanchana\\Tomato_Leaf_Mold\\Crnl_L.Mold 8643.JPG'

In [33]:
os.listdir(original_dataset_path)[4:]

['Tomato_Leaf_Mold',
 'Tomato_Spider_Mite_Damage',
 'Tomato_Target_Spot',
 'Tomato_Yellow_Leaf_Curl_Virus']

In [35]:
for class_label in os.listdir(original_dataset_path)[4:]:
    print(class_label)
    mkdir(os.path.join(resized_path,class_label))

    filenames = os.listdir(os.path.join(original_dataset_path,class_label))
    
    for fname1 in filenames:
        f1=os.path.join(original_dataset_path,class_label,fname1)
        f2=os.path.join(resized_path,class_label,fname1)
        try:
            resize_and_save(f1,f2)
        except UnidentifiedImageError as e:
            print('UnidentifiedImageError',e)

Tomato_Leaf_Mold
Tomato_Spider_Mite_Damage
Tomato_Target_Spot
Tomato_Yellow_Leaf_Curl_Virus


## Train-Validation-Test Split [90%-5%-5%]

In [37]:
import random

In [58]:
from shutil import copyfile

In [17]:
trainset_path = os.path.join(new_dataset_path,'train')
valset_path = os.path.join(new_dataset_path,'val')
testset_path = os.path.join(new_dataset_path,'test')

mkdir(trainset_path)
mkdir(valset_path)
mkdir(testset_path)

In [59]:
for class_label in os.listdir(resized_path):
    print(class_label)
    
    mkdir(os.path.join(trainset_path,class_label))
    mkdir(os.path.join(valset_path,class_label))
    mkdir(os.path.join(testset_path,class_label))

    filenames = os.listdir(os.path.join(resized_path,class_label))
    random.shuffle(filenames)
    
    N=len(filenames)
    idx1 = int(N*0.9)
    idx2 = int(N*0.95)
    
    train_filenames=filenames[:idx1]
    val_filenames=filenames[idx1:idx2]
    test_filenames=filenames[idx2:]
    
    print("Total: %d,  Train: %d,  Validation: %d,  Test: %d \n"%(N,len(train_filenames),len(val_filenames),len(test_filenames)))
    
    #Train set
    for fname1 in train_filenames:
        f1=os.path.join(resized_path,class_label,fname1)
        f2=os.path.join(trainset_path,class_label,fname1)
        copyfile(f1,f2)
    
    #Validation set
    for fname1 in val_filenames:
        f1=os.path.join(resized_path,class_label,fname1)
        f2=os.path.join(valset_path,class_label,fname1)
        copyfile(f1,f2)
        
    #Test set
    for fname1 in test_filenames:
        f1=os.path.join(resized_path,class_label,fname1)
        f2=os.path.join(testset_path,class_label,fname1)
        copyfile(f1,f2)

Tomato_Bacterial_Spot
Total: 2128,  Train: 1915,  Validation: 106,  Test: 107 

Tomato_Early_Blight
Total: 999,  Train: 899,  Validation: 50,  Test: 50 

Tomato_Healthy
Total: 1000,  Train: 900,  Validation: 50,  Test: 50 

Tomato_Late_Blight
Total: 917,  Train: 825,  Validation: 46,  Test: 46 

Tomato_Leaf_Mold
Total: 821,  Train: 738,  Validation: 41,  Test: 42 

Tomato_Spider_Mite_Damage
Total: 1568,  Train: 1411,  Validation: 78,  Test: 79 

Tomato_Target_Spot
Total: 1404,  Train: 1263,  Validation: 70,  Test: 71 

Tomato_Yellow_Leaf_Curl_Virus
Total: 2739,  Train: 2465,  Validation: 137,  Test: 137 

