# Image Classification with SciKit-Learn

The function below iterates through the plant image subfolders (one subfolder per class) and prepares a training dataset of flattened image features, along with a labels array and a list of class names.

In [1]:
def prep_data(folder):
    """Load the images under ``folder`` into flat feature vectors with numeric labels.

    Each immediate subdirectory of ``folder`` is treated as one class. The
    subdirectories are visited in sorted order and numbered 0, 1, 2, ...; the
    files inside each one are read in sorted order so the row order of the
    result is reproducible between runs.

    Args:
        folder: Path of a directory whose immediate subdirectories each hold
            the image files of one class. A missing or empty directory yields
            empty results rather than an error.

    Returns:
        A tuple ``(features, labels, classnames)``:
          * features -- numpy array built from the list of flattened
            (``ravel``-ed) images, one entry per image file read.
          * labels -- 1-D float64 numpy array holding the class id of each
            entry in ``features`` (float64 is kept for compatibility with
            existing callers).
          * classnames -- list of class subdirectory names; the position of a
            name in this list is its class id.
    """
    import os
    import numpy as np
    import matplotlib.pyplot as plt

    features = []
    labels = []
    classnames = []

    # Only the first level of the tree defines classes. Walking deeper would
    # register nested directories as extra classes and shift later class ids.
    # The default keeps the "nothing to read" behaviour for a missing folder.
    root, dirs, _ = next(os.walk(folder), (folder, [], []))

    for class_id, d in enumerate(sorted(dirs)):
        print("Reading data from", d)
        # use the folder name as the class name for this label
        classnames.append(d)
        class_dir = os.path.join(root, d)
        for f in sorted(os.listdir(class_dir)):
            imgFile = os.path.join(class_dir, f)
            # Skip nested directories: they are not images and cannot be read
            if not os.path.isfile(imgFile):
                continue
            img = plt.imread(imgFile)
            # The image array is a multidimensional numpy array
            # - flatten it to a single array of pixel values for scikit-learn
            features.append(img.ravel())
            # Collect labels in a list; np.append would copy the whole array
            # for every image
            labels.append(class_id)

    # Convert the collected lists into numpy arrays in one step
    features = np.array(features)
    labels = np.array(labels, dtype=float)

    return features, labels, classnames

In [4]:
# Root folder of the training images; prep_data treats each subfolder as one class
training_folder_name = "../utilities/training_data/raw"

In [5]:
# Load the images into a feature array, a label vector, and the list of class names
features, labels, classnames = prep_data(training_folder_name)

# Report how many samples, labels, and classes were read
print(features.shape[0], 'features')
print(labels.shape[0], 'labels')
print(len(classnames), 'classes:', classnames)

Reading data from Campanula_persicifolia_L
Reading data from Cichorium_intybus_L
Reading data from Lepidium_draba_L
Reading data from Leucanthemum_vulgare_(Vaill)_Lam
Reading data from Malva_sylvestris_L
14799 features
14799 labels
5 classes: ['Campanula_persicifolia_L', 'Cichorium_intybus_L', 'Lepidium_draba_L', 'Leucanthemum_vulgare_(Vaill)_Lam', 'Malva_sylvestris_L']


In [6]:
# Check the array dimensions: features should have one row per image and
# labels one entry per image, so the leading dimensions must match
print('Feature Shape:',features.shape)
print('Labels Shape:',labels.shape)

Feature Shape: (14799, 22500)
Labels Shape: (14799,)


Now that the image data is prepared, we can split it into training (70%) and validation (30%) subsets:

In [7]:
# Split into training (70%) and validation (30%) subsets
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so a Restart & Run All reproduces the same split,
# and therefore the same samples in X_train / X_val, on every run
SPLIT_SEED = 42

X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.30, random_state=SPLIT_SEED
)

print('Training records:',y_train.size)
print('Validation records:',y_val.size)

Training records: 10359
Validation records: 4440


In [10]:
# Peek at the first training sample: one image flattened into a vector of pixel values
X_train[0]

array([157, 155, 156, ...,  69,  67,  84], dtype=uint8)