In [1]:
import os
from pathlib import Path
from skimage import io, img_as_ubyte
from skimage.transform import resize, rotate
from skimage.exposure import adjust_gamma
import pandas as pd
from sklearn.model_selection import train_test_split

LOAD DATA

In [2]:
input_folder_path = 'data-original'

Create DATAFRAME

In [3]:
images_list = []
labels_list = []
    
quality_list = os.listdir(input_folder_path)
    
for quality in quality_list:
    fruit_type = os.listdir(os.path.join(input_folder_path, quality))
    for fruit in fruit_type:
        #if 'Banana' in fruit:
        fruit_path = os.path.join(input_folder_path, quality, fruit)
        images = os.listdir(fruit_path)
        for image in images:
            image_path = os.path.join(fruit_path, image)
            print(image_path)
            i = io.imread(image_path)
            if len(i.shape) == 2 or (len(i.shape) == 3 and i.shape[2] == 1):
                print(f"Grayscale {image_path}")
                continue  
            images_list.append(image_path)
            # Check if the image name contains '_Good' or '_Bad' and assign the label accordingly
            if "_Good" in fruit or "_Bad" in fruit:
                labels_list.append(f"{fruit}")
            else:
                labels_list.append(f"{fruit}_Mixed")
                    
images_series = pd.Series(images_list, name="images_paths")
labels_series = pd.Series(labels_list, name="labels")
df = pd.concat([images_series, labels_series], axis=1)


data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175856.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175907.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175908.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175909.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175910.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175912.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175913.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175917.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175919.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175922.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175926.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175927.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175936.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175940.jpg
data-original\Bad Quality_Fruits\Apple_Bad\IMG20200728175946.jpg
data-original\Bad Quality

In [4]:
df

Unnamed: 0,images_paths,labels
0,data-original\Bad Quality_Fruits\Apple_Bad\IMG...,Apple_Bad
1,data-original\Bad Quality_Fruits\Apple_Bad\IMG...,Apple_Bad
2,data-original\Bad Quality_Fruits\Apple_Bad\IMG...,Apple_Bad
3,data-original\Bad Quality_Fruits\Apple_Bad\IMG...,Apple_Bad
4,data-original\Bad Quality_Fruits\Apple_Bad\IMG...,Apple_Bad
...,...,...
18318,data-original\Mixed Qualit_Fruits\Pomegranate\...,Pomegranate_Mixed
18319,data-original\Mixed Qualit_Fruits\Pomegranate\...,Pomegranate_Mixed
18320,data-original\Mixed Qualit_Fruits\Pomegranate\...,Pomegranate_Mixed
18321,data-original\Mixed Qualit_Fruits\Pomegranate\...,Pomegranate_Mixed


Print classes 

In [5]:
groups = df.groupby('labels')
    
train_df = pd.DataFrame()
test_df = pd.DataFrame()

# For each group, split its data into train and test, then append to train_df and test_df
for label, group in groups:
    train, test = train_test_split(group, test_size=0.2, random_state=42)
    train_df = pd.concat([train_df, train])
    test_df = pd.concat([test_df, test])

# Shuffle the datasets
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

# Checking the balance in the datasets
print("Training set class distribution:")
print(train_df['labels'].value_counts())

print("\nTesting set class distribution:")
print(test_df['labels'].value_counts())

Training set class distribution:
labels
Pomegranate_Good     3801
Orange_Good           972
Pomegranate_Bad       949
Orange_Bad            927
Guava_Good            921
Apple_Bad             912
Apple_Good            907
Guava_Bad             903
Banana_Good           890
Lime_Good             875
Banana_Bad            869
Lime_Bad              868
Banana_Mixed          228
Lemon_Mixed           222
Guava_Mixed           118
Orange_Mixed          100
Pomegranate_Mixed     100
Apple_Mixed            90
Name: count, dtype: int64

Testing set class distribution:
labels
Pomegranate_Good     951
Orange_Good          244
Pomegranate_Bad      238
Orange_Bad           232
Guava_Good           231
Apple_Bad            229
Apple_Good           227
Guava_Bad            226
Banana_Good          223
Lime_Good            219
Banana_Bad           218
Lime_Bad             217
Banana_Mixed          57
Lemon_Mixed           56
Guava_Mixed           30
Orange_Mixed          25
Pomegranate_Mixed     25
A

TRIM

In [6]:
def trim (df, max_size, min_size, column):
    df=df.copy()
    original_class_count= len(list(df[column].unique()))
    print ('Original Number of classes in dataframe: ', original_class_count)
    sample_list=[] 
    groups=df.groupby(column)
    for label in df[column].unique():        
        group=groups.get_group(label)
        sample_count=len(group)         
        if sample_count> max_size :
            strat=group[column]
            samples,_=train_test_split(group, train_size=max_size, shuffle=True, random_state=123, stratify=strat)            
            sample_list.append(samples)
        elif sample_count>= min_size:
            sample_list.append(group)
    df=pd.concat(sample_list, axis=0).reset_index(drop=True)
    final_class_count= len(list(df[column].unique())) 
    if final_class_count != original_class_count:
        print ('*** WARNING***  dataframe has a reduced number of classes' )
    balance=list(df[column].value_counts())
    print (balance)
    return df   

Trim the train dataset to 400 max samples

In [7]:
max_samples = 400
min_samples = 0
column = 'labels'
train_df = trim(train_df, max_samples, min_samples, column)
print(train_df)

Original Number of classes in dataframe:  18
[400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 228, 222, 118, 100, 100, 90]
                                           images_paths       labels
0     data-original\Good Quality_Fruits\Apple_Good\2...   Apple_Good
1     data-original\Good Quality_Fruits\Apple_Good\2...   Apple_Good
2     data-original\Good Quality_Fruits\Apple_Good\2...   Apple_Good
3     data-original\Good Quality_Fruits\Apple_Good\I...   Apple_Good
4     data-original\Good Quality_Fruits\Apple_Good\I...   Apple_Good
...                                                 ...          ...
5653  data-original\Mixed Qualit_Fruits\Apple\IMG202...  Apple_Mixed
5654  data-original\Mixed Qualit_Fruits\Apple\IMG202...  Apple_Mixed
5655  data-original\Mixed Qualit_Fruits\Apple\IMG202...  Apple_Mixed
5656  data-original\Mixed Qualit_Fruits\Apple\IMG202...  Apple_Mixed
5657  data-original\Mixed Qualit_Fruits\Apple\IMG202...  Apple_Mixed

[5658 rows x 2 columns]
[400, 400, 4

AUGMENT

In [8]:
def augment_image(image, output_path, image_name, transformations, file_extension='.jpg'):
    augmented_images = []
    # Apply each transformation and save the result
    for trans_name, trans_func in transformations.items():
        transformed_image = trans_func(image)
        transformed_image = img_as_ubyte(transformed_image)  # Convert to uint8
        augmented_image_name = f'{trans_name}_{image_name}{file_extension}'
        augmented_image_path = output_path / augmented_image_name
        io.imsave(augmented_image_path, transformed_image)
        print(f'Augmented image saved to: {augmented_image_path}')
        augmented_images.append(augmented_image_path)
    return augmented_images

def augment(df, output_path, resize_shape):
    # Define transformations
    transformations = {
        'rotated_90': lambda x: rotate(x, 90),
        'rotated_180': lambda x: rotate(x, 180),
        'rotated_270': lambda x: rotate(x, 270),
        'gamma_0.8': lambda x: adjust_gamma(x, 0.8),
        'gamma_1.2': lambda x: adjust_gamma(x, 1.2)
    }

    # Calculate max count for each label and group by label
    max_count = 500
    grouped = df.groupby('labels')

    new_rows = []  # To store new augmented image paths and labels

    for label in df['labels'].unique():
        # Determine how many images to augment for this label
        group = grouped.get_group(label)
        
        augment_count = max_count - len(group)
        print(f"{label} length: {len(group)}")
        augmentation_per_image = len(transformations)

        while augment_count > 0:
            for _, row in group.iterrows():
                if augment_count <= 0:
                        break

                image_path = row['images_paths']
                image_name = label + "_" +Path(image_path).stem

                # Load and preprocess image
                image = io.imread(image_path)
                if resize_shape is not None:
                    image = resize(image, resize_shape, anti_aliasing=True)
                    print(f"Images Resized {resize_shape}")
                image = img_as_ubyte(image)  # Convert to uint8

                # Perform augmentation and get augmented image paths
                augmented_image_paths = augment_image(image, Path(output_path), image_name, transformations)
                
                # Update DataFrame with new rows
                for aug_path in augmented_image_paths:
                    augment_count -= 1
                    if augment_count <= 0:
                        new_rows.append({'images_paths': str(aug_path), 'labels': label})
                        break
                    new_rows.append({'images_paths': str(aug_path), 'labels': label})
                    

    # Append new rows to the original DataFrame
    new_df = pd.DataFrame(new_rows)
    df = pd.concat([df, new_df], ignore_index=True)
    return df

Add 100 augmented images per class

In [10]:
train_df = augment(train_df, 'output_augment', resize_shape=(256, 129))
balance=list(train_df['labels'].value_counts())
print (balance)

Apple_Good length: 400
Images Resized (256, 129)
Augmented image saved to: output_augment\rotated_90_Apple_Good_20190809_122336.jpg
Augmented image saved to: output_augment\rotated_180_Apple_Good_20190809_122336.jpg
Augmented image saved to: output_augment\rotated_270_Apple_Good_20190809_122336.jpg
Augmented image saved to: output_augment\gamma_0.8_Apple_Good_20190809_122336.jpg
Augmented image saved to: output_augment\gamma_1.2_Apple_Good_20190809_122336.jpg
Images Resized (256, 129)
Augmented image saved to: output_augment\rotated_90_Apple_Good_20190809_153604.jpg
Augmented image saved to: output_augment\rotated_180_Apple_Good_20190809_153604.jpg
Augmented image saved to: output_augment\rotated_270_Apple_Good_20190809_153604.jpg
Augmented image saved to: output_augment\gamma_0.8_Apple_Good_20190809_153604.jpg
Augmented image saved to: output_augment\gamma_1.2_Apple_Good_20190809_153604.jpg
Images Resized (256, 129)
Augmented image saved to: output_augment\rotated_90_Apple_Good_201908

Features

In [3]:
from skimage.feature import local_binary_pattern, graycomatrix, graycoprops
from skimage.color import rgb2gray
import numpy as np
import cv2
from sklearn.preprocessing import StandardScaler
from skimage import io, img_as_ubyte
from skimage.feature import hog
from skimage.filters import gabor
from skimage.measure import regionprops, label as label_image
from cv2 import findContours, contourArea, moments
from skimage.color import rgb2hsv
from skimage.measure import find_contours

''' 
def extract_color_histogram_rgb(image, bins=256):
    # Convert image to RGB
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Calculate histograms for each channel
    hist_r = cv2.calcHist([rgb_image], [0], None, [bins], [0, 256]).ravel()
    hist_g = cv2.calcHist([rgb_image], [1], None, [bins], [0, 256]).ravel()
    hist_b = cv2.calcHist([rgb_image], [2], None, [bins], [0, 256]).ravel()
    # Normalize histograms
    hist_r /= hist_r.sum()
    hist_g /= hist_g.sum()
    hist_b /= hist_b.sum()
    return np.concatenate([hist_r, hist_g, hist_b])
def extract_haralick_features(image):
    gray_image = rgb2gray(image)
    glcm = graycomatrix(img_as_ubyte(gray_image), distances=[5], angles=[0], levels=256, symmetric=True, normed=True)
    haralick_features = graycoprops(glcm, 'contrast').ravel()
    # Other properties like 'dissimilarity', 'homogeneity', 'energy', and 'correlation' can also be used
    return haralick_features
def extract_advanced_shape_features(image):
    gray_image = rgb2gray(image)
    contours, _ = findContours(img_as_ubyte(gray_image), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    shape_features = []
    for cnt in contours:
        area = contourArea(cnt)
        M = moments(cnt)

        # Check if the area (M["m00"]) is not zero to avoid division by zero
        if M["m00"] != 0:
            centroid_x = int(M["m10"] / M["m00"])
            centroid_y = int(M["m01"] / M["m00"])
            shape_features.extend([area, centroid_x, centroid_y])
        else:
            # Handle the case when M["m00"] is zero, e.g., by skipping this contour or setting default values
            # For example, skip the contour or use default values
            continue  # Skip this contour

    return np.array(shape_features)

'''


# Function to extract color histograms from the HSV color space
def extract_color_histogram_hsv(image, bins=256):
    hsv_image = rgb2hsv(image)
    hist_hue = cv2.calcHist([img_as_ubyte(hsv_image)], [0], None, [bins], [0, 256]).ravel()
    hist_sat = cv2.calcHist([img_as_ubyte(hsv_image)], [1], None, [bins], [0, 256]).ravel()
    # Normalize histograms
    hist_hue /= hist_hue.sum()
    hist_sat /= hist_sat.sum()
    return np.concatenate([hist_hue, hist_sat])

# Function to extract texture features using Local Binary Patterns
def extract_lbp_features(image, P=8, R=1):
    gray_image = rgb2gray(image)
    gray_image = img_as_ubyte(gray_image)
    lbp = local_binary_pattern(gray_image, P, R, method='uniform')
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, P * R + 3), range=(0, P * R + 2))
    lbp_hist = lbp_hist.astype('float') / (lbp_hist.sum() + 1e-6)
    return lbp_hist

# Function to extract shape features using contours
def extract_shape_features(image):
    # Convert to grayscale and find contours
    gray_image = rgb2gray(image)
    contours = find_contours(gray_image, level=0.8)

    # Example feature: number of contours
    num_contours = len(contours)

    # Other possible features: area of the largest contour, total area of all contours, etc.
    # This is just a placeholder for whatever shape features you decide are relevant
    shape_features = [num_contours]  # Replace with actual feature calculations

    return np.array(shape_features)

def extract_all_features(image):
    color_hist_hsv = extract_color_histogram_hsv(image)
    #color_hist_rgb = extract_color_histogram_rgb(image)
    lbp_features = extract_lbp_features(image)
    #haralick_features = extract_haralick_features(image)
    shape_features = extract_shape_features(image)

    # Combine features into a single feature vector
    #features = np.concatenate([color_hist_hsv, color_hist_rgb, lbp_features, haralick_features, shape_features])
    features = np.concatenate([color_hist_hsv, lbp_features, shape_features])
    return features


Remove background

In [4]:
import cv2
import numpy as np
from sklearn.cluster import KMeans

def get_optimized_rectangle(width, height):
    # Define margins as a percentage of the image's dimensions
    margin_x = int(width * 0.1)  # 10% of width
    margin_y = int(height * 0.1)  # 10% of height

    # Calculate rectangle coordinates
    x = margin_x
    y = margin_y
    rect_width = width - 2 * margin_x
    rect_height = height - 2 * margin_y

    return (x, y, rect_width, rect_height)

def remove_background(image):
    # Read the image
    height, width = image.shape[:2]

    # Define the initial mask
    mask = np.zeros(image.shape[:2], np.uint8)

    # Define the rectangle for the initial guess of the foreground
    rect = get_optimized_rectangle(width, height)

    # Allocate space for two arrays used by the GrabCut algorithm
    bgdModel = np.zeros((1,65), np.float64)
    fgdModel = np.zeros((1,65), np.float64)

    # Apply the GrabCut algorithm
    cv2.grabCut(image, mask, rect, bgdModel, fgdModel, 5, cv2.GC_INIT_WITH_RECT)

    # Modify mask to get the final result
    mask = np.where((mask==2)|(mask==0), 0, 1).astype('uint8')
    result = image * mask[:, :, np.newaxis]

    return result

Extract features for the training set 

In [14]:
feature_save_path = "features_train"
resize_shape=(256, 192)
for index, row in train_df.iterrows():
    # Load image
    image_path = row['images_paths']
    image = io.imread(image_path)
    if len(image.shape) == 2 or (len(image.shape) == 3 and image.shape[2] == 1):
        continue  
    image = remove_background(image)
    image = resize(image, resize_shape, anti_aliasing=True)
    image = img_as_ubyte(image)
  
    
    # Extract features
    features = extract_all_features(image)

    # Extract label
    label = row['labels']
    

    # Save features and label to a file
    data = {
        'features': features,
        'label': label
    }
    feature_file_path = Path(feature_save_path) / f'{Path(image_path).stem}_data.npy'
    np.save(feature_file_path, data)
    print(f'Data saved to: {feature_file_path}')

Data saved to: features_train\20190809_122336_data.npy
Data saved to: features_train\20190809_153604_data.npy
Data saved to: features_train\20190809_115723_data.npy
Data saved to: features_train\IMG20200728181009_data.npy
Data saved to: features_train\IMG_9482_data.npy
Data saved to: features_train\20190809_153900_data.npy
Data saved to: features_train\IMG_9424_data.npy
Data saved to: features_train\IMG_9563_data.npy
Data saved to: features_train\IMG20200728181133_data.npy
Data saved to: features_train\20190809_165305_data.npy
Data saved to: features_train\20190809_122137_data.npy
Data saved to: features_train\20190809_164858_data.npy
Data saved to: features_train\IMG_9576_data.npy
Data saved to: features_train\20190809_165048_data.npy
Data saved to: features_train\20190809_121501_data.npy
Data saved to: features_train\20190809_164646_data.npy
Data saved to: features_train\IMG_9582_data.npy
Data saved to: features_train\20190809_153558_data.npy
Data saved to: features_train\IMG_9585_da

Extract features for the test set

In [15]:
feature_save_path = "features_test"
for index, row in test_df.iterrows():
    # Load image
    image_path = row['images_paths']
    image = io.imread(image_path)
    if len(image.shape) == 2 or (len(image.shape) == 3 and image.shape[2] == 1):
        continue    
    image = remove_background(image)
    image = resize(image, resize_shape, anti_aliasing=True)
    image = img_as_ubyte(image)
    

    
    # Extract features
    features = extract_all_features(image)

    # Extract label
    label = row['labels']

    # Save features and label to a file
    data = {
        'features': features,
        'label': label
    }
    feature_file_path = Path(feature_save_path) / f'{Path(image_path).stem}_data.npy'
    np.save(feature_file_path, data)
    print(f'Data saved to: {feature_file_path}')

Data saved to: features_test\IMG_20190824_181401_1_data.npy
Data saved to: features_test\IMG_20190902_104158_data.npy
Data saved to: features_test\IMG_8080_data.npy
Data saved to: features_test\IMG_8064_data.npy
Data saved to: features_test\20190820_153150_25529_data.npy
Data saved to: features_test\IMG20200728131516_data.npy
Data saved to: features_test\IMG_8959_data.npy
Data saved to: features_test\IMG_20190923_182210_data.npy
Data saved to: features_test\20190820_145758_23806_data.npy
Data saved to: features_test\20190820_153018_data.npy
Data saved to: features_test\IMG_20190824_180729_data.npy
Data saved to: features_test\20190820_154156_26358_data.npy
Data saved to: features_test\IMG_8150_data.npy
Data saved to: features_test\20190820_144922_23521_data.npy
Data saved to: features_test\IMG_20190902_111055_data.npy
Data saved to: features_test\IMG_6989_data.npy
Data saved to: features_test\IMG_20190902_102409_data.npy
Data saved to: features_test\IMG_20190902_111415_data.npy
Data sa

LINEAR SVM

In [1]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
import os
from pathlib import Path
from sklearn import svm
from sklearn.model_selection import train_test_split
from joblib import dump
from sklearn.model_selection import GridSearchCV
def load_data(directory):
    features = []
    labels = []
    for file in os.listdir(directory):
        if file.endswith('.npy'):
            file_path = Path(directory) / file
            data = np.load(file_path, allow_pickle=True).item()
            features.append(data['features'])
            labels.append(data['label'])
    return np.array(features), np.array(labels)

# Load training and testing data
train_data_dir = 'features_train'
test_data_dir = 'features_test_all'
X_train, y_train = load_data(train_data_dir)
X_test, y_test = load_data(test_data_dir)

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Define parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10, 100],  # Example range, adjust based on your needs
}

# Create a GridSearchCV object
grid_search = GridSearchCV(svm.SVC(kernel='linear', class_weight='balanced'), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best estimator
best_clf = grid_search.best_estimator_

# Evaluate the model using additional metrics
from sklearn.metrics import classification_report

y_pred = best_clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the best model and the scaler
dump(best_clf, 'svm_model_best_2.pkl')
dump(scaler, 'scaler_best_2.pkl')
dump(label_encoder, 'label_encoder_best_2.pkl')
print("Best model, scaler, and label encoder saved.")


              precision    recall  f1-score   support

           0       0.77      0.82      0.80       227
           1       0.53      0.48      0.51       226
           2       0.69      0.96      0.80        23
           3       0.89      0.88      0.88       218
           4       0.95      0.86      0.90       214
           5       0.60      0.95      0.73        57
           6       0.83      0.82      0.83       226
           7       0.80      0.72      0.76       220
           8       0.39      0.60      0.47        30
           9       0.65      0.86      0.74        56
          10       0.78      0.79      0.79       217
          11       0.93      0.90      0.92       219
          12       0.87      0.81      0.84       232
          13       0.80      0.80      0.80       244
          14       0.51      0.84      0.64        25
          15       0.77      0.86      0.81       238
          16       0.99      0.91      0.94       934
          17       0.35    

NON-LINEAR SVM

In [11]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
import os
from pathlib import Path
from sklearn import svm
from sklearn.model_selection import train_test_split
from joblib import dump
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

def load_data(directory):
    features = []
    labels = []
    for file in os.listdir(directory):
        if file.endswith('.npy'):
            file_path = Path(directory) / file
            data = np.load(file_path, allow_pickle=True).item()
            features.append(data['features'])
            labels.append(data['label'])
    return np.array(features), np.array(labels)

# Load training and testing data
train_data_dir = 'features_train'
test_data_dir = 'features_test'
X_train, y_train = load_data(train_data_dir)
X_test, y_test = load_data(test_data_dir)

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Specify SVM parameters and range for grid search
param_grid = {
    'C': [0.1, 0.5, 0.8, 1],
    'gamma': ['scale', 'auto', 0.1, 0.01, 0.001, 0.0001],
    'class_weight': [None, 'balanced']
}

# Create a GridSearchCV object
grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Get the best estimator
best_clf = grid_search.best_estimator_

# Evaluate the model
y_pred = best_clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Save the model and the scaler
dump(best_clf, 'svm_model_rbf_best_2.pkl')
dump(scaler, 'scaler_rbf_2.pkl')
dump(label_encoder, 'label_encoder_rbf_2.pkl')
print("Best model and scaler saved.")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ..............C=0.1, class_weight=None, gamma=scale; total time=  30.3s
[CV] END ..............C=0.1, class_weight=None, gamma=scale; total time=  31.2s
[CV] END ..............C=0.1, class_weight=None, gamma=scale; total time=  30.3s
[CV] END ..............C=0.1, class_weight=None, gamma=scale; total time=  31.0s
[CV] END ..............C=0.1, class_weight=None, gamma=scale; total time=  29.0s
[CV] END ...............C=0.1, class_weight=None, gamma=auto; total time=  26.9s
[CV] END ...............C=0.1, class_weight=None, gamma=auto; total time=  29.7s
[CV] END ...............C=0.1, class_weight=None, gamma=auto; total time=  30.2s
[CV] END ...............C=0.1, class_weight=None, gamma=auto; total time=  30.8s
[CV] END ...............C=0.1, class_weight=None, gamma=auto; total time=  33.2s
[CV] END ................C=0.1, class_weight=None, gamma=0.1; total time= 1.1min
[CV] END ................C=0.1, class_weight=No

SMOTE

In [49]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import numpy as np
import os
from pathlib import Path
from sklearn import svm
from sklearn.model_selection import train_test_split
from joblib import dump

def load_data(directory):
    features = []
    labels = []
    for file in os.listdir(directory):
        if file.endswith('.npy'):
            file_path = Path(directory) / file
            data = np.load(file_path, allow_pickle=True).item()
            features.append(data['features'])
            labels.append(data['label'])
    return np.array(features), np.array(labels)

# Load training and testing data
train_data_dir = 'features_train'
test_data_dir = 'features_test'
X, y = load_data(train_data_dir)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Normalize features after SMOTE
scaler = StandardScaler()
X_train_smote = scaler.fit_transform(X_train_smote)
X_test = scaler.transform(X_test)  # Use the same scaler on the test set

# Train the SVM model with RBF kernel
clf = svm.SVC(kernel='rbf', C=2, gamma='scale', class_weight='balanced')
clf.fit(X_train_smote, y_train_smote)  # Train with the resampled dataset

# Evaluate the model on the original test set
accuracy = clf.score(X_test, y_test)
print(f"Model accuracy on test data: {accuracy:.2f}")

# Save the model and the scaler
dump(clf, 'svm_model_rbf_smote.pkl')
dump(scaler, 'scaler_rbf_smote.pkl')
dump(label_encoder, 'label_encoder_rbf_smote.pkl')
print("Model and scaler saved.")


Model accuracy on test data: 0.95
Model and scaler saved.


Predict

In [6]:
from joblib import load
from skimage import io
from skimage.transform import resize
from skimage.util import img_as_ubyte
import numpy as np
import sys

np.set_printoptions(threshold=sys.maxsize)

# Ensure to define or import your extract_all_features function here

# Function to load saved model, scaler, and label encoder
def load_model_scaler_encoder(model_path, scaler_path, encoder_path):
    model = load(model_path)
    scaler = load(scaler_path)
    label_encoder = load(encoder_path)
    return model, scaler, label_encoder

# Function to preprocess and predict new data
def predict_new_data(input_features, model, scaler, label_encoder):
    # Normalize features
    input_features_scaled = scaler.transform([input_features])

    # Predict using the model
    predictions = model.predict(input_features_scaled)
    # Decode predictions back to original labels
    decoded_predictions = label_encoder.inverse_transform(predictions)

    return decoded_predictions

# Load model, scaler, and label encoder
model, scaler, label_encoder = load_model_scaler_encoder('svm_model_best_2.pkl', 'scaler_best_2.pkl', 'label_encoder_best_2.pkl')
#model, scaler, label_encoder = load_model_scaler_encoder('svm_model_rbf_best_2.pkl', 'scaler_rbf_2.pkl', 'label_encoder_rbf_2.pkl')
resize_shape=(256, 192)

# Example: Predicting new data
for i in range(1, 11):   
    image_path = f'testing/apple_bad_{i}.jpg'
    image = io.imread(image_path)
    image = remove_background(image)
    io.imsave(f'0.apple_bad_{i}.jpg', image)
    image = resize(image, resize_shape, anti_aliasing=True)
    image = img_as_ubyte(image)

    new_features = extract_all_features(image)
    predicted_labels = predict_new_data(new_features, model, scaler, label_encoder)
    print(f"Predicted label for {image_path}:", predicted_labels)

for i in range(1, 5):
    image_path = f'testing/banana_{i}.jpg'
    image = io.imread(image_path)
    image = remove_background(image)
    io.imsave(f'0.banana_{i}.jpg', image)
    image = resize(image, resize_shape, anti_aliasing=True)
    image = img_as_ubyte(image)

    new_features = extract_all_features(image)
    predicted_labels = predict_new_data(new_features, model, scaler, label_encoder)
    print(f"Predicted label for {image_path}:", predicted_labels)


Predicted label for testing/apple_bad_1.jpg: ['Orange_Good']
Predicted label for testing/apple_bad_2.jpg: ['Banana_Bad']
Predicted label for testing/apple_bad_3.jpg: ['Apple_Bad']
Predicted label for testing/apple_bad_4.jpg: ['Apple_Bad']
Predicted label for testing/apple_bad_5.jpg: ['Apple_Bad']
Predicted label for testing/apple_bad_6.jpg: ['Apple_Bad']
Predicted label for testing/apple_bad_7.jpg: ['Apple_Good']
Predicted label for testing/apple_bad_8.jpg: ['Apple_Bad']
Predicted label for testing/apple_bad_9.jpg: ['Apple_Good']
Predicted label for testing/apple_bad_10.jpg: ['Apple_Bad']
Predicted label for testing/banana_1.jpg: ['Orange_Bad']
Predicted label for testing/banana_2.jpg: ['Lime_Bad']
Predicted label for testing/banana_3.jpg: ['Banana_Bad']
Predicted label for testing/banana_4.jpg: ['Apple_Good']
