In [1]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import joblib  # Import the joblib library for saving the model

In [2]:
# Define constants
TRAIN_DIR = 'dataset/train/train'  # labelled images, one subfolder per class
TEST_DIR = 'dataset/test'          # unlabeled images used only for the visual spot check

# Using a smaller image size for faster training with SVM.
# Every image is resized to IMG_SIZE x IMG_SIZE, i.e. IMG_SIZE**2 features per sample.
IMG_SIZE = 64

# Using a smaller sample of images to ensure training completes in a reasonable time
SAMPLE_COUNT = 20000

# Seed the stdlib RNG once, up front: the notebook draws its training subset and
# its preview images with random.sample, so without a seed the results differ
# on every Restart & Run All.
SEED = 42
random.seed(SEED)

In [3]:
def load_data_from_subfolders(base_folder_path, sample_size=None, seed=None):
    """
    Load images from class-specific subfolders as flattened grayscale vectors.

    Images are read from ``<base_folder_path>/cats`` (label 0) and
    ``<base_folder_path>/dogs`` (label 1). Each image is decoded as grayscale,
    resized to ``IMG_SIZE x IMG_SIZE``, flattened and divided by 255.

    Args:
        base_folder_path: Directory containing the ``cats`` and ``dogs``
            subfolders. A missing subfolder is reported and skipped.
        sample_size: If truthy and smaller than the number of files found,
            only a random sample of this many files is loaded.
        seed: Optional seed for the sampling step. When ``None`` (default) the
            global ``random`` module state is used, as before.

    Returns:
        A tuple ``(data, labels)``: ``data`` is a float32 array of shape
        ``(n_loaded, IMG_SIZE * IMG_SIZE)`` and ``labels`` is an integer array
        of shape ``(n_loaded,)``. Files that cannot be decoded are skipped, so
        ``n_loaded`` may be smaller than the number of sampled files.
    """
    categories = ['cats', 'dogs']  # Assumes subfolders are named 'cats' and 'dogs'
    all_files = []

    print(f"Scanning for images in '{base_folder_path}'...")
    for label, category in enumerate(categories):  # cats -> 0, dogs -> 1
        folder_path = os.path.join(base_folder_path, category)
        if not os.path.isdir(folder_path):
            print(f"Warning: Folder not found at '{folder_path}'. Skipping.")
            continue

        # os.listdir returns entries in arbitrary order; sorting makes the
        # sampling below depend only on the RNG state, not on the filesystem.
        for filename in sorted(os.listdir(folder_path)):
            img_path = os.path.join(folder_path, filename)
            # Nested directories can never be decoded; drop them before
            # sampling so they do not use up sample slots.
            if os.path.isfile(img_path):
                all_files.append((img_path, label))

    if sample_size and len(all_files) > sample_size:
        rng = random.Random(seed) if seed is not None else random
        all_files = rng.sample(all_files, sample_size)

    print(f"Loading and processing {len(all_files)} sampled images...")
    data = []
    labels = []
    skipped = []
    for img_path, label in tqdm(all_files, desc="Processing Images"):
        try:
            image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if image is None:
                # imread signals an undecodable file by returning None.
                skipped.append(img_path)
                continue
            resized_image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
        except cv2.error:
            # Best effort: a corrupt image must not abort the whole load,
            # but it is recorded so the loss is visible in the summary.
            skipped.append(img_path)
            continue

        # Cast each vector to float32 right away: same values as casting at
        # the end, but the list holds half the memory while loading.
        data.append((resized_image.flatten() / 255.0).astype(np.float32))
        labels.append(label)

    if skipped:
        print(f"Warning: skipped {len(skipped)} unreadable image(s), e.g. '{skipped[0]}'.")

    if not data:
        # Keep the feature axis even when nothing was loaded so callers
        # always receive a 2-D array.
        return np.empty((0, IMG_SIZE * IMG_SIZE), dtype=np.float32), np.array(labels, dtype=int)

    return np.array(data, dtype=np.float32), np.array(labels, dtype=int)

In [4]:
X, y = load_data_from_subfolders(TRAIN_DIR, sample_size=SAMPLE_COUNT)

# Fail fast with a clear message: the loader only prints a warning when a class
# folder is missing, and the problem would otherwise surface later as an
# unrelated-looking error during splitting or training.
if len(X) == 0:
    raise RuntimeError(
        f"No images could be loaded from '{TRAIN_DIR}'. Check the dataset path."
    )
if np.unique(y).size < 2:
    raise RuntimeError(
        f"Only one class was found under '{TRAIN_DIR}'; "
        "both the 'cats' and 'dogs' subfolders are required."
    )

Scanning for images in 'dataset/train/train'...
Loading and processing 20000 sampled images...


Processing Images: 100%|██████████| 20000/20000 [02:28<00:00, 134.44it/s]


In [5]:
# Hold out 20% of the loaded samples for validation. The fixed random_state
# makes the split repeatable for a given X and y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # stratify on the labels so both splits keep the same class mix
)

print("\nData split complete.")
print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)


Data split complete.
Training data shape: (16000, 4096)
Validation data shape: (4000, 4096)


In [6]:
print("\n--- Starting SVM Training ---")
print(f"WARNING: This will train on {len(X_train)} images and will likely take SEVERAL HOURS.")

# RBF-kernel SVM on the flattened pixel vectors; fit() returns the fitted
# estimator, so construction and training are chained.
# NOTE(review): per the scikit-learn docs, probability=True adds an internal
# cross-validation pass to fit() and slows training considerably. Nothing in
# this notebook calls predict_proba -- confirm whether consumers of the saved
# model need it before dropping the flag.
svm_classifier = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True,
    random_state=42,
).fit(X_train, y_train)

print("--- Training Complete ---")


--- Starting SVM Training ---
--- Training Complete ---


In [7]:
# Score the fitted classifier on the held-out validation split.
print("\nEvaluating the model on the validation set...")
y_pred = svm_classifier.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)

# A single print call with a newline separator emits the same text as three
# consecutive print statements.
print(
    f"\nValidation Accuracy: {accuracy:.4f}",
    "\nClassification Report:",
    # target_names follow the label encoding used when loading: 0 -> Cat, 1 -> Dog
    classification_report(y_val, y_pred, target_names=['Cat', 'Dog']),
    sep="\n",
)


Evaluating the model on the validation set...

Validation Accuracy: 0.6465

Classification Report:
              precision    recall  f1-score   support

         Cat       0.65      0.64      0.64      2006
         Dog       0.64      0.66      0.65      1994

    accuracy                           0.65      4000
   macro avg       0.65      0.65      0.65      4000
weighted avg       0.65      0.65      0.65      4000



In [8]:
# Persist the fitted classifier to disk so it can be reused without retraining.
# joblib files are pickle-based: only load them back from a trusted source.
model_filename = 'svm_cat_dog_classifier_20k.joblib'
print("\nSaving the trained model to '{}'...".format(model_filename))
joblib.dump(value=svm_classifier, filename=model_filename)
print("Model saved successfully.")


Saving the trained model to 'svm_cat_dog_classifier_20k.joblib'...
Model saved successfully.


In [None]:
print("\nMaking predictions on a few unlabeled test images...")
if not os.path.isdir(TEST_DIR) or not os.listdir(TEST_DIR):
    print("Test directory not found or is empty. Skipping prediction display.")
else:
    label_map = {0: 'Cat', 1: 'Dog'}
    # Sorted listing: the sample then depends only on the RNG state, not on
    # the order in which the filesystem happens to return entries.
    test_files = sorted(os.listdir(TEST_DIR))

    # Decode first and keep only readable images, so an unreadable file or a
    # subdirectory does not leave an empty panel in the grid.
    preview_images = []
    for file in random.sample(test_files, min(9, len(test_files))):
        test_image = cv2.imread(os.path.join(TEST_DIR, file), cv2.IMREAD_GRAYSCALE)
        if test_image is not None:
            preview_images.append(test_image)

    if not preview_images:
        print("None of the sampled test files could be read as images. Skipping prediction display.")
    else:
        # Same preprocessing as the training data (resize, flatten, scale to
        # [0, 1], float32) so the model sees identically prepared features.
        preview_features = np.array(
            [cv2.resize(img, (IMG_SIZE, IMG_SIZE)).flatten() / 255.0 for img in preview_images],
            dtype=np.float32,
        )
        # One batched predict call instead of one call per image.
        preview_predictions = svm_classifier.predict(preview_features)

        fig, axes = plt.subplots(3, 3, figsize=(12, 8))
        for ax in axes.flat:
            ax.axis('off')  # also hides panels that end up without an image
        for ax, img, pred in zip(axes.flat, preview_images, preview_predictions):
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_GRAY2RGB))
            ax.set_title(f'Prediction: {label_map[int(pred)]}')

        fig.tight_layout()
        plt.show()