## Imports

In [9]:
import os
import cv2
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

## Load and preprocess train/test data with histogram equalization

In [10]:
# 1) Load and preprocess train/test data with histogram equalization
def load_dataset(base_path, img_size=(64, 64)):
    X, y = [], []
    classes = ['Cat', 'Dog']
    for label, cls in enumerate(classes):
        folder = os.path.join(base_path, cls)
        for fname in os.listdir(folder):
            path = os.path.join(folder, fname)
            img = cv2.imread(path)
            if img is None:
                continue
            img = cv2.resize(img, img_size)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            gray = cv2.equalizeHist(gray)         
            X.append(gray.flatten())
            y.append(label)
    return np.array(X), np.array(y)

X_train, y_train = load_dataset('./train')
X_test, y_test   = load_dataset('./test')


## Scale features

In [11]:
# 2) Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

## Hyperparameter tuning for kNN (k from 1 to 15)

In [12]:
# 3) Hyperparameter tuning for kNN (k from 1 to 15)
best_k, best_knn_acc = None, 0.0
knn_results = {}
for k in [1, 3, 5, 7, 9, 11, 13, 15]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    acc = accuracy_score(y_test, knn.predict(X_test_scaled))
    knn_results[k] = acc
    if acc > best_knn_acc:
        best_k, best_knn_acc = k, acc

print(f"Best k for kNN: {best_k} (accuracy={best_knn_acc:.2f})")

Best k for kNN: 7 (accuracy=0.80)


## Hyperparameter tuning for Logistic Regression (C values)

In [13]:
# 4) Hyperparameter tuning for Logistic Regression (C values)
best_C, best_log_acc = None, 0.0
for C in [0.01, 0.1, 1, 10]:
    logreg = LogisticRegression(C=C, max_iter=1000, random_state=42)
    logreg.fit(X_train_scaled, y_train)
    acc = accuracy_score(y_test, logreg.predict(X_test_scaled))
    if acc > best_log_acc:
        best_C, best_log_acc = C, acc

print(f"Best C for LogisticRegression: {best_C} (accuracy={best_log_acc:.2f})")

Best C for LogisticRegression: 0.01 (accuracy=0.40)


## Hyperparameter tuning for Perceptron (alpha values)

In [14]:
# 5) Hyperparameter tuning for Perceptron (alpha values)
best_alpha, best_perc_acc = None, 0.0
for alpha in [1e-4, 1e-3, 1e-2]:
    perc = SGDClassifier(loss='perceptron', alpha=alpha, max_iter=1000, tol=1e-3, random_state=42)
    perc.fit(X_train_scaled, y_train)
    acc = accuracy_score(y_test, perc.predict(X_test_scaled))
    if acc > best_perc_acc:
        best_alpha, best_perc_acc = alpha, acc

print(f"Best alpha for Perceptron: {best_alpha} (accuracy={best_perc_acc:.2f})")

Best alpha for Perceptron: 0.0001 (accuracy=0.50)


## Select and train the best overall model

In [15]:
# 6) Select and train the best overall model
all_results = {
    f'kNN (k={best_k})': best_knn_acc,
    'LogisticRegression': best_log_acc,
    'Perceptron': best_perc_acc
}
best_name = max(all_results, key=all_results.get)

if best_name.startswith('kNN'):
    best_model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_scaled, y_train)
elif best_name == 'LogisticRegression':
    best_model = LogisticRegression(C=best_C, max_iter=1000, random_state=42).fit(X_train_scaled, y_train)
else:
    best_model = SGDClassifier(loss='perceptron', alpha=best_alpha, max_iter=1000, tol=1e-3, random_state=42).fit(X_train_scaled, y_train)

print(f"Selected best model: {best_name} with accuracy {all_results[best_name]:.2f}")

Selected best model: kNN (k=7) with accuracy 0.80


## Save the best model & scaler

In [16]:
# 7) Save the best model and scaler
joblib.dump({'model': best_model, 'scaler': scaler}, 'best_cat_dog_model.joblib')
print("Saved best model and scaler to 'best_cat_dog_model.joblib'")

Saved best model and scaler to 'best_cat_dog_model.joblib'


## External predictions 

In [17]:
# 8) External predictions 
ext_folder = './external_test'
print("\nExternal image predictions:")
for fname in os.listdir(ext_folder):
    path = os.path.join(ext_folder, fname)
    img = cv2.imread(path)
    if img is None:
        continue
    img = cv2.resize(img, (64, 64))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)       
    flat = gray.flatten().reshape(1, -1)
    flat_scaled = scaler.transform(flat)
    pred = best_model.predict(flat_scaled)[0]
    label = 'Dog' if pred == 1 else 'Cat'
    print(f"{fname}: predicted as {label}")


External image predictions:
billi.jpg: predicted as Cat
Catt.jpg: predicted as Cat
kuku.jpg: predicted as Cat
Pet.jpg: predicted as Cat
Pet3.jpg: predicted as Dog
Random Pet.jpg: predicted as Cat
rnadom.jpg: predicted as Cat


- Tested on 7 external cat/dog images and got **4/7 correct (~57% accuracy)**.  
- Model generalizes moderately above chance but still misclassifies nearly half.  
- Errors are driven by domain shift (lighting, pose and background variations).  
