In [None]:
import os
import numpy as np
import joblib
import matplotlib.pyplot as plt
from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

# -------------------------
# SETTINGS
# -------------------------
# Root folder of the dataset; the loader reads one sub-folder per category.
DATA_DIR = "doodle"
# Class names. A category's position in this list is its integer label.
CATEGORIES = [
    "apple",
    "bicycle",
    "cat",
    "car",
    "airplane",
]
# Images are resized to IMG_SIZE x IMG_SIZE pixels before being flattened.
IMG_SIZE = 28
# Upper bound on how many files are read from each category folder.
N_PER_CLASS = 2000

# -------------------------
# LOAD DATA
# -------------------------
# Build the feature rows (X) and integer labels (y). Each image is read as
# grayscale, resized to IMG_SIZE x IMG_SIZE and flattened into one row; its
# label is the index of its category in CATEGORIES.
X, y = [], []
for idx, category in enumerate(CATEGORIES):
    folder = os.path.join(DATA_DIR, category)
    # os.listdir() returns entries in arbitrary order. Sort before slicing so
    # the same N_PER_CLASS files are selected on every run and every machine;
    # otherwise the fixed random_state used later does not make results
    # reproducible.
    files = sorted(os.listdir(folder))[:N_PER_CLASS]
    for file in files:
        img = imread(os.path.join(folder, file), as_gray=True)
        img = resize(img, (IMG_SIZE, IMG_SIZE))
        X.append(img.flatten())
        y.append(idx)

# Convert the accumulated Python lists into NumPy arrays for scikit-learn.
X, y = np.array(X), np.array(y)
print("Data shape:", X.shape, "Labels:", y.shape)

# -------------------------
# SPLIT
# -------------------------
# Hold out 20% of the samples for testing. The split is stratified on the
# labels and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# -------------------------
# SCALE
# -------------------------
# Learn the per-feature scaling statistics from the training split only, then
# apply that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------
# MODELS
# -------------------------
# Three base classifiers; the tree ensembles are seeded for repeatability.
logreg = LogisticRegression(n_jobs=-1, max_iter=1000)
rf = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)
et = ExtraTreesClassifier(random_state=42, n_estimators=100, n_jobs=-1)

# Hard-voting ensemble built from the three base classifiers above.
voting = VotingClassifier(
    voting="hard",
    n_jobs=-1,
    estimators=list(zip(("logreg", "rf", "et"), (logreg, rf, et))),
)

# Display name -> estimator, in the order they will be trained and compared.
models = dict(
    zip(
        (
            "Logistic Regression",
            "Random Forest",
            "Extra Trees",
            "Voting Classifier",
        ),
        (logreg, rf, et, voting),
    )
)

# -------------------------
# TRAIN + EVAL
# -------------------------
# Fit every candidate on the scaled training data, score it on the scaled test
# split, and keep track of the highest-scoring model.
#
# NOTE(review): the winner is chosen using the same test split whose accuracy
# is then reported, so the reported "best" accuracy is likely optimistic.
# Consider selecting on a separate validation split.
#
# Start below any achievable accuracy so a model is always selected. Starting
# at 0 with a strict ">" would leave best_model as None if every candidate
# scored 0.0, and the later predict() call would then fail.
best_name, best_model, best_acc = None, None, float("-inf")

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_scaled, y_train)
    score = model.score(X_test_scaled, y_test)
    print(f"{name} Accuracy: {score:.4f}")

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if score > best_acc:
        best_name, best_model, best_acc = name, model, score

print(f"\nBest model: {best_name} with accuracy {best_acc:.4f}")

# -------------------------
# CONFUSION MATRIX + REPORT
# -------------------------
# Evaluate the selected model on the test split: plot a confusion matrix and
# print per-class precision/recall/F1.
#
# Labels are the indices into CATEGORIES. Passing them explicitly keeps the
# matrix rows/columns and the report rows aligned with CATEGORIES even when a
# class is missing from both y_test and y_pred. Without this, scikit-learn
# infers a smaller label set and the display/target names no longer match.
label_ids = list(range(len(CATEGORIES)))

y_pred = best_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred, labels=label_ids)
ConfusionMatrixDisplay(cm, display_labels=CATEGORIES).plot(cmap="Blues")
plt.show()

print("\nClassification Report:\n")
print(
    classification_report(
        y_test, y_pred, labels=label_ids, target_names=CATEGORIES
    )
)

# -------------------------
# SAVE
# -------------------------
# Persist what is needed to reuse the classifier later: the selected model,
# the fitted scaler, and the ordered class names.
for artifact, path in (
    (best_model, "doodle_model.joblib"),
    (scaler, "doodle_scaler.joblib"),
    (CATEGORIES, "doodle_classes.joblib"),
):
    joblib.dump(artifact, path)