# Pre-processing

In [3]:
import cv2
import os
import numpy as np
from tqdm import tqdm

def load_and_preprocess_data(base_dir, image_size=(128, 128)):
    X = []
    y = []

    labels = {'Male': 0, 'Female': 1}

    # Dùng Haar cascade để phát hiện khuôn mặt
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    for label_name in ['Male', 'Female']:
        folder = os.path.join(base_dir, label_name)
        label = labels[label_name]

        for filename in tqdm(os.listdir(folder), desc=f"Processing {label_name}"):
            path = os.path.join(folder, filename)

            img = cv2.imread(path)
            if img is None:
                continue  # Bỏ ảnh lỗi

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)

            if len(faces) == 0:
                continue

            # Chọn khuôn mặt lớn nhất
            (x, y_top, w, h) = sorted(faces, key=lambda b: b[2]*b[3], reverse=True)[0]
            face = gray[y_top:y_top+h, x:x+w]
            face_resized = cv2.resize(face, image_size)
            face_normalized = face_resized / 255.0

            X.append(face_normalized)
            y.append(label)

    X = np.array(X).reshape(-1, image_size[0], image_size[1], 1)
    y = np.array(y)

    return X, y


In [4]:
X, y = load_and_preprocess_data('/Users/softann/Documents/genderclassification/Training')
print("Shape ảnh:", X.shape)
print("Shape nhãn:", y.shape)


Processing Male: 100%|██████████| 23766/23766 [00:37<00:00, 638.53it/s]
Processing Female: 100%|██████████| 23243/23243 [00:37<00:00, 615.33it/s]


Shape ảnh: (13904, 128, 128, 1)
Shape nhãn: (13904,)


In [5]:
from sklearn.model_selection import train_test_split

# Giả sử X, y đã được tạo từ hàm tiền xử lý
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Nếu dùng mô hình học máy cổ điển: flatten ảnh
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

# Huấn luyện mô hình

In [6]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

def train_and_compare_models(X_train, X_test, y_train, y_test):
    results = []

    X_train_flat = X_train.reshape(len(X_train), -1)
    X_test_flat = X_test.reshape(len(X_test), -1)

    # 1. Logistic Regression
    print("\n🔷 Logistic Regression")
    start = time.time()
    model_log = LogisticRegression(max_iter=1000)
    model_log.fit(X_train_flat, y_train)
    y_pred = model_log.predict(X_test_flat)
    acc = accuracy_score(y_test, y_pred)
    duration = time.time() - start
    print(f"Accuracy: {acc:.4f}")
    print(f"Training time: {duration:.2f} s")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    results.append(("Logistic Regression", acc, duration))

    # 2. KNN
    print("\n🔷 K-Nearest Neighbors")
    start = time.time()
    model_knn = KNeighborsClassifier(n_neighbors=5)
    model_knn.fit(X_train_flat, y_train)
    y_pred = model_knn.predict(X_test_flat)
    acc = accuracy_score(y_test, y_pred)
    duration = time.time() - start
    print(f"Accuracy: {acc:.4f}")
    print(f"Training time: {duration:.2f} s")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    results.append(("KNN", acc, duration))

    # 3. SVM
    print("\n🔷 Support Vector Machine")
    start = time.time()
    model_svm = SVC(kernel='linear')
    model_svm.fit(X_train_flat, y_train)
    y_pred = model_svm.predict(X_test_flat)
    acc = accuracy_score(y_test, y_pred)
    duration = time.time() - start
    print(f"Accuracy: {acc:.4f}")
    print(f"Training time: {duration:.2f} s")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    results.append(("SVM", acc, duration))

    # 4. CNN
    print("\n🔷 Convolutional Neural Network (CNN)")
    start = time.time()
    model_cnn = Sequential([
        Conv2D(32, (3,3), activation='relu', input_shape=(128,128,1)),
        MaxPooling2D(2,2),
        Conv2D(64, (3,3), activation='relu'),
        MaxPooling2D(2,2),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model_cnn.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32, verbose=1)
    loss, acc = model_cnn.evaluate(X_test, y_test, verbose=0)
    duration = time.time() - start
    print(f"Accuracy: {acc:.4f}")
    print(f"Training time: {duration:.2f} s")

    # Dự đoán để in classification report
    y_pred_prob = model_cnn.predict(X_test)
    y_pred_cnn = (y_pred_prob > 0.5).astype(int)
    print("Classification Report:")
    print(classification_report(y_test, y_pred_cnn))

    results.append(("CNN", acc, duration))

    # Tổng kết bảng kết quả
    print("\n📊 BẢNG SO SÁNH KẾT QUẢ:")
    print("{:<25} | {:<10} | {:<10}".format("Mô hình", "Accuracy", "Thời gian (s)"))
    print("-" * 50)
    for name, acc, duration in results:
        print("{:<25} | {:.4f}     | {:.2f}".format(name, acc, duration))


In [7]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
train_and_compare_models(X_train, X_test, y_train, y_test)


🔷 Logistic Regression
Accuracy: 0.8788
Training time: 389.45 s
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      1334
           1       0.88      0.89      0.88      1447

    accuracy                           0.88      2781
   macro avg       0.88      0.88      0.88      2781
weighted avg       0.88      0.88      0.88      2781


🔷 K-Nearest Neighbors
Accuracy: 0.8688
Training time: 6.84 s
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1334
           1       0.91      0.83      0.87      1447

    accuracy                           0.87      2781
   macro avg       0.87      0.87      0.87      2781
weighted avg       0.87      0.87      0.87      2781


🔷 Support Vector Machine
Accuracy: 0.8587
Training time: 839.57 s
Classification Report:
              precision    recall  f1-score   support

           0       0.86      

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 133ms/step - accuracy: 0.6951 - loss: 0.5851 - val_accuracy: 0.9148 - val_loss: 0.2265
Epoch 2/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 131ms/step - accuracy: 0.9062 - loss: 0.2439 - val_accuracy: 0.9292 - val_loss: 0.1880
Epoch 3/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 131ms/step - accuracy: 0.9285 - loss: 0.1980 - val_accuracy: 0.9396 - val_loss: 0.1763
Epoch 4/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 132ms/step - accuracy: 0.9463 - loss: 0.1546 - val_accuracy: 0.9457 - val_loss: 0.1548
Epoch 5/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 134ms/step - accuracy: 0.9510 - loss: 0.1385 - val_accuracy: 0.9479 - val_loss: 0.1484
Accuracy: 0.9479
Training time: 234.41 s
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step
Classification Report:
              precision    recall  f1-scor

In [12]:
model_cnn = Sequential([
        Conv2D(32, (3,3), activation='relu', input_shape=(128,128,1)),
        MaxPooling2D(2,2),
        Conv2D(64, (3,3), activation='relu'),
        MaxPooling2D(2,2),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_cnn.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32, verbose=1)

loss, acc = model_cnn.evaluate(X_test, y_test, verbose=0)
print(f"Accuracy: {acc:.4f}")
model_cnn.save("cnn_model.h5")

Epoch 1/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 133ms/step - accuracy: 0.7025 - loss: 0.5824 - val_accuracy: 0.8910 - val_loss: 0.2838
Epoch 2/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 139ms/step - accuracy: 0.8922 - loss: 0.2700 - val_accuracy: 0.9180 - val_loss: 0.2224
Epoch 3/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 131ms/step - accuracy: 0.9200 - loss: 0.2098 - val_accuracy: 0.9302 - val_loss: 0.2000
Epoch 4/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 131ms/step - accuracy: 0.9337 - loss: 0.1748 - val_accuracy: 0.9443 - val_loss: 0.1713
Epoch 5/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 129ms/step - accuracy: 0.9432 - loss: 0.1518 - val_accuracy: 0.9443 - val_loss: 0.1640


<keras.src.callbacks.history.History at 0x4846fb750>



Accuracy: 0.9443
