In [90]:
# 📌 Step 1: Import Necessary Libraries
import cv2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [96]:
import cv2
print(cv2.__version__)

4.9.0


In [99]:
import xgboost
print(xgboost.__version__)
import catboost
print(catboost.__version__)
import lightgbm
print(lightgbm.__version__)

2.1.4
1.2.7
4.6.0


In [4]:
# Define dataset paths
TRAIN_DIR = 'face_images/train/'
TEST_DIR = 'face_images/test/'

In [5]:
# Function to load dataset
def load_dataset(directory):
    image_paths, labels = [], []
    for label in os.listdir(directory):
        label_path = os.path.join(directory, label)
        if os.path.isdir(label_path):
            for filename in os.listdir(label_path):
                image_paths.append(os.path.join(label_path, filename))
                labels.append(label)
    return image_paths, labels

# Load dataset
train_images, train_labels = load_dataset(TRAIN_DIR)
test_images, test_labels = load_dataset(TEST_DIR)

print("Total train images:", len(train_images))
print("Total test images:", len(test_images))

Total train images: 28273
Total test images: 7067


In [11]:
# Initialize HOG descriptor with parameters
hog = cv2.HOGDescriptor((48, 48), (16, 16), (8, 8), (8, 8), 9)

def extract_hog_features(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print(f"Error loading image: {image_path}")
        return None  # Skip this image

    img = cv2.resize(img, (48, 48))  # Resize image to match HOG descriptor
    features = hog.compute(img)  # Extract HOG features
    return features.flatten()

In [12]:
# Extract features efficiently, avoiding multiple function calls
train_features = []
for img in tqdm(train_images):
    features = extract_hog_features(img)
    if features is not None:
        train_features.append(features)

train_features = np.array(train_features)

  0%|          | 0/28273 [00:00<?, ?it/s]

In [13]:
# Extract HOG features for test dataset
test_features = []
for img in tqdm(test_images):
    features = extract_hog_features(img)
    if features is not None:
        test_features.append(features)

test_features = np.array(test_features)

  0%|          | 0/7067 [00:00<?, ?it/s]

In [33]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(train_features)
x_test = scaler.transform(test_features)

In [35]:
# Convert labels to numerical values
le = LabelEncoder()
y_train, y_test = le.fit_transform(train_labels), le.transform(test_labels)

In [91]:
import time
import os
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed
from tqdm import tqdm

# Ensure CatBoost has a valid working directory
os.environ["CATBOOST_WORKING_DIR"] = "catboost_tmp"
os.makedirs("catboost_tmp", exist_ok=True)

In [63]:
# 📌 Step 2: Handle Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# 📌 Step 3: Apply PCA for Dimensionality Reduction
pca = PCA(n_components=100)
x_train_pca = pca.fit_transform(x_train_resampled)
x_test_pca = pca.transform(x_test)

# 📌 Step 4: Define Base Models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, verbose=0),
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42, verbose=0, train_dir="catboost_tmp")
}

In [59]:
# 📌 Step 5: Define Hyperparameter Grid for Tuning
param_grid = {
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 6, 9]
    },
    "CatBoost": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "depth": [4, 6, 10]
    }
}

In [65]:
# 📌 Step 6: Hyperparameter Tuning with RandomizedSearchCV
best_models = {}

for name, model in models.items():
    if name in param_grid:
        print(f"🔍 Tuning {name}...")
        
        # Apply RandomizedSearchCV
        search = RandomizedSearchCV(
            model, param_grid[name], n_iter=10, cv=3, 
            scoring="accuracy", n_jobs=-1, random_state=42, verbose=1
        )
        
        # Fit model with resampled training data
        search.fit(x_train_pca, y_train_resampled)
        
        # Save best model
        best_models[name] = search.best_estimator_
        print(f"✅ Best params for {name}: {search.best_params_}")
    else:
        best_models[name] = model  # Use default model if no tuning parameters

print("🎯 Hyperparameter tuning completed!")

🔍 Tuning Random Forest...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
✅ Best params for Random Forest: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}
🔍 Tuning XGBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.



✅ Best params for XGBoost: {'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.1}
🔍 Tuning CatBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


7 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\projj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\projj\AppData\Local\Programs\Python\Python312\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\projj\AppData\Local\Programs\Python\Python312\Lib\site-packages\catboost\core.py", line 2410, in _fit
    

✅ Best params for CatBoost: {'n_estimators': 50, 'learning_rate': 0.2, 'depth': 10}
🎯 Hyperparameter tuning completed!


In [69]:
print(best_models.keys())  # Debugging step

dict_keys(['Random Forest', 'XGBoost', 'CatBoost'])


In [70]:
# 📌 Step 7: Create an Ensemble Model
ensemble_model = VotingClassifier(
    estimators=[('rf', best_models["Random Forest"]),
                ('xgb', best_models["XGBoost"]),
                ('cat', best_models["CatBoost"])], 
    voting='soft'  # Soft voting uses probabilities for better results
)

In [71]:
# 📌 Step 8: Train Ensemble Model
ensemble_model.fit(x_train_pca, y_train_resampled)

Parameters: { "use_label_encoder" } are not used.



In [72]:
# 📌 Step 9: Evaluate Ensemble Model
ensemble_preds = ensemble_model.predict(x_test_pca)
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)
print(f"🚀 Ensemble Model Accuracy: {ensemble_accuracy:.4f}")

🚀 Ensemble Model Accuracy: 0.5353


In [79]:
# 📌 Step 10: Define Additional Models for Parallel Training
extra_models = {
    "SVM": SVC(kernel='linear', max_iter=1000),
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42, n_jobs=-1, verbose=0),
    "LightGBM": lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1, verbose=-1)
}

In [83]:
from sklearn.metrics import classification_report, confusion_matrix

# 📌 Step 11: Function to Train and Evaluate Each Model
def train_and_evaluate(name, model):
    print(f"Training {name}...")
    start_time = time.time()
    
    model.fit(x_train_pca, y_train_resampled)  # Train model
    
    preds = model.predict(x_test_pca)  # Predict on test set
    acc = accuracy_score(y_test, preds)  # Compute accuracy
    
    elapsed_time = time.time() - start_time
    print(f"{name} Accuracy: {acc:.4f} - Training Time: {elapsed_time:.2f} seconds\n")
    
    return name, acc, elapsed_time, model  # Return trained model

In [84]:
# 📌 Step 12: Train Extra Models in Parallel with Progress Bar
trained_models = {}  # Dictionary to store trained models
with tqdm_joblib(tqdm(desc="Training Extra Models", total=len(extra_models))):
    results = Parallel(n_jobs=-1)(
        delayed(train_and_evaluate)(name, model) for name, model in extra_models.items()
    )

# 📌 Step 13: Store trained models
for name, acc, time_taken, trained_model in results:
    trained_models[name] = trained_model  # Store trained model
    print(f"{name}: Accuracy = {acc:.4f}, Time = {time_taken:.2f} sec")

Training Extra Models:   0%|                                                                     | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

SVM: Accuracy = 0.1965, Time = 28.08 sec
Logistic Regression: Accuracy = 0.4404, Time = 1.11 sec
LightGBM: Accuracy = 0.4981, Time = 7.53 sec


In [85]:
# 📌 Step 14: Function to Compute Additional Metrics
def evaluate_extra_models(models, x_test, y_test):
    for name, model in models.items():
        print(f"\n🔹 {name} Performance Metrics:\n")
        
        # Predict on test set
        preds = model.predict(x_test)
        
        # Compute confusion matrix
        conf_matrix = confusion_matrix(y_test, preds)
        
        # Compute precision, recall, and F1-score
        class_report = classification_report(y_test, preds, digits=4)
        
        # Print metrics
        print("📌 Confusion Matrix:\n", conf_matrix)
        print("\n📊 Classification Report:\n", class_report)

# 📌 Step 15: Evaluate the Trained Models
evaluate_extra_models(trained_models, x_test_pca, y_test)


🔹 SVM Performance Metrics:



Training Extra Models:   0%|                                                                     | 0/3 [00:36<?, ?it/s]

📌 Confusion Matrix:
 [[279 182 232  86 140  39]
 [295 178 237 110 152  52]
 [403 276 510 130 385  70]
 [287 287 272 154 190  43]
 [417 260 200 124 211  35]
 [185 141 276  79  93  57]]

📊 Classification Report:
               precision    recall  f1-score   support

           0     0.1495    0.2912    0.1976       958
           1     0.1344    0.1738    0.1516      1024
           2     0.2953    0.2875    0.2913      1774
           3     0.2255    0.1249    0.1608      1233
           4     0.1802    0.1692    0.1745      1247
           5     0.1926    0.0686    0.1012       831

    accuracy                         0.1965      7067
   macro avg     0.1962    0.1859    0.1795      7067
weighted avg     0.2077    0.1965    0.1926      7067


🔹 Logistic Regression Performance Metrics:

📌 Confusion Matrix:
 [[ 324  126  130  131  159   88]
 [ 177  246  142  131  168  160]
 [ 182  104 1133  115  162   78]
 [ 156  132  146  518  181  100]
 [ 204  161  180  246  376   80]
 [  62   93   6




In [76]:
import joblib
# Save the trained ensemble model
joblib.dump(ensemble_model, "image_ensemble_model.pkl")

print("✅ Model saved successfully as 'image_ensemble_model.pkl'!")

Training Extra Models:   0%|                                                                     | 0/4 [34:16<?, ?it/s]


✅ Model saved successfully as 'image_ensemble_model.pkl'!


In [86]:
# Save the trained LightGBM model
lightgbm_model = trained_models.get("LightGBM")  # Retrieve trained LightGBM model
if lightgbm_model:
    joblib.dump(lightgbm_model, "image_lightgbm_model.pkl")  # Save the model
    print("✅ LightGBM model saved successfully as 'lightgbm_model.pkl'")
else:
    print("⚠️ LightGBM model not found!")

✅ LightGBM model saved successfully as 'lightgbm_model.pkl'


In [92]:
# Save the fitted StandardScaler
joblib.dump(scaler, "image_scaler.pkl")

# Save the fitted LabelEncoder
joblib.dump(le, "image_label_encoder.pkl")

print("✅ StandardScaler and LabelEncoder saved successfully!")

✅ StandardScaler and LabelEncoder saved successfully!


In [94]:
joblib.dump(pca, "image_model_pca.pkl")

['image_model_pca.pkl']