In [1]:
# Step 3: 03_feature_extraction.ipynb

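# OBJECTIVES
# - modularize HOG, Canny edge-map, and raw-pixel feature extraction
# - allow defining and testing multiple feature combinations
# - standardize every combo (scaler fitted on train only), then optionally apply PCA per combo
# - save features, labels, and the fitted scaler / PCA models for model training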

In [2]:
import os 
import cv2
import numpy as np
import matplotlib.pyplot as plt 
import glob
from tqdm import tqdm 
from skimage.feature import hog
from sklearn.decomposition import PCA
import joblib # for saving features 
from sklearn.preprocessing import StandardScaler

In [3]:
# paths 
INPUT_ROOT = "../data/processed/NEU-DET"
OUTPUT_ROOT = "../data/features/NEU-DET"
SPLITS = ["train", "validation"]
IMAGE_SIZE = (128, 128) # should match the preprocessing step
PCA_COMPONENTS = 100

In [4]:
# feature combination recipes
# FIXED: hog was defined twice
feature_combos = {
    "hog": ["hog"],
    "edge": ["edge"],
    "raw": ["raw"],
    "hog_edge": ["hog", "edge"],
    "hog_pca": ["hog", "pca"],
    "hog_edge_pca": ["hog", "edge", "pca"]
}

In [5]:
# feature extraction functions 
def extract_hog(img): 
    return hog(img,
        orientations=9, 
               pixels_per_cell=(8, 8), 
               cells_per_block=(2, 2), 
               block_norm="L2-Hys")

def extract_edge(img): 
    edge_map = cv2.Canny(img, 100, 200) 
    return edge_map.flatten() / 255.0

def extract_raw(img): 
    return img.flatten() / 255.0

# combine features based on the combo setting
def extract_features(img, combo): 
    feats = []
    if "hog" in combo: 
        feats.append(extract_hog(img))
    if "edge" in combo: 
        feats.append(extract_edge(img))
    if "raw" in combo: 
        feats.append(extract_raw(img))
    return np.concatenate(feats)

In [6]:
# now loop over each combination
for combo_name, combo_parts in feature_combos.items(): 
    print(f"\n🔄 Extracting features for combo: {combo_name}")

    all_features = {}
    all_labels = {}

    for split in SPLITS: 
        input_dir = os.path.join(INPUT_ROOT, split, "images")
        image_paths = glob.glob(os.path.join(input_dir, "**", "*.jpg"), recursive=True)

        features = []
        labels = []

        print(f"   ⏳ Processing {split} ({len(image_paths)} images...)")
        for path in tqdm(image_paths): 
            img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            if img is None: 
                print(f"❌ Could not load image: {path}")
                continue

            try: 
                feat = extract_features(img, combo_parts)
                label = os.path.basename(os.path.dirname(path))
                features.append(feat)
                labels.append(label)
            except Exception as e: 
                print(f"❌ Error processing {path} in combo '{combo_name}': {e}")

        all_features[split] = np.array(features)
        all_labels[split] = np.array(labels)
        print(f"   ✅ {split} feature shape: {all_features[split].shape}")

    # BUG FIX: ensure output_dir is defined before saving PCA or features 
    output_dir = os.path.join(OUTPUT_ROOT, combo_name)
    os.makedirs(output_dir, exist_ok=True)

    # OPTIONAL: add standardization or normalization before applying PCA
    print("   Applying StandardScaler...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(all_features["train"])
    X_val_scaled = scaler.transform(all_features["validation"])

    all_features["train"] = X_train_scaled
    all_features["validation"] = X_val_scaled
    
    # OPTIONAL: apply PCA
    if "pca" in combo_parts: 
        print(f"   Applying PCA...")
        pca = PCA(n_components=PCA_COMPONENTS)
        X_train_pca = pca.fit_transform(all_features["train"])
        X_val_pca = pca.transform(all_features["validation"])

        all_features["train"] = X_train_pca
        all_features["validation"] = X_val_pca

        # save PCA model
        joblib.dump(pca, os.path.join(output_dir, "pca_model.pkl"))
        print(f"   💾 Saved PCA model to {output_dir}/pca_model.pkl")
    else: 
        pca = None

    # save outputs
    output_dir = os.path.join(OUTPUT_ROOT, combo_name)
    os.makedirs(output_dir, exist_ok=True)

    joblib.dump(all_features["train"], os.path.join(output_dir, f"X_train.pkl"))
    joblib.dump(all_labels["train"], os.path.join(output_dir, f"y_train.pkl"))
    joblib.dump(all_features["validation"], os.path.join(output_dir, f"X_validation.pkl"))
    joblib.dump(all_labels["validation"], os.path.join(output_dir, f"y_validation.pkl"))

    joblib.dump(scaler, os.path.join(output_dir, "scaler.pkl"))
    
    print(f"   💾 Saved combo '{combo_name}' to {output_dir}")


🔄 Extracting features for combo: hog
   ⏳ Processing train (1440 images...)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1440/1440 [00:03<00:00, 405.76it/s]


   ✅ train feature shape: (1440, 8100)
   ⏳ Processing validation (360 images...)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [00:00<00:00, 397.18it/s]


   ✅ validation feature shape: (360, 8100)
   Applying StandardScaler...
   💾 Saved combo 'hog' to ../data/features/NEU-DET/hog

🔄 Extracting features for combo: edge
   ⏳ Processing train (1440 images...)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1440/1440 [00:00<00:00, 4076.34it/s]


   ✅ train feature shape: (1440, 16384)
   ⏳ Processing validation (360 images...)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [00:00<00:00, 4279.91it/s]


   ✅ validation feature shape: (360, 16384)
   Applying StandardScaler...
   💾 Saved combo 'edge' to ../data/features/NEU-DET/edge

🔄 Extracting features for combo: raw
   ⏳ Processing train (1440 images...)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1440/1440 [00:00<00:00, 8328.42it/s]


   ✅ train feature shape: (1440, 16384)
   ⏳ Processing validation (360 images...)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [00:00<00:00, 8193.51it/s]


   ✅ validation feature shape: (360, 16384)
   Applying StandardScaler...
   💾 Saved combo 'raw' to ../data/features/NEU-DET/raw

🔄 Extracting features for combo: hog_edge
   ⏳ Processing train (1440 images...)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1440/1440 [00:03<00:00, 377.41it/s]


   ✅ train feature shape: (1440, 24484)
   ⏳ Processing validation (360 images...)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [00:00<00:00, 371.23it/s]


   ✅ validation feature shape: (360, 24484)
   Applying StandardScaler...
   💾 Saved combo 'hog_edge' to ../data/features/NEU-DET/hog_edge

🔄 Extracting features for combo: hog_pca
   ⏳ Processing train (1440 images...)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1440/1440 [00:03<00:00, 401.85it/s]


   ✅ train feature shape: (1440, 8100)
   ⏳ Processing validation (360 images...)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [00:00<00:00, 396.84it/s]


   ✅ validation feature shape: (360, 8100)
   Applying StandardScaler...
   Applying PCA...
   💾 Saved PCA model to ../data/features/NEU-DET/hog_pca/pca_model.pkl
   💾 Saved combo 'hog_pca' to ../data/features/NEU-DET/hog_pca

🔄 Extracting features for combo: hog_edge_pca
   ⏳ Processing train (1440 images...)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1440/1440 [00:03<00:00, 363.58it/s]


   ✅ train feature shape: (1440, 24484)
   ⏳ Processing validation (360 images...)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [00:01<00:00, 359.65it/s]


   ✅ validation feature shape: (360, 24484)
   Applying StandardScaler...
   Applying PCA...
   💾 Saved PCA model to ../data/features/NEU-DET/hog_edge_pca/pca_model.pkl
   💾 Saved combo 'hog_edge_pca' to ../data/features/NEU-DET/hog_edge_pca


In [7]:
# sanity check 
combo_to_check = "hog_edge_pca"
combo_path = os.path.join(OUTPUT_ROOT, combo_to_check)

X_train = joblib.load(os.path.join(combo_path, "X_train.pkl"))
y_train = joblib.load(os.path.join(combo_path, "y_train.pkl"))

print("Loaded combo: ", combo_to_check)
print("Feature shape: ", X_train.shape)
print("Label sample: ", y_train[:6])

Loaded combo:  hog_edge_pca
Feature shape:  (1440, 100)
Label sample:  ['pitted_surface' 'pitted_surface' 'pitted_surface' 'pitted_surface'
 'pitted_surface' 'pitted_surface']
