In [2]:
!pip install mahotas

Collecting mahotas
  Downloading mahotas-1.4.18-cp310-cp310-win_amd64.whl.metadata (14 kB)
Downloading mahotas-1.4.18-cp310-cp310-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
   ------------ --------------------------- 0.5/1.7 MB 2.1 MB/s eta 0:00:01
   ------------------ --------------------- 0.8/1.7 MB 1.5 MB/s eta 0:00:01
   ------------------------ --------------- 1.0/1.7 MB 1.4 MB/s eta 0:00:01
   ------------------------------ --------- 1.3/1.7 MB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 1.3 MB/s eta 0:00:00
Installing collected packages: mahotas
Successfully installed mahotas-1.4.18


In [8]:
import numpy as np
import cv2
import mahotas
import os
import pickle
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Image directory. Set the IMDIR environment variable to point at another
# dataset location; otherwise the original local path below is used.
IMDIR = os.environ.get("IMDIR", r"C:\Users\prave\Final Year project\data")

# Number of visual words (k-means clusters) in the bag-of-visual-words codebook.
DICT_SIZE = 600

# Output file paths, derived from DICT_SIZE so the suffix always matches it.
BOVW = f"model/bovw_codebook_{DICT_SIZE}.pickle"
DATA = f"model/data_{DICT_SIZE}.npy"
LABEL = f"model/label_{DICT_SIZE}.npy"

# Feature extractors
def fd_hu_moments(image):
    """Return the Hu invariant moments of a BGR image as a flat 1-D array.

    The image is converted to grayscale first and the moments are computed
    on that single-channel version.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    raw_moments = cv2.moments(grayscale)
    hu = cv2.HuMoments(raw_moments)
    return hu.flatten()

def fd_haralick(image):
    """Return Haralick texture features of a BGR image.

    The features are computed on the grayscale image and then averaged
    along the first axis of mahotas' output.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    texture = mahotas.features.haralick(grayscale)
    return texture.mean(axis=0)

def fd_histogram(image, bins=8):
    """Return a normalised, flattened 3-D HSV colour histogram of a BGR image.

    Parameters
    ----------
    image : BGR image (8-bit, as returned by ``cv2.imread``).
    bins : number of bins per HSV channel. The default of 8 yields
        8 * 8 * 8 = 512 values, the same length as before.

    Returns
    -------
    1-D array of length ``bins ** 3``.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # For 8-bit images OpenCV stores hue in [0, 180), not [0, 256). Using 256
    # as the hue upper bound leaves the highest hue bins permanently empty,
    # wasting part of the feature vector.
    hist = cv2.calcHist(
        [hsv], [0, 1, 2], None,
        [bins, bins, bins],
        [0, 180, 0, 256, 0, 256],
    )
    cv2.normalize(hist, hist)
    return hist.flatten()

def feature_extract(im):
    """Compute the bag-of-visual-words descriptor of a BGR image.

    Uses the module-level ``sift`` detector for keypoints and the
    module-level ``bowDiction`` extractor for the descriptor. When the
    extractor yields nothing, a zero vector of length DICT_SIZE is returned.
    """
    grayscale = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    keypoints = sift.detect(grayscale)
    bow_hist = bowDiction.compute(grayscale, keypoints)
    if bow_hist is None:
        return np.zeros(DICT_SIZE)
    return bow_hist.squeeze()

# SIFT detector and the k-means trainer that will build a DICT_SIZE-word vocabulary
sift = cv2.SIFT_create()
BOW = cv2.BOWKMeansTrainer(DICT_SIZE)

# Resolve the dataset root to an absolute path and report it
base = Path(IMDIR).resolve()
print(f"📁 Image directory: {base}")

print("🔍 Collecting descriptors...")
for file in base.glob('**/*.*'):
    fpath = file.resolve()
    image = cv2.imread(str(fpath))
    if image is None:
        # The glob matches every dotted name; anything OpenCV cannot decode is skipped
        print(f"❌ Could not read image: {fpath}")
        continue

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    kp, dsc = sift.detectAndCompute(gray, None)
    if dsc is None:
        print(f"⚠️ No descriptors found in: {fpath.name}")
        continue

    # Feed this image's SIFT descriptors to the vocabulary trainer
    BOW.add(dsc)

# Count how many SIFT descriptors were accumulated across all images.
descriptors_list = BOW.getDescriptors()
total_desc = sum(d.shape[0] for d in descriptors_list) if descriptors_list else 0
print(f"✅ Total descriptors collected: {total_desc}")

if total_desc == 0:
    print("❌ No descriptors found in any image. Exiting.")
    # Raise SystemExit explicitly instead of calling exit(): exit() is an
    # interactive helper, and under a Jupyter kernel it only requests a kernel
    # shutdown without aborting the cell, so the clustering step below would
    # still run on an empty descriptor set.
    raise SystemExit(1)

# Make sure the output directory exists *before* the expensive clustering
# step, so the vocabulary is not lost to a FileNotFoundError when saving.
Path(BOVW).parent.mkdir(parents=True, exist_ok=True)

# Cluster descriptors and create vocabulary
print("📊 Clustering descriptors to form visual vocabulary...")
dictionary = BOW.cluster()
print(f"✅ Dictionary shape: {dictionary.shape}")

# Save vocabulary
with open(BOVW, "wb") as f:
    pickle.dump(dictionary, f)

# Load vocabulary back from disk and set up the BOW extractor.
# NOTE: pickle.load can execute arbitrary code; only load codebook files
# produced by this notebook or another trusted source.
with open(BOVW, "rb") as f:
    dictionary = pickle.load(f)

# Extractor that assigns each SIFT descriptor to its nearest visual word
# (brute-force L2 matching) and builds the per-image word histogram.
sift2 = cv2.SIFT_create()
bowDiction = cv2.BOWImgDescriptorExtractor(sift2, cv2.BFMatcher(cv2.NORM_L2))
bowDiction.setVocabulary(dictionary)

print("🔎 Feature Extraction for all images...")
x_data, x_label = [], []

for file in base.glob('**/*.*'):
    fpath = file.resolve()
    image = cv2.imread(str(fpath))
    if image is None:
        print(f"❌ Could not read image: {fpath}")
        continue

    # Concatenate Hu moments, Haralick texture, HSV histogram and BoVW, in that order
    features = np.hstack([
        fd_hu_moments(image),
        fd_haralick(image),
        fd_histogram(image),
        feature_extract(image),
    ])
    x_data.append(features)
    # The label is the name of the image's parent folder, parsed as an integer
    x_label.append(int(fpath.parent.name))

# Rescale every feature column to the [0, 1] interval
scaler = MinMaxScaler(feature_range=(0, 1))
x_data = scaler.fit_transform(x_data)

# Map the raw labels to encoded class ids
encoder = LabelEncoder()
x_label = encoder.fit_transform(x_label)

# Persist the feature matrix and the label vector as .npy files
for out_path, payload in ((DATA, x_data), (LABEL, x_label)):
    np.save(out_path, np.array(payload))

print("✅ Feature extraction complete. Data saved.")


📁 Image directory: C:\Users\prave\Final Year project\data
🔍 Collecting descriptors...
✅ Total descriptors collected: 493256
📊 Clustering descriptors to form visual vocabulary...
✅ Dictionary shape: (600, 128)
🔎 Feature Extraction for all images...
✅ Feature extraction complete. Data saved.
