In [None]:
import os
import cv2
import pandas as pd
import numpy as np
from PIL import Image
from skimage.measure import shannon_entropy
from tqdm import tqdm

def compute_image_stats(image_path):
    """Compute grayscale brightness statistics for a single image file.

    The image is loaded with OpenCV, converted to grayscale, and summarised
    by its mean intensity, standard deviation, and Shannon entropy.

    Args:
        image_path: Path of the image file to read.

    Returns:
        dict: Always contains the keys ``"valid"``, ``"mean"``, ``"std"`` and
        ``"entropy"``. On success ``"valid"`` is True and the statistics are
        plain floats. When the image cannot be loaded or processed,
        ``"valid"`` is False and the three statistics are NaN, so every
        record shares one schema and a frame built from the records always
        has the statistic columns (even if no image was readable).
    """
    nan = float("nan")
    invalid = {"valid": False, "mean": nan, "std": nan, "entropy": nan}

    try:
        img = cv2.imread(image_path, cv2.IMREAD_COLOR)
        # imread reports an unreadable file by returning None instead of raising.
        if img is None or img.size == 0:
            return invalid

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        return {
            "valid": True,
            "mean": float(gray.mean()),
            "std": float(gray.std()),
            # Cast so all three statistics are plain Python floats.
            "entropy": float(shannon_entropy(gray)),
        }
    except Exception:
        # Deliberate best-effort: one broken file must not abort a scan of
        # thousands of images; the failure is reported through valid=False.
        return invalid

def scan_folder(folder, extensions=(".jpg",)):
    """Compute image statistics for every matching file in ``folder``.

    Args:
        folder: Directory whose direct entries are scanned (no recursion).
        extensions: File-name suffix, or iterable of suffixes, to include.
            Matching is case-insensitive. Defaults to ``(".jpg",)``, which
            selects the same files as before.

    Returns:
        pandas.DataFrame: One row per matching file, holding the values
        returned by ``compute_image_stats`` plus a ``"filename"`` column.
        The columns ``valid``, ``mean``, ``std``, ``entropy`` and
        ``filename`` are always present, even when nothing matched, so
        downstream column lookups do not fail on an empty folder.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    # Filtering first makes the progress bar count images only.
    filenames = sorted(
        name for name in os.listdir(folder) if name.lower().endswith(suffixes)
    )

    records = []
    for filename in tqdm(filenames):
        stats = compute_image_stats(os.path.join(folder, filename))
        stats["filename"] = filename
        records.append(stats)

    df = pd.DataFrame(records)
    # Guarantee the expected schema (missing columns are filled with NaN)
    # while keeping any additional columns the records may carry.
    expected = ["valid", "mean", "std", "entropy", "filename"]
    extras = [col for col in df.columns if col not in expected]
    return df.reindex(columns=expected + extras)

def flag_outliers(df, low_std=5, dark_mean=30, bright_mean=240, low_entropy=2.0):
    """Add boolean outlier flags based on per-image brightness statistics.

    The input frame is left untouched; flags are added to a copy, so the
    cell calling this function can be re-run safely.

    Args:
        df: DataFrame with numeric ``"mean"``, ``"std"`` and ``"entropy"``
            columns.
        low_std: Rows with ``std`` strictly below this get ``flag_low_std``.
        dark_mean: Rows with ``mean`` strictly below this get ``flag_dark``.
        bright_mean: Rows with ``mean`` strictly above this get
            ``flag_bright``.
        low_entropy: Rows with ``entropy`` strictly below this get
            ``flag_low_entropy``.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the four ``flag_*`` columns and
        a ``"suspect"`` column that is True when any flag is set.

    Raises:
        KeyError: If one of the required statistic columns is missing.
    """
    missing = [col for col in ("mean", "std", "entropy") if col not in df.columns]
    if missing:
        raise KeyError(f"flag_outliers requires columns missing from df: {missing}")

    flagged = df.copy()
    # Comparisons against NaN evaluate to False, so rows without statistics
    # (e.g. unreadable images) are never flagged here; check "valid" for those.
    flagged["flag_low_std"] = flagged["std"] < low_std
    flagged["flag_dark"] = flagged["mean"] < dark_mean
    flagged["flag_bright"] = flagged["mean"] > bright_mean
    flagged["flag_low_entropy"] = flagged["entropy"] < low_entropy

    flag_columns = ["flag_low_std", "flag_dark", "flag_bright", "flag_low_entropy"]
    flagged["suspect"] = flagged[flag_columns].any(axis=1)
    return flagged

In [None]:
folder_path = "/home/masc12/dev/masc12-mthesis/data/poc_multiple/wiler-turm-resized"  # Change this


def _capture_hour(name):
    """Return the integer before the first "-" in the second "_"-separated field of ``name``."""
    second_field = name.split("_")[1]
    return int(second_field.split("-")[0])


# Collect per-image statistics, then attach the outlier flags.
df = flag_outliers(scan_folder(folder_path))
# NOTE(review): presumably filenames look like <date>_<HH>-<MM>-...; confirm against the dataset.
df["hour"] = df["filename"].apply(_capture_hour)
print(f"\n✅ Done. {df['suspect'].sum()} suspect images found out of {len(df)}")

100%|██████████| 16580/16580 [00:43<00:00, 380.25it/s]


✅ Done. 0 suspect images found out of 16580





In [41]:
# Suspect images whose hour lies strictly between 5 and 23, ordered by filename.
is_daytime = (df["hour"] > 5) & (df["hour"] < 23)
is_suspect = df["suspect"] == True
df[is_daytime & is_suspect].sort_values("filename")

Unnamed: 0,valid,mean,std,entropy,filename,flag_low_std,flag_dark,flag_bright,flag_low_entropy,suspect,hour
