In [13]:
import os
import cv2
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from skimage.feature import canny
from skimage.measure import shannon_entropy

In [14]:

# Load image in grayscale 

def load_gray(file_path):
    """Read the image stored at ``file_path`` as a single-channel grayscale array.

    The result of ``cv2.imread`` is passed through unchanged.

    NOTE(review): OpenCV documents that ``imread`` yields ``None`` instead of
    raising when a file is missing or cannot be decoded, so callers should
    check the return value before using it.
    """
    return cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)

In [15]:
# Extract features from each image 

def extract_features(img, file_path, class_label, pixel_density):
    """Build one metadata row describing a single grayscale image.

    Parameters
    ----------
    img : array with a two-element ``shape`` (rows, columns)
        Pixel data of the image.
    file_path : str
        Location of the image on disk; used for the base name and file size.
    class_label, pixel_density
        Passed through unchanged into the row.

    Returns
    -------
    list
        ``[file_name, class_label, pixel_density, width, height,
        aspect_ratio, file_size_kb, mean_intensity, std_intensity,
        skewness, kurtosis, entropy, edge_density]``.
    """
    # Geometry and on-disk size
    n_rows, n_cols = img.shape
    ratio = n_cols / n_rows
    size_kb = os.path.getsize(file_path) / 1024

    # First- to fourth-order intensity statistics over all pixels
    pixels = img.flatten()
    intensity_mean = np.mean(img)
    intensity_std = np.std(img)
    intensity_skew = skew(pixels)
    intensity_kurt = kurtosis(pixels)

    # Information content and the fraction of pixels marked as edges by Canny
    info_entropy = shannon_entropy(img)
    edge_mask = canny(img)
    edge_fraction = np.sum(edge_mask) / (n_cols * n_rows)

    # Keyed by column name so the ordering is explicit; dicts keep insertion order
    row = {
        "file_name": os.path.basename(file_path),
        "class_label": class_label,
        "pixel_density": pixel_density,
        "width": n_cols,
        "height": n_rows,
        "aspect_ratio": ratio,
        "file_size_kb": size_kb,
        "mean_intensity": intensity_mean,
        "std_intensity": intensity_std,
        "skewness": intensity_skew,
        "kurtosis": intensity_kurt,
        "entropy": info_entropy,
        "edge_density": edge_fraction,
    }
    return list(row.values())


In [16]:
# Main function: Build metadata 

def build_metadata(root_folder):
    """Tabulate image features for every ``.tif`` under ``root_folder``.

    The expected layout is ``root_folder/<class>/<pixel_density>/<image>.tif``.
    Entries that are not directories at the class or pixel-density level are
    ignored, as are files whose name does not end in ``.tif``
    (case-insensitive).

    Images that cannot be loaded (``load_gray`` returned ``None``) are skipped
    with a message instead of aborting the whole scan, so one corrupt file
    does not throw away the rows already collected.

    Parameters
    ----------
    root_folder : str
        Directory containing one sub-folder per class.

    Returns
    -------
    pandas.DataFrame
        One row per readable image; empty (but with all columns) when no
        image was found.
    """
    columns = [
        "file_name", "class_label", "pixel_density", "width", "height",
        "aspect_ratio", "file_size_kb", "mean_intensity", "std_intensity",
        "skewness", "kurtosis", "entropy", "edge_density"
    ]

    all_features = []

    for class_folder in os.listdir(root_folder):
        class_path = os.path.join(root_folder, class_folder)
        if not os.path.isdir(class_path):
            continue

        for pixel_folder in os.listdir(class_path):
            pixel_path = os.path.join(class_path, pixel_folder)
            if not os.path.isdir(pixel_path):
                continue

            for file_name in tqdm(os.listdir(pixel_path), desc=f"{class_folder}/{pixel_folder}"):
                if not file_name.lower().endswith(".tif"):
                    continue

                file_path = os.path.join(pixel_path, file_name)
                img = load_gray(file_path)
                if img is None:
                    # An unreadable/corrupt file would otherwise crash
                    # extract_features on img.shape; tqdm.write keeps the
                    # progress bar intact while reporting the skip.
                    tqdm.write(f"Skipping unreadable image: {file_path}")
                    continue

                features = extract_features(img, file_path, class_folder, pixel_folder)
                all_features.append(features)

    return pd.DataFrame(all_features, columns=columns)

In [24]:
# Run the function 

#root_folder = r"C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon120-1-20251104T122320Z-1-001.zip"

#metadata_df = build_metadata(root_folder)

In [18]:
# Display Output 

# Report the outcome honestly: an empty table means no rows were produced,
# so do not claim success in that case.
if metadata_df.empty:
    print("\n Metadata table is empty - no image rows were produced.\n")
else:
    print("\n Metadata table generated successfully!\n")
print(metadata_df.head())  # show first few rows as a table


 Metadata table generated successfully!

Empty DataFrame
Columns: [file_name, class_label, pixel_density, width, height, aspect_ratio, file_size_kb, mean_intensity, std_intensity, skewness, kurtosis, entropy, edge_density]
Index: []


In [None]:
import os
import zipfile
import pandas as pd
from tqdm import tqdm

def extract_zip(zip_path):
    """Extract a ZIP archive next to itself (once) and return the folder path.

    The target folder is the archive path with its final ``.zip`` extension
    removed; the extension is matched case-insensitively so ``X.ZIP`` works
    too. If the path has no ``.zip`` extension, ``"_extracted"`` is appended
    so the target never collides with the archive file itself.

    Extraction is skipped when the target path already exists.

    Parameters
    ----------
    zip_path : str
        Path to the archive.

    Returns
    -------
    str
        Path of the folder holding the extracted contents.
    """
    # splitext only touches the last extension; str.replace('.zip', '') would
    # also mangle any directory name containing '.zip' and miss '.ZIP'.
    stem, ext = os.path.splitext(zip_path)
    extract_to = stem if ext.lower() == ".zip" else zip_path + "_extracted"

    if not os.path.exists(extract_to):
        print(f"Extracting: {zip_path}")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
    return extract_to


def build_metadata_for_all_zips(main_folder):
    """Build one combined metadata table from every ZIP archive in ``main_folder``.

    Each file whose name ends in ``.zip`` (case-insensitive) is extracted via
    ``extract_zip`` and scanned with ``build_metadata``; the per-archive
    tables are concatenated with a fresh index.

    Returns an empty, column-less DataFrame (after printing a notice) when
    the folder contains no ZIP files.
    """
    archive_names = [
        entry for entry in os.listdir(main_folder)
        if entry.lower().endswith(".zip")
    ]

    # Guard clause: nothing to process
    if not archive_names:
        print("No ZIP files found.")
        return pd.DataFrame()

    # Extract and scan each archive in directory-listing order
    per_archive_tables = [
        build_metadata(extract_zip(os.path.join(main_folder, entry)))
        for entry in archive_names
    ]
    return pd.concat(per_archive_tables, ignore_index=True)
    
   


NameError: name 'main_folder' is not defined

In [23]:
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
main_folder = r"C:\Users\Rishabh\OneDrive\Desktop\Dataset"
metadata_df = build_metadata_for_all_zips(main_folder)

# Only write the CSV and report success when rows were actually produced;
# build_metadata_for_all_zips returns an empty frame when it finds no ZIPs.
if metadata_df.empty:
    print("⚠️ No metadata rows were produced; 'metadata_all.csv' was not written.")
else:
    metadata_df.to_csv("metadata_all.csv", index=False)
    print("✅ Metadata created for all ZIPs and saved as 'metadata_all.csv'")


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon120-1-20251104T122320Z-1-001.zip


Canon120-1/150: 100%|██████████| 100/100 [00:38<00:00,  2.58it/s]
Canon120-1/300: 100%|██████████| 79/79 [02:19<00:00,  1.76s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon120-2-20251104T130247Z-1-001.zip


Canon120-2/150: 100%|██████████| 84/84 [00:30<00:00,  2.72it/s]
Canon120-2/300: 100%|██████████| 66/66 [01:56<00:00,  1.77s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon120-2-20251104T130247Z-1-002.zip


Canon120-2/150: 100%|██████████| 16/16 [00:06<00:00,  2.47it/s]
Canon120-2/300: 100%|██████████| 34/34 [00:59<00:00,  1.74s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon220-20251104T125653Z-1-002.zip


Canon220/300: 100%|██████████| 37/37 [01:05<00:00,  1.77s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon220-20251104T144155Z-1-001.zip


Canon220/150: 100%|██████████| 23/23 [00:09<00:00,  2.52it/s]
Canon220/300: 100%|██████████| 84/84 [02:34<00:00,  1.84s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon9000-1-20251104T130957Z-1-001.zip


Canon9000-1/150: 100%|██████████| 52/52 [00:21<00:00,  2.40it/s]
Canon9000-1/300: 100%|██████████| 94/94 [02:52<00:00,  1.84s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon9000-1-20251104T130957Z-1-002.zip


Canon9000-1/150: 100%|██████████| 48/48 [00:17<00:00,  2.73it/s]
Canon9000-1/300: 100%|██████████| 6/6 [00:10<00:00,  1.74s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon9000-2-20251104T131609Z-1-001.zip


Canon9000-2/150: 100%|██████████| 9/9 [00:03<00:00,  2.52it/s]
Canon9000-2/300: 100%|██████████| 86/86 [02:30<00:00,  1.76s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\Canon9000-2-20251104T131609Z-1-002.zip


Canon9000-2/150: 100%|██████████| 91/91 [00:33<00:00,  2.70it/s]
Canon9000-2/300: 100%|██████████| 14/14 [00:24<00:00,  1.74s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV370-1-20251104T133729Z-1-001.zip


EpsonV370-1/150: 100%|██████████| 10/10 [00:03<00:00,  2.53it/s]
EpsonV370-1/300: 100%|██████████| 83/83 [02:36<00:00,  1.88s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV370-1-20251104T133729Z-1-002.zip


EpsonV370-1/150: 100%|██████████| 90/90 [00:38<00:00,  2.34it/s]
EpsonV370-1/300: 100%|██████████| 17/17 [00:33<00:00,  1.99s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV370-2-20251104T134827Z-1-001.zip


EpsonV370-2/150: 100%|██████████| 100/100 [00:43<00:00,  2.27it/s]
EpsonV370-2/300: 100%|██████████| 71/71 [01:43<00:00,  1.46s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV370-2-20251104T134827Z-1-002.zip


EpsonV370-2/300: 100%|██████████| 29/29 [00:45<00:00,  1.57s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV39-1-20251104T132145Z-1-001.zip


EpsonV39-1/150: 100%|██████████| 100/100 [00:33<00:00,  2.95it/s]
EpsonV39-1/300: 100%|██████████| 63/63 [01:32<00:00,  1.46s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV39-1-20251104T132145Z-1-002.zip


EpsonV39-1/300: 100%|██████████| 37/37 [01:08<00:00,  1.86s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV39-2-20251104T133029Z-1-001.zip


EpsonV39-2/150: 100%|██████████| 21/21 [00:07<00:00,  2.64it/s]
EpsonV39-2/300: 100%|██████████| 87/87 [02:05<00:00,  1.44s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV39-2-20251104T133029Z-1-002.zip


EpsonV39-2/150: 100%|██████████| 79/79 [00:32<00:00,  2.44it/s]
EpsonV39-2/300: 100%|██████████| 13/13 [00:18<00:00,  1.42s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV550-20251104T135923Z-1-001.zip


EpsonV550/150: 100%|██████████| 8/8 [00:03<00:00,  2.06it/s]
EpsonV550/300: 100%|██████████| 81/81 [01:56<00:00,  1.44s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\EpsonV550-20251104T135923Z-1-002.zip


EpsonV550/150: 100%|██████████| 92/92 [00:34<00:00,  2.68it/s]
EpsonV550/300: 100%|██████████| 19/19 [00:25<00:00,  1.36s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\HP-20251104T141025Z-1-001.zip


HP/150: 100%|██████████| 74/74 [00:31<00:00,  2.38it/s]
HP/300: 100%|██████████| 71/71 [01:41<00:00,  1.42s/it]


Extracting: C:\Users\Rishabh\OneDrive\Desktop\Dataset\HP-20251104T141025Z-1-002.zip


HP/150: 100%|██████████| 26/26 [00:11<00:00,  2.18it/s]
HP/300: 100%|██████████| 29/29 [00:45<00:00,  1.56s/it]

✅ Metadata created for all ZIPs and saved as 'metadata_all.csv'



