In [6]:
import os
import zipfile
import h5py
import shutil
import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from sklearn.cluster import KMeans

In [16]:
# --- Paths (left empty here; fill in before running the notebook) ---
ZIP_FILE_PATH = r""    # archive handed to extract_zip()
EXTRACTION_PATH = r""  # directory the archive is unpacked into and read from
OUTPUT_PATH = r""      # directory that receives the generated subset

# --- Subset definition ---
SELECTED_CLASSES = [
    "pizza",
    "sushi",
    "ice_cream",
    "fried_rice",
]
IMAGES_PER_CLASS = 50  # target number of images copied for each class

# --- Feature extraction / clustering ---
IMAGE_SIZE = (224, 224)  # size images are resized to before the CNN
N_CLUSTERS = 5           # number of K-Means clusters built per class

In [8]:
def extract_zip(zip_path, extraction_path):
    """Unpack a zip archive into a directory, creating the directory if needed.

    Args:
        zip_path: Path of the ``.zip`` archive to read.
        extraction_path: Destination directory. Missing parent directories
            are created; an already existing directory is reused.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    # exist_ok=True replaces the separate exists() check, which was racy
    # (the directory could appear between the check and the creation).
    os.makedirs(extraction_path, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extraction_path)
    print(f"Files extracted to {extraction_path}")


In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Pretrained ResNet-50 with its last child module (the fully connected layer)
# removed, so a forward pass yields features instead of class scores.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` -- confirm against the installed version before changing it.
model = nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-1]).to(device)

# Inference mode for layers such as batch-norm; as the last expression of the
# cell this also displays the module.
model.eval()

In [10]:
# Preprocessing applied to every image (a NumPy array coming from OpenCV)
# before it is fed to the network.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),        # array -> PIL image
        transforms.Resize(IMAGE_SIZE),  # resize to the configured input size
        transforms.ToTensor(),          # PIL image -> tensor
        # Per-channel normalisation; these are the commonly used ImageNet
        # statistics -- presumably matching the pretrained weights.
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


In [11]:
def extract_features(image_path):
    """Compute the CNN feature vector of one image file.

    The image is read with OpenCV, converted from BGR to RGB, run through the
    module-level ``transform`` and passed to the module-level ``model`` on
    ``device`` with gradients disabled.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The model output for this image as a NumPy array with all
        size-1 dimensions removed.

    Raises:
        ValueError: If OpenCV cannot read the file.
    """
    bgr_pixels = cv2.imread(image_path)
    if bgr_pixels is None:
        raise ValueError(f"Unable to read image: {image_path}")

    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
    # unsqueeze(0) adds the leading batch dimension for a single image.
    batch = transform(rgb_pixels).unsqueeze(0).to(device)

    with torch.no_grad():
        embedding = model(batch)
    return embedding.squeeze().cpu().numpy()


In [12]:
def list_h5_files(extraction_path):
    """Return the paths of all ``.h5`` files found under a directory tree.

    Args:
        extraction_path: Root directory that is walked recursively.

    Returns:
        A list of paths (root joined with file name) in ``os.walk`` order;
        empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(extraction_path)
        for name in names
        if name.endswith(".h5")
    ]


In [13]:
def read_h5_file(h5_file_path):
    """Load every top-level dataset of an HDF5 file into memory.

    Args:
        h5_file_path: Path of the HDF5 file to open read-only.

    Returns:
        A dict mapping each top-level dataset name to its full contents.
        Top-level entries that are not datasets (e.g. groups) are skipped
        and reported on stdout.

    Note:
        Every dataset is read completely, so memory use grows with the
        size of the file.
    """
    with h5py.File(h5_file_path, 'r') as h5_file:
        print(f"Reading {h5_file_path}:")
        metadata = {}
        for key in h5_file.keys():
            entry = h5_file[key]
            if not isinstance(entry, h5py.Dataset):
                # Groups cannot be sliced; skip them instead of crashing.
                print(f"Skipping non-dataset entry: {key}")
                continue
            # [()] reads the whole dataset and, unlike [:], also works for
            # scalar (zero-dimensional) datasets.
            metadata[key] = entry[()]
        return metadata


In [14]:
def _select_diverse_indices(features, n_select, n_clusters, rng, random_state=42):
    """Choose up to ``n_select`` row indices of ``features`` spread over K-Means clusters.

    Args:
        features: Non-empty 2-D array, one feature vector per row.
        n_select: Desired number of indices (capped at the number of rows).
        n_clusters: Desired number of clusters (capped at the number of rows).
        rng: ``numpy.random.Generator`` used for all shuffling.
        random_state: Seed passed to ``KMeans``.

    Returns:
        A list of distinct integer row indices.
    """
    n_samples = len(features)
    n_select = min(n_select, n_samples)
    # KMeans rejects n_clusters > n_samples, so cap it.
    n_clusters = max(1, min(n_clusters, n_samples))
    per_cluster = n_select // n_clusters

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    cluster_labels = kmeans.fit_predict(features)

    chosen = []
    spare = []
    for cluster_id in range(n_clusters):
        members = np.where(cluster_labels == cluster_id)[0]
        rng.shuffle(members)
        chosen.extend(members[:per_cluster].tolist())
        spare.extend(members[per_cluster:].tolist())

    # Small clusters or a quota that does not divide evenly leave a shortfall;
    # top it up at random from the images not picked yet.
    shortfall = n_select - len(chosen)
    if shortfall > 0:
        chosen.extend(rng.permutation(spare)[:shortfall].tolist())
    return chosen


def create_subset(seed=42):
    """Build a small, visually diverse subset of the extracted dataset.

    For every class in ``SELECTED_CLASSES`` the images under
    ``EXTRACTION_PATH/images/<class>`` are embedded with ``extract_features``,
    clustered with K-Means, and up to ``IMAGES_PER_CLASS`` images drawn evenly
    from the clusters are copied to ``OUTPUT_PATH/images/<class>``. Finally
    the ``meta`` directory, if present, is copied alongside.

    Args:
        seed: Seed for both K-Means and the random picks, so repeated runs
            on the same files select the same images.

    Raises:
        FileNotFoundError: If a class directory does not exist.
    """
    rng = np.random.default_rng(seed)

    for class_name in SELECTED_CLASSES:
        class_path = os.path.join(EXTRACTION_PATH, "images", class_name)
        output_class_path = os.path.join(OUTPUT_PATH, "images", class_name)
        os.makedirs(output_class_path, exist_ok=True)

        features = []
        image_paths = []
        # sorted(): os.listdir order is filesystem-dependent, which would
        # otherwise make the selection differ between machines.
        for img_name in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, img_name)
            try:
                feat = extract_features(img_path)
            except Exception as e:
                # Best effort: unreadable files are reported and skipped.
                print(f"Error processing {img_path}: {e}")
                continue
            features.append(feat)
            image_paths.append(img_path)

        if not image_paths:
            # Nothing to cluster; KMeans would fail on an empty array.
            print(f"Class {class_name}: 0 images selected.")
            continue

        selected_indices = _select_diverse_indices(
            np.array(features), IMAGES_PER_CLASS, N_CLUSTERS, rng, random_state=seed
        )
        selected_images = [image_paths[i] for i in selected_indices]

        for img_path in selected_images:
            shutil.copy(img_path, output_class_path)

        print(f"Class {class_name}: {len(selected_images)} images selected.")

    meta_path = os.path.join(EXTRACTION_PATH, "meta")
    output_meta_path = os.path.join(OUTPUT_PATH, "meta")
    if os.path.exists(meta_path):
        shutil.copytree(meta_path, output_meta_path, dirs_exist_ok=True)
        print(f"Metadata copied to {output_meta_path}")


In [17]:
if __name__ == "__main__":
    # Pipeline driver: unpack the archive, peek at the first HDF5 file found
    # (if any), then build the per-class image subset.
    extract_zip(ZIP_FILE_PATH, EXTRACTION_PATH)

    h5_files = list_h5_files(EXTRACTION_PATH)
    if h5_files:
        # Only the key names are printed here, but read_h5_file loads the
        # contents of the file into memory to produce them.
        metadata = read_h5_file(h5_files[0])
        print(f"Example metadata: {list(metadata.keys())}")

    create_subset()


Files extracted to D:\AI projects\calorie_counter\food101
Reading D:\AI projects\calorie_counter\food101\food_c101_n1000_r384x384x3.h5:
Example metadata: ['category', 'category_names', 'images']
Class pizza: 50 images selected.
Class sushi: 50 images selected.
Class ice_cream: 50 images selected.
Class fried_rice: 50 images selected.
Metadata copied to D:\AI projects\calorie_counter\food101-subset\meta
