In [4]:
import os
import shutil

In [45]:
# Directory names used by the cells below (all relative paths).
SOURCE_DIR = "SUNRGBD"  # tree that is walked looking for scene.txt files
OUTPUT_DIR = "dataset"  # receives one sub-folder per accepted scene label
SPLITTED_OUTPUT_DIR = "splitted_dataset"  # output folder handed to splitfolders.ratio

In [6]:
# Keys are the scene labels accepted by the extraction loop; every value
# currently equals its key.
VALID_CLASSES = {
    scene: scene
    for scene in ("bedroom", "bathroom", "kitchen", "living_room")
}

In [34]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

# File extensions accepted when picking a frame from a sample's image/ folder.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

# Walk SOURCE_DIR; every directory holding a scene.txt is treated as one sample.
# One image per sample is copied into OUTPUT_DIR/<class>/.
for root, _dirs, files in os.walk(SOURCE_DIR):
    if "scene.txt" not in files:
        continue

    # The scene label is the whole content of scene.txt, normalised to lower case.
    with open(os.path.join(root, "scene.txt"), "r") as f:
        label = f.read().strip().lower()

    if label not in VALID_CLASSES:
        continue

    image_dir = os.path.join(root, "image")
    if not os.path.isdir(image_dir):
        continue

    # os.listdir() returns entries in arbitrary order and can contain hidden or
    # non-image files (e.g. .DS_Store); filter and sort so the chosen image is
    # a real image file and the choice is reproducible across runs.
    images = sorted(
        name
        for name in os.listdir(image_dir)
        if not name.startswith(".") and name.lower().endswith(IMAGE_EXTENSIONS)
    )
    if not images:
        continue

    img_path = os.path.join(image_dir, images[0])

    # Use the mapped class name so VALID_CLASSES can rename labels if needed.
    out_dir = os.path.join(OUTPUT_DIR, VALID_CLASSES[label])
    os.makedirs(out_dir, exist_ok=True)

    # Keep the source file's extension instead of always writing ".jpg".
    # NOTE(review): the name is derived from the sample directory's basename
    # only, so two samples with the same basename under different parents
    # would overwrite each other - confirm basenames are unique in SOURCE_DIR.
    extension = os.path.splitext(images[0])[1].lower()
    new_name = f"{os.path.basename(root)}{extension}"
    shutil.copy(img_path, os.path.join(out_dir, new_name))

In [35]:
from PIL import Image

In [52]:
def clean_corrupted_images(folder):
    """Delete every file under ``folder`` that Pillow cannot open and verify.

    The directory tree is walked recursively. Each file is opened with
    ``Image.open`` and checked with ``verify()``; when either step raises, the
    error and the path are printed and the file is removed. Files that are not
    images at all (for example ``.DS_Store``) fail to open and are therefore
    removed as well.

    Args:
        folder: Directory to scan. ``os.walk`` yields nothing for a path that
            does not exist, so a missing directory is a no-op.

    Returns:
        None.
    """
    for root, _, files in os.walk(folder):
        for f in files:
            path = os.path.join(root, f)
            try:
                # The context manager closes the underlying file handle even
                # when verify() raises, so no handles leak across a large
                # dataset and the file is closed before os.remove() runs.
                with Image.open(path) as img:
                    img.verify()
            except Exception as error:
                print(error)
                print("Removing:", path)
                os.remove(path)

# Clean the per-class dataset first, then the split copy. os.walk yields
# nothing for a directory that does not exist, so the second pass does nothing
# until the split folder has been created.
for folder in (OUTPUT_DIR, SPLITTED_OUTPUT_DIR):
    clean_corrupted_images(folder)

cannot identify image file 'splitted_dataset/test/.DS_Store'
Removing: splitted_dataset/test/.DS_Store


In [37]:
!pip install opencv-python



In [38]:
import cv2
import numpy as np
import os

def is_too_dark(img_path, threshold=25):
    """Report whether an image's mean grayscale intensity is below ``threshold``.

    Args:
        img_path: Path of the image file; it is read with ``cv2.imread`` in
            grayscale mode.
        threshold: Mean-intensity cutoff. Defaults to 25.

    Returns:
        bool: True when the mean pixel value is strictly below ``threshold``.
        False when it is not, and also when the file cannot be decoded, so an
        unreadable file is never reported as dark.
    """
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals a missing or undecodable file by returning None
    # instead of raising; without this guard img.mean() fails with
    # AttributeError.
    if img is None:
        return False
    return bool(img.mean() < threshold)

# Remove every image in the per-class folders that is_too_dark() flags.
for cls in os.listdir(OUTPUT_DIR):
    # Build the class path from OUTPUT_DIR instead of a hard-coded "dataset"
    # so the loop lists and cleans the same directory.
    cls_path = os.path.join(OUTPUT_DIR, cls)
    # Stray top-level files (e.g. .DS_Store) are not class folders; calling
    # os.listdir() on them would raise NotADirectoryError.
    if not os.path.isdir(cls_path):
        continue
    for img in os.listdir(cls_path):
        img_path = os.path.join(cls_path, img)
        if is_too_dark(img_path):
            os.remove(img_path)
            print("Removing:", img_path)

Removing: dataset/bedroom/NYU1040.jpg
Removing: dataset/bedroom/0001540-000051594771.jpg
Removing: dataset/bedroom/NYU1149.jpg
Removing: dataset/bedroom/0001020-000034163159.jpg
Removing: dataset/bedroom/0000485-000016286857.jpg
Removing: dataset/bathroom/0000956-000032017365.jpg
Removing: dataset/bathroom/0000496-000016600005.jpg
Removing: dataset/living_room/NYU0603.jpg
Removing: dataset/living_room/0001252-000085021004.jpg
Removing: dataset/living_room/NYU0154.jpg
Removing: dataset/kitchen/0005957-001031848999.jpg


In [39]:
import imagehash

In [40]:
!pip install imagehash



In [41]:
import imagehash

In [42]:
# Remove perceptual duplicates: the first image seen for a given pHash is kept
# and every later image with the same hash is deleted, including matches that
# sit in a different class folder.
hashes = {}
# Sorted iteration makes "first seen" deterministic; os.listdir() order is
# arbitrary, so which duplicate survived could otherwise differ between runs.
for cls in sorted(os.listdir(OUTPUT_DIR)):
    cls_dir = f"{OUTPUT_DIR}/{cls}"
    # Skip stray top-level files (e.g. .DS_Store) that are not class folders.
    if not os.path.isdir(cls_dir):
        continue
    for img in sorted(os.listdir(cls_dir)):
        path = f"{cls_dir}/{img}"
        # Close the file as soon as the hash is computed so handles do not
        # accumulate and the file is not open when os.remove() runs.
        with Image.open(path) as opened:
            h = imagehash.phash(opened)
        if h in hashes:
            os.remove(path)
            print("Removing:", path)
        else:
            hashes[h] = path

In [49]:
!pip install split-folders

Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl.metadata (6.2 kB)
Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1


In [50]:
import splitfolders

# Split the per-class folders in OUTPUT_DIR into subsets written under
# SPLITTED_OUTPUT_DIR. Per the split-folders documentation the ratio tuple is
# (train, val, test); the fixed seed keeps the shuffle reproducible.
splitfolders.ratio(
    input=OUTPUT_DIR,
    output=SPLITTED_OUTPUT_DIR,
    ratio=(0.7, 0.2, 0.1),
    seed=42,
)