anomaly_dataset/normal/      # Only forest images

anomaly_dataset/anomaly/     # All other categories


In [12]:
import random, numpy as np
# Pin the global RNGs of both the stdlib `random` module and NumPy so that
# any later call drawing from them is repeatable after Restart & Run All.
SEED = 2
random.seed(SEED)
np.random.seed(SEED)
from sklearn.model_selection import train_test_split

In [13]:
from pathlib import Path
import shutil

In [14]:
# Set base directory of the dataset.
# Kept as a Path (not a plain string) so `base_dir / "<class name>"` joins work;
# os.path.join / os.listdir accept Path objects as well.
base_dir = Path("archive/seg_test/seg_test")
# Define classes: the single "normal" class and every class treated as an anomaly
normal_class = "forest"
anomaly_classes = ["buildings", "glacier", "mountain", "sea", "street"]

In [15]:
# 1) Paths & classes
# Where the class sub-folders are read from, and where the split is written to.
base_dir = Path("archive") / "seg_test" / "seg_test"
output_dir = Path("split_anomaly_dataset")

# The one class treated as "normal"; every other class counts as an anomaly.
normal_class = "forest"
anomaly_classes = [
    "buildings",
    "glacier",
    "mountain",
    "sea",
    "street",
]

# 2) Helper to copy files into a folder
def make_split(file_list, target_dir):
    """Copy every file in ``file_list`` into ``target_dir``.

    Parameters
    ----------
    file_list : iterable of str or Path
        Source files to copy. May be empty, in which case only the
        directory is created.
    target_dir : str or Path
        Destination directory; created (with parents) if it does not exist.

    Notes
    -----
    Files are normally stored under their own file name. When two *different*
    sources in ``file_list`` share a file name (e.g. ``sea/1.jpg`` and
    ``street/1.jpg`` pooled into one folder), the later one is stored as
    ``<parent folder>_<file name>`` instead of silently overwriting the
    earlier copy. Files already present in ``target_dir`` from a previous
    call are not inspected and are overwritten on a name match.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    copied = {}  # destination file name -> source path it was copied from
    for f in file_list:
        src = Path(f)
        dest_name = src.name
        # Same name but a different source: disambiguate with the parent folder.
        # The same source listed twice keeps its name (a harmless re-copy).
        if copied.get(dest_name, src) != src:
            dest_name = f"{src.parent.name}_{src.name}"
        copied[dest_name] = src
        shutil.copy(src, target_dir / dest_name)

# 3) Gather all file paths
# glob() yields entries in filesystem order, which can differ between machines
# and runs; sorting gives train_test_split a stable input so that a fixed
# random_state really pins the split.
normal_files  = sorted((base_dir / normal_class).glob("*.jpg"))
anomaly_files = []
for cls in anomaly_classes:
    anomaly_files += sorted((base_dir / cls).glob("*.jpg"))

# Fail early with a readable message instead of an opaque error from
# train_test_split when base_dir is wrong or the dataset is missing.
if not normal_files or not anomaly_files:
    raise FileNotFoundError(
        f"No .jpg images found under {base_dir} "
        f"(normal={len(normal_files)}, anomaly={len(anomaly_files)})"
    )

# One seed for every random choice made in this cell.
SPLIT_SEED = 42

# 4) Split normals: 60% train, 20% val, 20% test
n_train = int(len(normal_files) * 0.6)
train_norm, temp_norm = train_test_split(normal_files, train_size=n_train, random_state=SPLIT_SEED)
val_norm, test_norm   = train_test_split(temp_norm, test_size=0.5, random_state=SPLIT_SEED)

# 5) Split anomalies: 50% val, 50% test (no anomalies in train)
val_anom, test_anom = train_test_split(anomaly_files, test_size=0.5, random_state=SPLIT_SEED)

# ↓ only take *up to* N anomalies in each split ↓
MAX_ANOM_VAL  = 20
MAX_ANOM_TEST = 20

# A dedicated, seeded RNG (instead of the global `random` state) makes this
# cell idempotent: re-running it selects the same anomalies rather than a new
# sample that would pile up next to the previously copied files.
anom_rng  = random.Random(SPLIT_SEED)
val_anom  = anom_rng.sample(val_anom,  min(len(val_anom),  MAX_ANOM_VAL))
test_anom = anom_rng.sample(test_anom, min(len(test_anom), MAX_ANOM_TEST))

# 6) Copy into folder structure: <output_dir>/<split>/{normal,anomaly}
# NOTE: nothing here deletes files already present in output_dir; remove the
# folder manually before re-running with different split settings.
for split, normals, anoms in [
    ("train", train_norm, []),
    ("val",   val_norm,   val_anom),
    ("test",  test_norm,  test_anom),
]:
    make_split(normals, output_dir / split / "normal")
    make_split(anoms,   output_dir / split / "anomaly")

print("Dataset split complete:")
print(f"  Train → normal={len(train_norm)}, anomaly=0")
print(f"   Val  → normal={len(val_norm)}, anomaly={len(val_anom)}")
print(f"   Test → normal={len(test_norm)}, anomaly={len(test_anom)}")

Dataset split complete:
  Train → normal=284, anomaly=0
   Val  → normal=95, anomaly=20
   Test → normal=95, anomaly=20


In [16]:
import os
import shutil
from pathlib import Path
import random



# Define classes
normal_class = "forest"
anomaly_classes = ["buildings", "glacier", "mountain", "sea", "street"]

# Output directories (kept as plain strings)
output_dir = "anomaly_dataset"
normal_dir = os.path.join(output_dir, "normal")
anomaly_dir = os.path.join(output_dir, "anomaly")

# Create output folders
os.makedirs(normal_dir, exist_ok=True)
os.makedirs(anomaly_dir, exist_ok=True)

# `base_dir` comes from the configuration cell above; wrap it so both the
# str and the Path variant work here.
source_root = Path(base_dir)

# A missing class folder must stay a hard error: glob() on a non-existent
# directory would otherwise just yield nothing.
for cls in [normal_class, *anomaly_classes]:
    if not (source_root / cls).is_dir():
        raise FileNotFoundError(f"Class folder not found: {source_root / cls}")

# Copy normal class images.
# Only *.jpg files are taken, so stray entries such as .ipynb_checkpoints
# folders or .DS_Store files are neither copied nor able to crash shutil.copy.
for src in sorted((source_root / normal_class).glob("*.jpg")):
    shutil.copy(src, os.path.join(normal_dir, src.name))

# Copy anomaly class images; the class-name prefix keeps same-named files
# from different classes apart in the shared folder.
for cls in anomaly_classes:
    for src in sorted((source_root / cls).glob("*.jpg")):
        shutil.copy(src, os.path.join(anomaly_dir, f"{cls}_{src.name}"))

print("Anomaly dataset created!")


Anomaly dataset created!
