In [1]:
# setup
import json
import numpy as np
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split

# defining paths
data_dir = Path("../data/processed")
norm_dir = data_dir / "norm"        # source .npy files (read-only in this notebook)
patches_dir = data_dir / "patches"  # destination for the renamed copies
splits_dir = data_dir / "splits"    # destination for the split JSON files

# Create directories.
# parents=True: don't fail with FileNotFoundError when ../data/processed
# itself has not been created yet; exist_ok=True keeps the cell re-runnable.
patches_dir.mkdir(parents=True, exist_ok=True)
splits_dir.mkdir(parents=True, exist_ok=True)

print(f"Source directory: {norm_dir}")
print(f"Target patches directory: {patches_dir}")
print(f"Target splits directory: {splits_dir}")

Source directory: ..\data\processed\norm
Target patches directory: ..\data\processed\patches
Target splits directory: ..\data\processed\splits


In [2]:
# scanning and categorizing files
print("\n" + "="*60)
print("SCANNING FILES")
print("="*60)

# Path.glob() yields entries in arbitrary (filesystem-dependent) order.
# Sort so that the position of each file -- and therefore the patch index it
# is assigned when the files are renamed -- is identical on every run/machine.
all_files = sorted(norm_dir.glob("*.npy"))
print(f"Total .npy files found: {len(all_files)}")

# separate normal vs flare: anything whose name does not start with
# "normal_" is treated as a flare file (order within each list stays sorted)
normal_files = [f for f in all_files if f.name.startswith("normal_")]
flare_files = [f for f in all_files if not f.name.startswith("normal_")]

print(f"Normal files: {len(normal_files)}")
print(f"Flare files: {len(flare_files)}")

# verify counts match expected ones (warn only; does not stop the notebook)
if len(normal_files) != 2000:
    print(f"[WARNING] Expected 2000 normal files, found {len(normal_files)}")
if len(flare_files) != 300:
    print(f"[WARNING] Expected 300 flare files, found {len(flare_files)}")


SCANNING FILES
Total .npy files found: 2300
Normal files: 2000
Flare files: 300


In [3]:
# rename and move
print("\n" + "="*60)
print("REORGANIZING FILES")
print("="*60)

# new naming scheme: patch_XXXX.npy (sequential)
# Normal files are numbered from 0, flare files from FLARE_INDEX_OFFSET.
# The split-creation cell derives the label purely from this index
# (idx < 2000 => normal), so the offset must stay in sync with it.
FLARE_INDEX_OFFSET = 2000

# Guard against index collisions: with more than FLARE_INDEX_OFFSET normal
# files, patch_2000.npy onwards would first be written from a normal file and
# then silently overwritten by a flare file (and be labelled as a flare).
if len(normal_files) > FLARE_INDEX_OFFSET:
    raise ValueError(
        f"Found {len(normal_files)} normal files, but flare numbering starts at "
        f"{FLARE_INDEX_OFFSET}; patch indices would collide."
    )

file_mapping = {}  # old_name -> new_name

# process normal files first (indices 0..), then flare files (indices 2000..)
for first_index, source_files in ((0, normal_files), (FLARE_INDEX_OFFSET, flare_files)):
    for patch_index, old_path in enumerate(source_files, start=first_index):
        new_name = f"patch_{patch_index:04d}.npy"
        new_path = patches_dir / new_name
        shutil.copy2(old_path, new_path)  # copy instead of move for safety
        file_mapping[old_path.name] = new_name

print(f"Copied {len(file_mapping)} files to {patches_dir}")

# save mapping for reference (original file name -> patch file name)
mapping_path = data_dir / "filename_mapping.json"
with open(mapping_path, "w") as f:
    json.dump(file_mapping, f, indent=2)
print(f"Saved filename mapping to {mapping_path}")


REORGANIZING FILES
Copied 2300 files to ..\data\processed\patches
Saved filename mapping to ..\data\processed\filename_mapping.json


In [4]:
# create split files in JSON serializable format
print("\n" + "="*60)
print("CREATING TRAIN/VAL/TEST SPLITS")
print("="*60)

# load all patch metadata from the renamed files in patches_dir
# - only patch_*.npy is considered, so an unrelated .npy file in the folder
#   cannot break the index parsing below
# - glob() order is arbitrary, and the splits below are positional slices, so
#   sort by the numeric index to make split membership reproducible
indexed_patches = sorted(
    (int(patch_file.stem.split("_")[1]), patch_file)  # number from patch_XXXX.npy
    for patch_file in patches_dir.glob("patch_*.npy")
)

all_patches = [
    {
        "path": f"patches/{patch_file.name}",
        "label": 0 if idx < 2000 else 1,  # first 2000 = normal, rest = flare
    }
    for idx, patch_file in indexed_patches
]

# separate into lists of dictionaries (both JSON serializable)
normal_patches = [p for p in all_patches if p["label"] == 0]
flare_patches = [p for p in all_patches if p["label"] == 1]

print(f"Normal patches: {len(normal_patches)}")
print(f"Flare patches: {len(flare_patches)}")

# anomaly detection split: train/val = normal only, test = normal + flare
train_files = normal_patches[:1000]
val_files = normal_patches[1000:1300]
test_normal = normal_patches[1300:]
# NOTE(review): only the first 200 flare patches are used; with the expected
# 300 flare files the remaining 100 end up in no split -- confirm this is intended.
test_flares = flare_patches[:200]

# both are lists of dicts so its safe to concatenate
test_files = test_normal + test_flares

print(f"\nTrain: {len(train_files)} samples (all normal)")
print(f"Val: {len(val_files)} samples (all normal)")
print(f"Test: {len(test_files)} samples ({len(test_normal)} normal + {len(test_flares)} flare)")

# save splits
def save_split_json(split_name, files):
    """Write one split to ``<split_name>_files.json`` inside ``splits_dir``.

    Args:
        split_name: Prefix of the output file name (e.g. "train").
        files: JSON-serializable list of entries to dump.

    Prints a one-line confirmation with the number of entries written.
    """
    out_path = splits_dir / f"{split_name}_files.json"
    with out_path.open("w") as handle:
        json.dump(files, handle, indent=2)
    print(f"Saved {split_name}_files.json: {len(files)} samples")

# write the three splits, in train -> val -> test order
for split_name, split_entries in (
    ("train", train_files),
    ("val", val_files),
    ("test", test_files),
):
    save_split_json(split_name, split_entries)


CREATING TRAIN/VAL/TEST SPLITS
Normal patches: 2000
Flare patches: 300

Train: 1000 samples (all normal)
Val: 300 samples (all normal)
Test: 900 samples (700 normal + 200 flare)
Saved train_files.json: 1000 samples
Saved val_files.json: 300 samples
Saved test_files.json: 900 samples


In [5]:
# verify Splits
print("\n" + "="*60)
print("VERIFYING SPLITS")
print("="*60)

def verify_split(split_name):
    """Re-read a saved split file and report its size and flare count.

    Args:
        split_name: Split to check; reads ``<split_name>_files.json`` from
            ``splits_dir``.

    Returns:
        The list of ``"label"`` values of the entries in the file, in file
        order, so callers can run further checks without re-reading the file.
    """
    split_file = splits_dir / f"{split_name}_files.json"
    with open(split_file, "r") as f:
        entries = json.load(f)

    labels = [entry["label"] for entry in entries]
    # labels are 0/1, so their sum is the number of flare entries
    print(f"{split_name}: {len(entries)} total, {sum(labels)} flares")
    return labels

train_labels = verify_split("train")
val_labels = verify_split("val")
verify_split("test")

# critical check: reuse the labels loaded above instead of re-opening the
# JSON files with a bare open(), which left the file handles unclosed
if sum(train_labels) == 0 and sum(val_labels) == 0:
    print("\n✓ SUCCESS: Train and Val are pure normal (correct for anomaly detection)")
else:
    print("\n✗ FAILURE: Train/Val contain flares!")


VERIFYING SPLITS
train: 1000 total, 0 flares
val: 300 total, 0 flares
test: 900 total, 200 flares

✓ SUCCESS: Train and Val are pure normal (correct for anomaly detection)


In [6]:
# after verification, you can delete the old norm/ directory to save space