In [None]:
import json
import os
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from dotenv import load_dotenv
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
# Apply seaborn's "whitegrid" style globally so the figures below share one look
sns.set(style="whitegrid")

In [None]:
# Load paths from .env
load_dotenv()

# Fail fast with a readable message: os.getenv() returns None for an unset
# variable, which would otherwise surface as an opaque TypeError from
# os.makedirs() here or from open() when the annotations are loaded.
_missing_env = [name for name in ('ANNOTATION_CLEANED', 'OUTPUT_DIR') if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}. "
        "Define them in the .env file before running this notebook."
    )

DATASET_DIR = os.getenv('DATASET_CLEANED')  # NOTE(review): not referenced in the visible cells; left optional
ANNOTATION_JSON = os.getenv('ANNOTATION_CLEANED')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Files the stratified split is written to
TRAIN_JSON = os.path.join(OUTPUT_DIR, 'train_split.json')
VAL_JSON = os.path.join(OUTPUT_DIR, 'val_split.json')

VAL_SPLIT = 0.2        # fraction of images assigned to the validation split
TRUST_THRESHOLD = 0.5  # NOTE(review): not referenced in the visible cells


In [None]:
# Read the annotation file and parse it into a dict keyed by image id
annotations = json.loads(Path(ANNOTATION_JSON).read_text())

print(f"Loaded {len(annotations)} images from JSON.")


In [None]:
# Prepare multi-label matrix
# Sorted list of every part name that occurs in at least one image
# (also used later when printing per-part distributions).
all_parts = sorted({
    part_name
    for img_data in annotations.values()
    for part_name in img_data['parts']
})

img_ids = list(annotations.keys())
X = np.array(img_ids)  # dummy X (image IDs); only its length/order matters for the split

# One indicator column per observed (part, state) pair.
# Storing the raw state code in a single column per part has two problems:
#   * a part that is not annotated at all is indistinguishable from state 0;
#   * the multilabel stratifier expects a binary indicator matrix, so distinct
#     non-zero codes are not balanced separately
#     (NOTE(review): iterstrat appears to cast y to bool - confirm against its source).
# The sort key stringifies the state so mixed/odd state types cannot break ordering.
label_pairs = sorted(
    {
        (part_name, part['object_state_class'])
        for img_data in annotations.values()
        for part_name, part in img_data['parts'].items()
    },
    key=lambda pair: (pair[0], str(pair[1])),
)
pair_to_col = {pair: col for col, pair in enumerate(label_pairs)}

# y[i, c] == 1  <=>  image i has the (part, state) combination of column c
y = np.zeros((len(img_ids), len(label_pairs)), dtype=int)
for i, img_id in enumerate(img_ids):
    for part_name, part in annotations[img_id]['parts'].items():
        y[i, pair_to_col[(part_name, part['object_state_class'])]] = 1


In [None]:
# Stratified shuffle split; the fixed random_state keeps the partition reproducible
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=VAL_SPLIT, random_state=42)

# n_splits=1, so take the single (train, val) index pair the splitter produces
train_idx, val_idx = next(msss.split(X, y))
train_ids, val_ids = X[train_idx], X[val_idx]

# Re-attach the full annotation record to every image id of each split
train_annotations = {img_id: annotations[img_id] for img_id in train_ids}
val_annotations = {img_id: annotations[img_id] for img_id in val_ids}

print(f"Train images: {len(train_annotations)}")
print(f"Validation images: {len(val_annotations)}")


In [None]:
# Write each split to its own pretty-printed JSON file
for split_path, split_annotations in ((TRAIN_JSON, train_annotations), (VAL_JSON, val_annotations)):
    with open(split_path, 'w') as f:
        json.dump(split_annotations, f, indent=2)

print(f"Saved train/validation splits in {OUTPUT_DIR}")


In [None]:
# Check distribution for each part
def part_distribution(data, parts=None):
    """Count how often each object-state class occurs for every part.

    Args:
        data: Mapping of image id -> annotation record. Each record is read as
            ``record['parts'][part]['object_state_class']``.
        parts: Optional iterable of part names to report on. When omitted, the
            notebook-level ``all_parts`` list is used (the original behaviour),
            so existing one-argument calls keep working.

    Returns:
        dict: ``{part: {state_class: count}}``. A part that never occurs in
        ``data`` maps to an empty dict.
    """
    if parts is None:
        # Backward-compatible default: fall back to the notebook-level list
        parts = all_parts

    dist = {}
    for part in parts:
        counts = {}
        for img in data.values():
            img_parts = img['parts']
            if part in img_parts:
                cls = img_parts[part]['object_state_class']
                counts[cls] = counts.get(cls, 0) + 1
        dist[part] = counts
    return dist

# Print the per-part class counts of both splits, train first
for heading, split_annotations in (
    ("Train distribution per part:", train_annotations),
    ("\nValidation distribution per part:", val_annotations),
):
    print(heading)
    print(part_distribution(split_annotations))


In [None]:
load_dotenv()

# Read paths
# NOTE(review): this rebinds TRAIN_JSON / VAL_JSON from the environment; confirm
# they point at the files written by the split step above.
TRAIN_JSON = os.getenv("TRAIN_JSON")
VAL_JSON = os.getenv("VAL_JSON")

# os.getenv() returns None for an unset variable; stop with a clear message
# instead of letting open(None) raise an opaque TypeError below.
_missing_env = [
    name
    for name, value in (("TRAIN_JSON", TRAIN_JSON), ("VAL_JSON", VAL_JSON))
    if not value
]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}. "
        "Define them in the .env file before running this cell."
    )

# Verify
print(f"Train JSON path: {TRAIN_JSON}")
print(f"Validation JSON path: {VAL_JSON}")

with open(TRAIN_JSON, 'r') as f:
    train_data = json.load(f)

with open(VAL_JSON, 'r') as f:
    val_data = json.load(f)

In [None]:
# Object state labels
state_labels = {0: "intact", 1: "damaged", 2: "absent", 3: "occluded"}

def overall_class_counts(data):
    """Tally object-state classes over every part of every image.

    Returns a dict with one entry per key of ``state_labels`` (zero when the
    class never occurs). A class code missing from ``state_labels`` raises
    ``KeyError``.
    """
    tally = dict.fromkeys(state_labels, 0)
    for annotation in data.values():
        for part_info in annotation['parts'].values():
            tally[part_info['object_state_class']] += 1
    return tally

# Class tallies for both splits (train is evaluated first)
train_counts, val_counts = (overall_class_counts(split) for split in (train_data, val_data))

# Convert counts to percentages
def counts_to_pct(counts):
    """Convert ``{class_code: count}`` into ``{class_label: percentage}``.

    Class codes are translated through the notebook-level ``state_labels``
    mapping. When all counts are zero (e.g. an empty split) every class is
    reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    total = sum(counts.values())
    if total == 0:
        # Nothing was counted: there is no meaningful share, so report 0% everywhere
        return {state_labels[k]: 0.0 for k in counts}
    return {state_labels[k]: v / total * 100 for k, v in counts.items()}

# Percentage view of the class tallies for both splits
train_pct, val_pct = counts_to_pct(train_counts), counts_to_pct(val_counts)

print("Train percentages:", train_pct)
print("Validation percentages:", val_pct)


In [None]:
# Grouped bar chart: overall class share in train vs. validation
labels = list(train_pct)
train_vals = [train_pct[name] for name in labels]
val_vals = [val_pct[name] for name in labels]

x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots(figsize=(8,5))
# Train bars sit half a bar-width left of each tick, validation bars half a width right
for shift, series, legend_name in ((-width/2, train_vals, "Train"), (width/2, val_vals, "Validation")):
    ax.bar(x + shift, series, width, label=legend_name)

ax.set(ylabel="Percentage (%)", title="Overall Class Distribution (all parts)")
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.show()


In [None]:
# Object state labels
state_labels = {0: "intact", 1: "damaged", 2: "absent", 3: "occluded"}

def part_class_percentages(data):
    """Return a dict: part -> {class_label: percentage}

    Only parts that occur at least once in ``data`` are included; within a
    part, every label of ``state_labels`` gets an entry (0.0 when unseen).
    """
    tallies = {}
    for annotation in data.values():
        for part_name, part_info in annotation['parts'].items():
            state = part_info['object_state_class']
            # First sighting of a part starts it with a zero count for every class
            per_class = tallies.setdefault(part_name, dict.fromkeys(state_labels, 0))
            per_class[state] += 1

    def as_percentages(per_class):
        # Every tallied part has at least one observation, so n_obs is never zero
        n_obs = sum(per_class.values())
        return {label: per_class[code] / n_obs * 100 for code, label in state_labels.items()}

    return {part_name: as_percentages(per_class) for part_name, per_class in tallies.items()}

# Per-part class percentages for both splits
train_part_pct, val_part_pct = part_class_percentages(train_data), part_class_percentages(val_data)


In [None]:
# One small grouped bar chart per part, laid out on a 4-column grid
parts = list(train_part_pct)
num_parts = len(parts)
cols = 4
rows = int(np.ceil(num_parts / cols))

fig, axes = plt.subplots(rows, cols, figsize=(18, rows*3))
axes = axes.flatten()

width = 0.35
for i, (ax, part) in enumerate(zip(axes, parts)):
    labels = list(train_part_pct[part])
    train_vals = [train_part_pct[part][name] for name in labels]
    # A part absent from the validation split is drawn with all-zero bars
    val_lookup = val_part_pct.get(part, dict.fromkeys(labels, 0))
    val_vals = [val_lookup[name] for name in labels]

    x = np.arange(len(labels))

    ax.bar(x - width/2, train_vals, width, label='Train')
    ax.bar(x + width/2, val_vals, width, label='Validation')
    ax.set_title(part)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=45)
    ax.set_ylim(0, 100)
    if i == 0:
        # A single legend on the first panel is enough for the whole grid
        ax.legend()

# Remove empty subplots
for unused_ax in axes[num_parts:]:
    fig.delaxes(unused_ax)

fig.tight_layout()
plt.show()


In [None]:

load_dotenv()

# os.getenv() returns None for an unset variable and Path(None) raises an
# opaque TypeError; check everything up front and name what is missing.
_required_env = ("DATASET_CLEANED", "TRAIN_JSON", "VAL_JSON", "TRAIN_CLEANED", "VAL_CLEANED")
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}. "
        "Define them in the .env file before running this cell."
    )

# Paths
DATASET_CLEANED = Path(os.getenv("DATASET_CLEANED"))  # source images
TRAIN_JSON = Path(os.getenv("TRAIN_JSON"))            # train split annotations
VAL_JSON = Path(os.getenv("VAL_JSON"))                # validation split annotations
TRAIN_CLEANED = Path(os.getenv("TRAIN_CLEANED"))      # destination for train images
VAL_CLEANED = Path(os.getenv("VAL_CLEANED"))          # destination for validation images

# Make sure output folders exist
TRAIN_CLEANED.mkdir(parents=True, exist_ok=True)
VAL_CLEANED.mkdir(parents=True, exist_ok=True)

print(f"Dataset folder: {DATASET_CLEANED}")
print(f"Train JSON: {TRAIN_JSON}")
print(f"Val JSON: {VAL_JSON}")
print(f"Train folder: {TRAIN_CLEANED}")
print(f"Val folder: {VAL_CLEANED}")


In [None]:
# Load train/val JSON
train_data = json.loads(Path(TRAIN_JSON).read_text())
val_data = json.loads(Path(VAL_JSON).read_text())

print(f"Number of train images: {len(train_data)}")
print(f"Number of val images: {len(val_data)}")


In [None]:
def copy_images(image_keys, src_folder, dst_folder):
    """Copy the named images from ``src_folder`` into ``dst_folder``.

    Args:
        image_keys: Iterable of image file names (relative to ``src_folder``).
            Any iterable works, including dict key views and generators.
        src_folder: Folder containing the source images (``str`` or ``Path``).
        dst_folder: Folder the images are copied into (``str`` or ``Path``).
            It, and any sub-folder implied by a key, is created on demand.

    Keys with no matching file in ``src_folder`` are reported and skipped.
    Keys whose source and destination resolve to the same file are skipped
    silently. A summary line with the number of copied files is printed.
    """
    src_folder, dst_folder = Path(src_folder), Path(dst_folder)
    image_keys = list(image_keys)  # materialise once so len() works for any iterable

    copied = 0
    for img_name in image_keys:
        src_path = src_folder / img_name
        dst_path = dst_folder / img_name
        if not src_path.is_file():
            # Also covers keys that resolve to a directory, which shutil.copy cannot handle
            print(f"⚠️ Missing image: {img_name}")
            continue
        if src_path.resolve() == dst_path.resolve():
            # Same file, skip
            continue
        # Keys may contain sub-folders, and the destination may not exist yet
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(src_path, dst_path)
        copied += 1
    print(f"✅ Copied {copied}/{len(image_keys)} images to {dst_folder}")


In [None]:
from pathlib import Path
import shutil

# Copy train and val images into their split folders.
# copy_images() is defined in the previous cell; re-defining it here would only
# shadow that definition. The three folders are already Path objects, and the
# two destination folders already exist, from the path-setup cell above.
copy_images(train_data.keys(), DATASET_CLEANED, TRAIN_CLEANED)
copy_images(val_data.keys(), DATASET_CLEANED, VAL_CLEANED)
