In [7]:
# ============================
# RDD2022 - EDA Script
# ============================

import os
import random
import cv2
import matplotlib.pyplot as plt
import pandas as pd

# ============================
# Dataset path (update if needed)
# ============================
DATASET_DIR = r"C:\Users\risho\Documents\GitHub\RoadSense\rdd2022\train"
IMG_DIR = os.path.join(DATASET_DIR, "images")
LBL_DIR = os.path.join(DATASET_DIR, "labels")

# ============================
# Class mapping for RDD2022
# ============================
CLASS_NAMES = {
    0: "D00 (Longitudinal Crack)",
    1: "D10 (Transverse Crack)",
    2: "D20 (Alligator Crack)",
    3: "D40 (Pothole)"
}

# ============================
# Helper function: Load labels
# ============================
def load_labels(label_path, img_w, img_h):
    """Load YOLO annotations and convert to pixel bounding boxes."""
    boxes = []
    if not os.path.exists(label_path):
        return boxes

    with open(label_path, "r") as f:
        for line in f.readlines():
            cls, x_c, y_c, w, h = map(float, line.strip().split())
            cls = int(cls)

            # Convert from YOLO format to pixel coordinates
            x_c, y_c, w, h = x_c * img_w, y_c * img_h, w * img_w, h * img_h
            x1 = int(x_c - w / 2)
            y1 = int(y_c - h / 2)
            x2 = int(x_c + w / 2)
            y2 = int(y_c + h / 2)

            boxes.append((cls, x1, y1, x2, y2))
    return boxes

# ============================
# Show random samples
# ============================
def show_random_samples(n=5):
    img_files = os.listdir(IMG_DIR)
    chosen = random.sample(img_files, n)

    for file in chosen:
        img_path = os.path.join(IMG_DIR, file)
        lbl_path = os.path.join(LBL_DIR, file.replace(".jpg", ".txt").replace(".png", ".txt"))

        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        h, w, _ = img.shape

        boxes = load_labels(lbl_path, w, h)

        # Draw boxes
        for cls, x1, y1, x2, y2 in boxes:
            color = (255, 0, 0)
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
            cv2.putText(img, CLASS_NAMES[cls], (x1, y1 - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        plt.figure(figsize=(8, 6))
        plt.imshow(img)
        plt.title(f"Image: {file}")
        plt.axis("off")
        plt.show()

# ============================
# Dataset statistics
# ============================
def dataset_stats():
    stats = {cls: 0 for cls in CLASS_NAMES}
    total_labels = 0

    for lbl_file in os.listdir(LBL_DIR):
        with open(os.path.join(LBL_DIR, lbl_file), "r") as f:
            for line in f.readlines():
                cls = int(line.split()[0])
                stats[cls] += 1
                total_labels += 1

    df = pd.DataFrame({
        "Class": [CLASS_NAMES[c] for c in stats.keys()],
        "Count": list(stats.values())
    })
    print(f"Total images: {len(os.listdir(IMG_DIR))}")
    print(f"Total annotations: {total_labels}")
    print(df)

    # Plot distribution
    plt.figure(figsize=(8, 6))
    plt.bar(df["Class"], df["Count"], color="skyblue")
    plt.xticks(rotation=30, ha="right")
    plt.title("Class Distribution in RDD2022")
    plt.ylabel("Count")
    plt.show()


# ============================
# Run EDA
# ============================
if __name__ == "__main__":
    print("Checking dataset paths...")
    print("Images exist:", os.path.exists(IMG_DIR))
    print("Labels exist:", os.path.exists(LBL_DIR))

    dataset_stats()
    show_random_samples(n=5)


Checking dataset paths...
Images exist: False
Labels exist: False


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\risho\\Documents\\GitHub\\RoadSense\\rdd2022\\train\\labels'