In [18]:
import os
import torch
from torch.utils.data import DataLoader
import sys
sys.path.append('/net/cremi/sasifchaudhr/espaces/travail/Semantic-Segmentation-for-Autonomous-Driving')
from src.dataset import ProjectDatasets  # Ensure this path matches your project structure
import numpy as np

# Load configuration
import yaml
with open("../experiments/configs/config.yaml", "r") as f:
    config = yaml.safe_load(f)

DATA_PATH = config["paths"]["data"]
BATCH_SIZE = config["hyperparameters"]["batch_size"]
NUM_WORKERS = config["data"]["num_workers"]
SEED = config["hyperparameters"]["seed"]

# Seed the global RNGs before splitting. random_split draws from torch's default
# generator when none is passed, so without a seed the train/val/test membership
# (and every per-split number computed from it) changes on each run.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Initialize dataset
print("Initializing dataset...")
dataset = ProjectDatasets(root_path=DATA_PATH)

# Split dataset: train/val sizes come from the configured fractions (truncated
# to whole samples); the test split takes whatever remains so the three sizes
# always add up to len(dataset).
data_length = len(dataset)
train_length = int(data_length * config["dataset_split"]["train"])
val_length = int(data_length * config["dataset_split"]["val"])
test_length = data_length - train_length - val_length

data_splits = torch.utils.data.random_split(dataset, [train_length, val_length, test_length])
train_dataset, val_dataset, test_dataset = data_splits

# DataLoaders (only the training loader honours the configured shuffle flag)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=config["data"]["shuffle"], num_workers=NUM_WORKERS)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

# Display Dataset Information
print(f"Dataset split: {len(train_dataset)} training samples, {len(val_dataset)} validation samples, {len(test_dataset)} test samples")

# Analyze Dataset
class_names = config["class_names"]
num_classes = len(class_names)

# Initialize counters (one slot per class, shared by all three splits)
pixel_counts = np.zeros(num_classes, dtype=np.int64)
class_presence = np.zeros(num_classes, dtype=np.int64)

def analyze_dataset(loader):
    """Accumulate per-class pixel counts and per-image class presence.

    Updates the module-level ``pixel_counts`` and ``class_presence`` arrays in
    place; nothing is returned.

    Args:
        loader: Iterable of 4-item batches. The second item must expose
            ``.numpy()`` and yield an integer label array whose first axis is
            the sample axis (presumably (batch, H, W) -- confirm against
            ProjectDatasets).

    Notes:
        * Presence is counted once per *image*. Calling ``np.unique`` on the
          whole batch would count once per batch and under-report every class.
        * Labels outside ``[0, num_classes)`` are ignored by both counters.
    """
    global pixel_counts, class_presence
    for batch in loader:
        _, masks, _, _ = batch
        masks = masks.numpy()  # Convert to numpy for easier processing

        for mask in masks:
            labels = mask.ravel()
            # Drop labels that have no slot in the counters before histogramming.
            in_range = (labels >= 0) & (labels < num_classes)
            counts = np.bincount(labels[in_range].astype(np.intp), minlength=num_classes)

            # One histogram gives both statistics: the pixel totals, and
            # (count > 0) tells which classes occur in this image.
            pixel_counts += counts
            class_presence += counts > 0

print("Analyzing training dataset...")
analyze_dataset(train_loader)
print("Analyzing validation dataset...")
analyze_dataset(val_loader)
print("Analyzing test dataset...")
analyze_dataset(test_loader)

# Total number of counted pixels (all classes, all three splits)
total_pixels = np.sum(pixel_counts)

# Class distribution (percentage of total pixels). Guard the division so an
# empty dataset reports 0.00% everywhere instead of NaN plus a RuntimeWarning.
if total_pixels > 0:
    class_distribution = (pixel_counts / total_pixels) * 100
else:
    class_distribution = np.zeros(num_classes)

# Print summary
print("\nDataset Analysis Summary:")
print(f"Total images: {data_length}")
print(f"Classes present: {np.nonzero(class_presence)[0].tolist()}")
print("Pixel count per class:")
for idx, count in enumerate(pixel_counts):
    print(f"  {class_names[idx]}: {count} pixels ({class_distribution[idx]:.2f}%)")

print("\nClass presence across images:")
for idx, presence in enumerate(class_presence):
    print(f"  {class_names[idx]}: Present in {presence} images")

Initializing dataset...
Dataset split: 5936 training samples, 742 validation samples, 743 test samples
Analyzing training dataset...
Analyzing validation dataset...
Analyzing test dataset...

Dataset Analysis Summary:
Total images: 7421
Classes present: [0, 3, 4, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29]
Pixel count per class:
  Unlabeled: 45673310 pixels (9.39%)
  Static: 0 pixels (0.00%)
  Dynamic: 0 pixels (0.00%)
  Ground: 120385177 pixels (24.75%)
  Road: 7639300 pixels (1.57%)
  Sidewalk: 0 pixels (0.00%)
  Parking: 0 pixels (0.00%)
  Rail track: 87507849 pixels (17.99%)
  Building: 3042954 pixels (0.63%)
  Wall: 5054439 pixels (1.04%)
  Fence: 1 pixels (0.00%)
  Guard rail: 1 pixels (0.00%)
  Bridge: 0 pixels (0.00%)
  Tunnel: 1737313 pixels (0.36%)
  Pole: 383637 pixels (0.08%)
  Pole group: 1901335 pixels (0.39%)
  Traffic light: 76484860 pixels (15.73%)
  Traffic sign: 6979597 pixels (1.44%)
  Vegetation: 103826098 pixels (21.35%)
  Terr

In [20]:
import os
import torch
from torch.utils.data import DataLoader
from src.dataset import ProjectDatasets  # Ensure this path matches your project structure
import numpy as np

# Load configuration
import yaml
with open("../experiments/configs/config.yaml", "r") as f:
    config = yaml.safe_load(f)

DATA_PATH = config["paths"]["data"]
BATCH_SIZE = config["hyperparameters"]["batch_size"]
NUM_WORKERS = config["data"]["num_workers"]
SEED = config["hyperparameters"]["seed"]

# Fix the global torch / numpy RNG state so the random split below is repeatable.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Initialize dataset
print("Initializing dataset...")
dataset = ProjectDatasets(root_path=DATA_PATH)

# Split dataset: train and val sizes are the configured fractions truncated to
# whole samples; test receives the remainder.
data_length = len(dataset)
train_length, val_length = (
    int(data_length * config["dataset_split"][part]) for part in ("train", "val")
)
test_length = data_length - train_length - val_length

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_length, val_length, test_length]
)

# DataLoaders (shuffling is configurable for training only)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=config["data"]["shuffle"],
    num_workers=NUM_WORKERS,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

# Display Dataset Information
print(f"Dataset split: {len(train_dataset)} training samples, {len(val_dataset)} validation samples, {len(test_dataset)} test samples")

# Analyze Dataset
class_names = config["class_names"]
num_classes = len(class_names)

# Initialize counters: statistics[split][condition] holds two independent
# zero-filled int64 arrays with one slot per class.
statistics = {
    split: {
        condition: {
            "pixel_counts": np.zeros(num_classes, dtype=np.int64),
            "class_presence": np.zeros(num_classes, dtype=np.int64),
        }
        for condition in ("sunny", "rainy")
    }
    for split in ("train", "val", "test")
}

def analyze_dataset(loader, dataset_type):
    """Accumulate per-class statistics for one split, bucketed by condition.

    Updates ``statistics[dataset_type][condition]`` in place, where
    ``condition`` is the lower-cased source string of each sample. Nothing is
    returned.

    Args:
        loader: Iterable of 4-item batches. The second item must expose
            ``.numpy()`` and yield an integer label array indexed by sample on
            its first axis; the third item is a sequence of per-sample source
            strings (presumably "sunny"/"rainy" in some casing -- confirm
            against ProjectDatasets).
        dataset_type: Key into the module-level ``statistics`` dict
            ("train", "val" or "test").

    Notes:
        * Samples whose source has no entry in ``statistics[dataset_type]``
          are skipped.
        * Labels outside ``[0, num_classes)`` are ignored by both counters.
    """
    for batch in loader:
        _, masks, sources, _ = batch
        masks = masks.numpy()  # Convert to numpy for easier processing

        for i, src in enumerate(sources):
            condition_stats = statistics[dataset_type].get(src.lower())
            if condition_stats is None:
                # Not one of the tracked conditions; leave it out of the stats.
                continue

            labels = masks[i].ravel()
            # Drop labels that have no slot in the counters before histogramming.
            in_range = (labels >= 0) & (labels < num_classes)
            # A single histogram pass replaces one full-mask comparison per class.
            counts = np.bincount(labels[in_range].astype(np.intp), minlength=num_classes)

            condition_stats["pixel_counts"] += counts
            # A class is present in this image exactly when its count is non-zero.
            condition_stats["class_presence"] += counts > 0

# Run the analysis over each split in turn, announcing each pass first.
for progress_message, split_loader, split_key in (
    ("Analyzing training dataset...", train_loader, "train"),
    ("Analyzing validation dataset...", val_loader, "val"),
    ("Analyzing test dataset...", test_loader, "test"),
):
    print(progress_message)
    analyze_dataset(split_loader, split_key)

# Summarize statistics
def summarize_statistics(stats, category):
    """Print pixel counts and class presence for one split, per condition.

    Args:
        stats: Mapping with "sunny" and "rainy" entries, each holding
            "pixel_counts" and "class_presence" sequences indexed by class.
        category: Name of the split; it is capitalized for the heading.

    Reads the module-level ``class_names`` and ``num_classes``. Returns None.
    """
    print(f"\n{category.capitalize()} Dataset Analysis Summary:")

    for condition in ("sunny", "rainy"):
        print(f"\n  Condition: {condition.capitalize()}")

        per_class_pixels = stats[condition]["pixel_counts"]
        total_pixels = np.sum(per_class_pixels)
        # With no pixels counted, report 0% for every class rather than dividing by zero.
        if total_pixels > 0:
            class_distribution = (per_class_pixels / total_pixels) * 100
        else:
            class_distribution = np.zeros(num_classes)

        print("  Pixel count per class:")
        for idx, count in enumerate(per_class_pixels):
            print(f"    {class_names[idx]}: {count} pixels ({class_distribution[idx]:.2f}%)")

        print("  Class presence across images:")
        per_class_presence = stats[condition]["class_presence"]
        for idx, presence in enumerate(per_class_presence):
            print(f"    {class_names[idx]}: Present in {presence} images")

# Print one summary per split; each is preceded by its own section heading.
for section_heading, split_key, split_label in (
    ("\nTraining Dataset Analysis:", "train", "training"),
    ("\nValidation Dataset Analysis:", "val", "validation"),
    ("\nTest Dataset Analysis:", "test", "test"),
):
    print(section_heading)
    summarize_statistics(statistics[split_key], split_label)


Initializing dataset...
Dataset split: 5936 training samples, 742 validation samples, 743 test samples
Analyzing training dataset...
Analyzing validation dataset...
Analyzing test dataset...

Training Dataset Analysis:

Training Dataset Analysis Summary:

  Condition: Sunny
  Pixel count per class:
    Unlabeled: 16241374 pixels (8.24%)
    Static: 0 pixels (0.00%)
    Dynamic: 0 pixels (0.00%)
    Ground: 49933240 pixels (25.33%)
    Road: 3872136 pixels (1.96%)
    Sidewalk: 0 pixels (0.00%)
    Parking: 0 pixels (0.00%)
    Rail track: 6945105 pixels (3.52%)
    Building: 1241694 pixels (0.63%)
    Wall: 2143578 pixels (1.09%)
    Fence: 0 pixels (0.00%)
    Guard rail: 1 pixels (0.00%)
    Bridge: 0 pixels (0.00%)
    Tunnel: 414028 pixels (0.21%)
    Pole: 200825 pixels (0.10%)
    Pole group: 347781 pixels (0.18%)
    Traffic light: 40889517 pixels (20.74%)
    Traffic sign: 4682812 pixels (2.38%)
    Vegetation: 64138014 pixels (32.54%)
    Terrain: 261897 pixels (0.13%)
    Sky

In [23]:
# Summarize statistics into a table
import pandas as pd
def create_statistics_table(stats):
    """Build one wide per-class table of pixel counts and class presence.

    Args:
        stats: Nested mapping ``stats[split][condition][metric]`` with splits
            "train"/"val"/"test", conditions "sunny"/"rainy" and metrics
            "pixel_counts"/"class_presence", each indexable by class index.

    Returns:
        A ``pandas.DataFrame`` with one row per entry of the module-level
        ``class_names``: the class name, the two totals, then the twelve
        per-split / per-condition columns.
    """
    # (column name, split, condition, metric), in the order the columns appear.
    column_spec = (
        ("Train Sunny Pixel Count", "train", "sunny", "pixel_counts"),
        ("Train Sunny Class Presence", "train", "sunny", "class_presence"),
        ("Train Rainy Pixel Count", "train", "rainy", "pixel_counts"),
        ("Train Rainy Class Presence", "train", "rainy", "class_presence"),
        ("Val Sunny Pixel Count", "val", "sunny", "pixel_counts"),
        ("Val Sunny Class Presence", "val", "sunny", "class_presence"),
        ("Val Rainy Pixel Count", "val", "rainy", "pixel_counts"),
        ("Val Rainy Class Presence", "val", "rainy", "class_presence"),
        ("Test Sunny Pixel Count", "test", "sunny", "pixel_counts"),
        ("Test Sunny Class Presence", "test", "sunny", "class_presence"),
        ("Test Rainy Pixel Count", "test", "rainy", "pixel_counts"),
        ("Test Rainy Class Presence", "test", "rainy", "class_presence"),
    )
    total_column = {
        "pixel_counts": "Total Pixel Count",
        "class_presence": "Total Class Presence",
    }

    records = []
    for class_idx, class_name in enumerate(class_names):
        # The totals are inserted first so they sit right after "Class".
        record = {"Class": class_name, "Total Pixel Count": 0, "Total Class Presence": 0}
        running = {"pixel_counts": 0, "class_presence": 0}

        for column, split, condition, metric in column_spec:
            value = stats[split][condition][metric][class_idx]
            record[column] = value
            running[metric] = running[metric] + value

        for metric, column in total_column.items():
            record[column] = running[metric]

        records.append(record)

    return pd.DataFrame(records)


# Rich notebook rendering for the DataFrame
from IPython.display import display

# Build the combined per-class table from the accumulated statistics, keep it
# under a name for later use, and show it.
statistics_table = create_statistics_table(statistics)
display(statistics_table)

Unnamed: 0,Class,Total Pixel Count,Total Class Presence,Train Sunny Pixel Count,Train Sunny Class Presence,Train Rainy Pixel Count,Train Rainy Class Presence,Val Sunny Pixel Count,Val Sunny Class Presence,Val Rainy Pixel Count,Val Rainy Class Presence,Test Sunny Pixel Count,Test Sunny Class Presence,Test Rainy Pixel Count,Test Rainy Class Presence
0,Unlabeled,45673310,7421,16241374,3008,20350854,2928,2130826,394,2412829,348,2016293,377,2521134,366
1,Static,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Dynamic,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Ground,120385177,7421,49933240,3008,46321954,2928,6548338,394,5488690,348,6241276,377,5851679,366
4,Road,7639300,6746,3872136,2526,2203493,2883,507139,332,266118,338,507269,311,283145,356
5,Sidewalk,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Parking,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,Rail track,87507849,7184,6945105,2823,63393230,2927,872871,371,7535108,348,823744,350,7937791,365
8,Building,3042954,4359,1241694,2011,1178940,1501,160294,259,162901,165,150568,242,148557,181
9,Wall,5054439,4834,2143578,2197,1816096,1660,318028,284,233280,199,266479,268,276978,226


In [25]:
# Create Image Count Table
def create_image_count_table(stats):
    """Build a per-class table of how many images contain each class.

    Args:
        stats: Nested mapping ``stats[split][condition]["class_presence"]``
            with splits "train"/"val"/"test" and conditions "sunny"/"rainy",
            each indexable by class index.

    Returns:
        A ``pandas.DataFrame`` with one row per entry of the module-level
        ``class_names``: the class name, six per-split / per-condition image
        counts, and their sum in "Total Image Count".
    """
    # (column name, split, condition), in the order the columns appear.
    presence_columns = (
        ("Train Sunny Image Count", "train", "sunny"),
        ("Train Rainy Image Count", "train", "rainy"),
        ("Val Sunny Image Count", "val", "sunny"),
        ("Val Rainy Image Count", "val", "rainy"),
        ("Test Sunny Image Count", "test", "sunny"),
        ("Test Rainy Image Count", "test", "rainy"),
    )

    records = []
    for class_idx, class_name in enumerate(class_names):
        record = {"Class": class_name}
        running_total = 0
        for column, split, condition in presence_columns:
            record[column] = stats[split][condition]["class_presence"][class_idx]
            running_total = running_total + record[column]
        record["Total Image Count"] = running_total
        records.append(record)

    return pd.DataFrame(records)

# Create Pixel Count Table
def create_pixel_count_table(stats):
    """Build a per-class table of pixel counts per split and condition.

    Args:
        stats: Nested mapping ``stats[split][condition]["pixel_counts"]`` with
            splits "train"/"val"/"test" and conditions "sunny"/"rainy", each
            indexable by class index.

    Returns:
        A ``pandas.DataFrame`` with one row per entry of the module-level
        ``class_names``: the class name, six per-split / per-condition pixel
        counts, and their sum in "Total Pixel Count".
    """
    # (column name, split, condition), in the order the columns appear.
    pixel_columns = (
        ("Train Sunny Pixel Count", "train", "sunny"),
        ("Train Rainy Pixel Count", "train", "rainy"),
        ("Val Sunny Pixel Count", "val", "sunny"),
        ("Val Rainy Pixel Count", "val", "rainy"),
        ("Test Sunny Pixel Count", "test", "sunny"),
        ("Test Rainy Pixel Count", "test", "rainy"),
    )

    records = []
    for class_idx, class_name in enumerate(class_names):
        record = {"Class": class_name}
        running_total = 0
        for column, split, condition in pixel_columns:
            record[column] = stats[split][condition]["pixel_counts"][class_idx]
            running_total = running_total + record[column]
        record["Total Pixel Count"] = running_total
        records.append(record)

    return pd.DataFrame(records)

# Build both per-class tables from the accumulated statistics. Nothing is
# rendered here; the tables are shown with display(...) in the following cells.
# (Leftover commented-out calls to `tools.display_dataframe_to_user` were dropped:
# no `tools` object is defined in the visible cells.)
image_count_table, pixel_count_table = (
    create_image_count_table(statistics),
    create_pixel_count_table(statistics),
)


In [26]:
# Render the per-class image-count table; it is rebuilt from `statistics` each
# time this cell runs, so it always reflects the current counters.
display(create_image_count_table(statistics))

Unnamed: 0,Class,Train Sunny Image Count,Train Rainy Image Count,Val Sunny Image Count,Val Rainy Image Count,Test Sunny Image Count,Test Rainy Image Count,Total Image Count
0,Unlabeled,3008,2928,394,348,377,366,7421
1,Static,0,0,0,0,0,0,0
2,Dynamic,0,0,0,0,0,0,0
3,Ground,3008,2928,394,348,377,366,7421
4,Road,2526,2883,332,338,311,356,6746
5,Sidewalk,0,0,0,0,0,0,0
6,Parking,0,0,0,0,0,0,0
7,Rail track,2823,2927,371,348,350,365,7184
8,Building,2011,1501,259,165,242,181,4359
9,Wall,2197,1660,284,199,268,226,4834


In [28]:
# Render the per-class pixel-count table; it is rebuilt from `statistics` each
# time this cell runs, so it always reflects the current counters.
display(create_pixel_count_table(statistics))

Unnamed: 0,Class,Train Sunny Pixel Count,Train Rainy Pixel Count,Val Sunny Pixel Count,Val Rainy Pixel Count,Test Sunny Pixel Count,Test Rainy Pixel Count,Total Pixel Count
0,Unlabeled,16241374,20350854,2130826,2412829,2016293,2521134,45673310
1,Static,0,0,0,0,0,0,0
2,Dynamic,0,0,0,0,0,0,0
3,Ground,49933240,46321954,6548338,5488690,6241276,5851679,120385177
4,Road,3872136,2203493,507139,266118,507269,283145,7639300
5,Sidewalk,0,0,0,0,0,0,0
6,Parking,0,0,0,0,0,0,0
7,Rail track,6945105,63393230,872871,7535108,823744,7937791,87507849
8,Building,1241694,1178940,160294,162901,150568,148557,3042954
9,Wall,2143578,1816096,318028,233280,266479,276978,5054439
