In [None]:
# 🔎 Análisis exploratorio de imágenes - Dataset Cáncer de Mama
import os
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import numpy as np
import random

In [None]:
import yaml

# Load the project configuration. The encoding is passed explicitly because
# open() otherwise falls back to the platform locale encoding, which can
# mis-decode non-ASCII characters (e.g. accented path components).
with open("../config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Root directory that the following cells traverse to find the images.
image_root = config["paths"]["absolute_data_path"]

In [None]:
# class_0: benign tissue, class_1: malignant tissue
# Expected layout: <image_root>/<subfolder>/<class_label>/<image file>
counts = {"class_0": 0, "class_1": 0}
folders = os.listdir(image_root)

for folder in folders:
    for class_label in ["0", "1"]:
        class_dir = os.path.join(image_root, folder, class_label)
        # isdir (rather than exists) also skips a stray regular file that
        # happens to be named "0" or "1", which os.listdir could not open.
        if not os.path.isdir(class_dir):
            continue
        # Count only regular, non-hidden files so nested directories and
        # metadata files such as .DS_Store do not inflate the totals.
        with os.scandir(class_dir) as entries:
            n_images = sum(
                1
                for entry in entries
                if entry.is_file() and not entry.name.startswith(".")
            )
        counts[f"class_{class_label}"] += n_images

print("Cantidad de imágenes por clase:")
for k, v in counts.items():
    print(f"{k}: {v}")

# Bar chart of the per-class totals, drawn on an explicit Axes so the labels
# are attached to this figure rather than to pyplot's implicit current one.
fig, ax = plt.subplots()
sns.barplot(x=list(counts.keys()), y=list(counts.values()), ax=ax)
ax.set_title("Distribución de clases (total de imágenes)")
ax.set_xlabel("Clase")
ax.set_ylabel("Número de imágenes")
plt.show()

In [None]:
def mostrar_imagen_aleatoria(clase="0"):
    """Display one randomly chosen image of the given class.

    One random entry is drawn from each ``<image_root>/<folder>/<clase>``
    directory that is not empty, and the displayed image is then drawn at
    random from those per-folder candidates.

    Args:
        clase: Name of the class subdirectory to sample from
            (the notebook calls this with "0" and "1").

    Raises:
        IndexError: If no candidate file exists for ``clase`` under
            ``image_root``.
    """
    muestras = []
    for carpeta in os.listdir(image_root):
        path_clase = os.path.join(image_root, carpeta, clase)
        # isdir instead of exists: a regular file named like the class would
        # otherwise make os.listdir fail.
        if os.path.isdir(path_clase):
            imgs = os.listdir(path_clase)
            if imgs:
                muestras.append(os.path.join(path_clase, random.choice(imgs)))

    if not muestras:
        # Same exception type that random.choice([]) raised before, but with
        # a message that points at the actual cause.
        raise IndexError(
            f"No se encontraron imágenes de la clase {clase!r} en {image_root!r}"
        )

    img_path = random.choice(muestras)
    # The context manager closes the underlying file once the image is drawn.
    with Image.open(img_path) as img:
        plt.imshow(img)
    plt.axis("off")
    plt.title(f"Ejemplo de clase {clase}")
    plt.show()

# Show one random example per class, in label order
for etiqueta in ("0", "1"):
    mostrar_imagen_aleatoria(etiqueta)

In [None]:
from collections import Counter

# Collect the (width, height) and file format of every image Pillow can open.
sizes = []
formats = []
n_unreadable = 0  # entries that could not be opened as images

for carpeta in os.listdir(image_root):
    for clase in ["0", "1"]:
        path = os.path.join(image_root, carpeta, clase)
        # isdir instead of exists: os.listdir below would fail on a regular
        # file that happens to be named "0" or "1".
        if not os.path.isdir(path):
            continue
        for archivo in os.listdir(path):
            try:
                # The context manager closes each file right away instead of
                # leaving one open handle per image in the dataset.
                with Image.open(os.path.join(path, archivo)) as img:
                    sizes.append(img.size)
                    formats.append(img.format)
            except Exception:
                # Best effort: skip entries that cannot be opened, but unlike
                # a bare except, let KeyboardInterrupt/SystemExit propagate.
                n_unreadable += 1

# Sizes
size_counts = Counter(sizes)
print("Tamaños de imágenes más comunes:")
print(size_counts.most_common(5))

# Formats
format_counts = Counter(formats)
print("\nFormatos de imagen:")
print(format_counts)

# Report skipped entries so unreadable files are not dropped silently.
if n_unreadable:
    print(f"\nArchivos omitidos por no poder abrirse: {n_unreadable}")