Reduccion de Dimensionalidad

In [6]:
import os
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Rutas
PREPROCESSED_PATH = r"C:\DAVID\CS\2025 0\machine_learning\Proyecto3\P3_ML_GP6\pre_processing"
LABELS_FILE = r"C:\DAVID\CS\2025 0\machine_learning\Data_Proyect3\ISIC2018_Task3_Training_GroundTruth\ISIC2018_Task3_Training_GroundTruth\ISIC2018_Task3_Training_GroundTruth.csv"
OUTPUT_PATH = r"C:\DAVID\CS\2025 0\machine_learning\Proyecto3\P3_ML_GP6\reduced_data"

# Crear carpeta si no existe
os.makedirs(OUTPUT_PATH, exist_ok=True)

print("📥 Cargando etiquetas...")
df = pd.read_csv(LABELS_FILE, sep=';')
df['diagnosis'] = df.iloc[:, 1:].idxmax(axis=1)

# Configurar tamaño de lotes
batch_size_pca = 250  # Reducimos el tamaño del lote para PCA
batch_size_lda = 100  # Más pequeño para LDA
num_batches_pca = len(df) // batch_size_pca + 1
num_batches_lda = len(df) // batch_size_lda + 1

# Definir IncrementalPCA y LDA
pca = IncrementalPCA(n_components=50, batch_size=batch_size_pca)
lda = LDA(n_components=6)

print("⚙️ Aplicando PCA en lotes con `IncrementalPCA`...")

# 🔹 1. Entrenar PCA en lotes pequeños sin sobrecargar la RAM
for i in range(num_batches_pca):
    print(f"📦 Procesando PCA - Lote {i+1}/{num_batches_pca}...")

    X_batch = []
    y_batch = []

    for _, row in tqdm(df.iloc[i * batch_size_pca: (i + 1) * batch_size_pca].iterrows(), total=batch_size_pca):
        image_name = row['image'] + ".npy"
        image_path = os.path.join(PREPROCESSED_PATH, image_name)

        if os.path.exists(image_path):
            img = np.load(image_path).flatten()
            X_batch.append(img)
            y_batch.append(row['diagnosis'])

    if len(X_batch) == 0:
        continue

    X_batch = np.array(X_batch, dtype=np.float16)  # Convertimos a float16 para ahorrar memoria
    y_batch = np.array(y_batch)

    print("🔹 Ajustando PCA con este lote...")
    pca.partial_fit(X_batch)  # Usa `partial_fit()` en lugar de `fit()`

print("✅ PCA entrenado, guardando componentes...")
np.save(os.path.join(OUTPUT_PATH, "pca_components.npy"), pca.components_)

# 🔹 Aplicar transformación PCA en lotes
print("⚙️ Aplicando transformación PCA...")
for i in range(num_batches_pca):
    print(f"📦 Transformando PCA - Lote {i+1}/{num_batches_pca}...")

    X_batch = []
    for _, row in tqdm(df.iloc[i * batch_size_pca: (i + 1) * batch_size_pca].iterrows(), total=batch_size_pca):
        image_name = row['image'] + ".npy"
        image_path = os.path.join(PREPROCESSED_PATH, image_name)

        if os.path.exists(image_path):
            img = np.load(image_path).flatten()
            X_batch.append(img)

    if len(X_batch) == 0:
        continue

    X_batch = np.array(X_batch, dtype=np.float16)

    print("🔹 Aplicando transformación PCA...")
    X_pca_batch = pca.transform(X_batch)
    np.save(os.path.join(OUTPUT_PATH, f"pca_reduced_batch_{i}.npy"), X_pca_batch)

    print(f"✅ Lote {i+1}/{num_batches_pca} procesado y guardado para PCA.")

# 🔹 2. Procesar LDA en lotes más pequeños
print("⚙️ Aplicando LDA en lotes...")
first_batch_lda = True
for i in range(num_batches_lda):
    print(f"📦 Procesando LDA - Lote {i+1}/{num_batches_lda}...")

    X_batch = []
    y_batch = []

    for _, row in tqdm(df.iloc[i * batch_size_lda: (i + 1) * batch_size_lda].iterrows(), total=batch_size_lda):
        image_name = row['image'] + ".npy"
        image_path = os.path.join(PREPROCESSED_PATH, image_name)

        if os.path.exists(image_path):
            img = np.load(image_path).flatten()
            X_batch.append(img)
            y_batch.append(row['diagnosis'])

    if len(X_batch) == 0:
        continue

    X_batch = np.array(X_batch, dtype=np.float16)
    y_batch = np.array(y_batch)

    if first_batch_lda:
        print("🔹 Entrenando LDA con primer lote...")
        lda.fit(X_batch, y_batch)
        first_batch_lda = False

    print("🔹 Aplicando transformación LDA...")
    X_lda_batch = lda.transform(X_batch)
    np.save(os.path.join(OUTPUT_PATH, f"lda_reduced_batch_{i}.npy"), X_lda_batch)

    print(f"✅ Lote {i+1}/{num_batches_lda} procesado y guardado para LDA.")

print("🎉 Reducción de dimensionalidad completada.")


📥 Cargando etiquetas...
⚙️ Aplicando PCA en lotes con `IncrementalPCA`...
📦 Procesando PCA - Lote 1/41...


 49%|████▉     | 123/250 [00:00<00:00, 148.44it/s]


MemoryError: Unable to allocate 3.09 MiB for an array with shape (810000,) and data type float32