In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
import json

# Make sure the sompy library files (sompy_isom.py, codebook.py, etc.)
# are inside the `folder/` package directory next to this notebook,
# since they are imported below as `folder.sompy_isom`.
from folder.sompy_isom import SOMFactory

In [2]:


# ===================================================================
# CELL 1: Helper Functions (from your previous notebook and this analysis)
# ===================================================================

def calculate_variable_importance(data):
    """Fit a full PCA on ``data`` and score each input variable.

    Every principal component is weighted by its explained-variance ratio
    (re-normalized to sum to 1); a variable's score is the weighted sum of
    the absolute values of its loadings, and the scores are finally rescaled
    so that they sum to 1.

    Args:
        data: 2-D array-like exposing ``.shape``; columns are the variables.

    Returns:
        Tuple ``(importance, explained)``: the normalized per-variable
        importance and the PCA explained-variance ratios.
    """
    model = PCA().fit(data)
    variance_ratio = model.explained_variance_ratio_
    # Rows of the transposed component matrix are variables, columns are components.
    abs_loadings = np.abs(model.components_.T)

    n_features = data.shape[1]
    component_weights = (variance_ratio / np.sum(variance_ratio))[:n_features]

    raw_scores = (component_weights * abs_loadings[:, :n_features]).sum(axis=1)
    return raw_scores / np.sum(raw_scores), variance_ratio

def _standardize_columns(values):
    """Z-score each column; zero-variance columns are centered but not scaled."""
    column_std = values.std(axis=0)
    # A constant column has std == 0; dividing by it would yield NaN and make
    # the subsequent PCA fit fail, so such columns are divided by 1 instead.
    safe_std = np.where(column_std == 0, 1.0, column_std)
    return (values - values.mean(axis=0)) / safe_std


def _as_percentage(values):
    """Rescale ``values`` so they sum to 100; return zeros if the total is ~0."""
    total = np.sum(values)
    if np.isclose(total, 0.0):
        # Happens e.g. for the loss term when 2 PCs explain 100% of the variance;
        # dividing would produce NaN/inf bars.
        return np.zeros_like(values, dtype=float)
    return 100 * values / total


def _draw_gain_loss_bars(ax, labels, gained, lost, label_prefix, title, ylabel):
    """Draw one stacked gain/loss bar panel on ``ax``."""
    ax.bar(labels, gained, label=f'{label_prefix}Gain', color='tab:blue')
    ax.bar(labels, lost, bottom=gained, label=f'{label_prefix}Lost', color='tab:orange')
    ax.set_title(title)
    ax.set_xlabel('Variable Index')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)
    ax.legend()


def analyze_isom_information_loss(X_df, y_s, mapsize=None, dataset_name=""):
    """Performs the full information loss analysis and plots the results.

    Steps: standardize the features and run PCA on them, train an iSOM on the
    features plus the target, run PCA on the feature part of the trained
    codebook, derive per-variable information gain/loss, and save a two-panel
    stacked bar chart.

    Args:
        X_df: Feature table; ``.values`` and ``.columns`` are used.
        y_s: Target values; ``.values`` is reshaped into a single column and
            appended to the features for SOM training.
        mapsize: Forwarded unchanged to ``SOMFactory.build``.
        dataset_name: Used in log messages, plot titles, the training
            ``request_id`` and the output file name.

    Returns:
        Dict with keys ``"dataset"`` and ``"total_info_in_2D_PCA"`` (percentage
        of variance explained by the first two PCs of the standardized data).

    Side effects:
        Prints progress and writes ``my_results_plot_{dataset_name}.png`` to
        the current working directory.
    """
    print(f"\n--- Starting Information Loss Analysis for: {dataset_name} ---")

    # 1. PCA on Original Data
    X_orig = X_df.values
    d = X_orig.shape[1]
    X_orig_norm = _standardize_columns(X_orig)
    # Only the explained-variance ratios of the original data are used below.
    _, explained_orig = calculate_variable_importance(X_orig_norm)

    # 2. Train iSOM
    D = np.hstack([X_orig, y_s.values.reshape(-1, 1)])
    sm = SOMFactory.build(D, mapsize=mapsize, normalization='range', initialization='pca')
    sm.som_lininit()
    sm.train(request_id=f'info_loss_{dataset_name}', verbose=None)
    codebook = sm.codebook.matrix

    # 3. PCA on iSOM Codebook (last column is the target, so it is dropped)
    codebook_X = codebook[:, :-1]
    importance_map, _ = calculate_variable_importance(codebook_X)

    # 4. Compute Information Gain and Loss
    total_info_gain_pca = np.sum(explained_orig[:2]) * 100
    info_gain = total_info_gain_pca * importance_map
    # NOTE(review): this goes negative for any variable whose gain exceeds the
    # equal share 100/d -- confirm that is the intended definition of "loss".
    info_loss = (100 / d) - info_gain
    perc_info_gain = _as_percentage(info_gain)
    perc_info_loss = _as_percentage(info_loss)

    # 5. Visualization
    variable_indices = X_df.columns
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    _draw_gain_loss_bars(
        ax1, variable_indices, info_gain, info_loss,
        label_prefix='',
        title=f'Info in Variables ({dataset_name})',
        ylabel='Information',
    )
    _draw_gain_loss_bars(
        ax2, variable_indices, perc_info_gain, perc_info_loss,
        label_prefix='% ',
        title=f'% Info in Variables ({dataset_name})',
        ylabel='Percentage',
    )

    fig.suptitle(f'Information Loss Analysis for {dataset_name}', fontsize=16)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.savefig(f"my_results_plot_{dataset_name}.png", bbox_inches='tight', dpi=150)
    plt.close(fig)  # Close the figure to free up memory before the next loop

    print(f"Analysis Complete for {dataset_name}. Total info preserved in 2D PCA: {total_info_gain_pca:.2f}%")
    return {
        "dataset": dataset_name,
        "total_info_in_2D_PCA": total_info_gain_pca
    }

def read_data(path):
    """Load a tabular dataset, picking the reader from the file extension.

    Args:
        path: Path to a ``.parquet`` or ``.csv`` file. The extension is
            matched case-insensitively.

    Returns:
        A ``pandas.DataFrame``. CSV files are read with their first column as
        the index.

    Raises:
        ValueError: If the extension is neither ``.parquet`` nor ``.csv``
            (previously this case silently returned ``None``).
    """
    _, ext = os.path.splitext(path)
    ext = ext.lower()
    if ext == ".parquet":
        return pd.read_parquet(path)
    if ext == ".csv":
        return pd.read_csv(path, index_col=0)
    raise ValueError(f"Unsupported data file extension {ext!r} for path: {path}")

def load_data(base_path, dataset_name):
    """Load a dataset and its metadata from ``base_path/dataset_name/``.

    The directory is expected to hold ``{dataset_name}.meta.json`` and a data
    file named ``{dataset_name}.{meta['format']}``.

    Args:
        base_path: Directory containing one sub-directory per dataset.
        dataset_name: Name of the dataset sub-directory and file stem.

    Returns:
        Tuple ``(train_data, meta)``: whatever ``read_data`` returns for the
        data file, and the parsed metadata dict.
    """
    dataset_dir = os.path.join(base_path, dataset_name)
    meta_path = os.path.join(dataset_dir, f"{dataset_name}.meta.json")
    # JSON is UTF-8 by specification; do not rely on the platform's locale default.
    with open(meta_path, encoding="utf-8") as f:
        meta = json.load(f)
    data_path = os.path.join(dataset_dir, f"{dataset_name}.{meta['format']}")
    train_data = read_data(data_path)
    return train_data, meta

# ===================================================================
# CELL 2: Main Loop to Analyze All Datasets
# ===================================================================

# --- Configuration -------------------------------------------------------
BASE_DATA_PATH = "data"
DATASETS_TO_RUN = [
    "airfoil_cl", "airfoil_cl_m", "framed_safety", "framed_validity",
    "solar_hex", "welded_beam", "welded_beam_balanced"
]
MAX_ANALYSIS_ROWS = 1000   # larger datasets are subsampled to this many rows
SAMPLE_SEED = 42           # fixed seed so the subsample is reproducible
SOM_MAPSIZE = [20, 20]     # SOM grid size passed to the analysis

all_results = []

for dataset_name in DATASETS_TO_RUN:
    try:
        # Load the data
        full_train_data, meta = load_data(BASE_DATA_PATH, dataset_name)
        y_column = meta["label"]

        # Binarize the target if it's continuous: 1 when <= median, else 0
        if pd.api.types.is_float_dtype(full_train_data[y_column]):
            threshold = full_train_data[y_column].median()
            full_train_data[y_column] = (full_train_data[y_column] <= threshold).astype(int)

        # Use a reasonable subset for analysis to keep it fast
        analysis_data = full_train_data
        if len(full_train_data) > MAX_ANALYSIS_ROWS:
            analysis_data = full_train_data.sample(n=MAX_ANALYSIS_ROWS, random_state=SAMPLE_SEED)

        # Separate features and target (once, after any subsampling)
        X_data = analysis_data.drop(columns=[y_column])
        y_data = analysis_data[y_column]

        # Run the analysis for the current dataset
        results = analyze_isom_information_loss(
            X_data, y_data, mapsize=SOM_MAPSIZE, dataset_name=dataset_name
        )
        all_results.append(results)

    except Exception as e:
        # Best-effort batch run: report the failure and continue with the next dataset.
        print(f"!!! Could not process dataset {dataset_name}. Error: {e} !!!")

# --- 3. Print a summary of the total information preserved for each dataset ---
print("\n\n===== SUMMARY OF INFORMATION PRESERVED IN 2D PCA =====")
if all_results:
    summary_df = pd.DataFrame(all_results).set_index('dataset')
    print(summary_df.sort_values(by='total_info_in_2D_PCA', ascending=False))
else:
    # With no results there is no 'dataset' column, so set_index would raise KeyError.
    summary_df = pd.DataFrame(columns=['total_info_in_2D_PCA']).rename_axis('dataset')
    print("No datasets were processed successfully.")

 pca_linear_initialization took: 0.018000 seconds
 Factor loading: 0.985989
 som_lininit took: 0.040000 seconds



--- Starting Information Loss Analysis for: airfoil_cl ---
Analysis Complete for airfoil_cl. Total info preserved in 2D PCA: 53.85%

--- Starting Information Loss Analysis for: airfoil_cl_m ---
Analysis Complete for airfoil_cl_m. Total info preserved in 2D PCA: 53.85%

--- Starting Information Loss Analysis for: framed_safety ---
Analysis Complete for framed_safety. Total info preserved in 2D PCA: 17.88%

--- Starting Information Loss Analysis for: framed_validity ---
Analysis Complete for framed_validity. Total info preserved in 2D PCA: 16.83%

--- Starting Information Loss Analysis for: solar_hex ---
Analysis Complete for solar_hex. Total info preserved in 2D PCA: 100.00%

--- Starting Information Loss Analysis for: welded_beam ---
Analysis Complete for welded_beam. Total info preserved in 2D PCA: 51.02%

--- Starting Information Loss Analysis for: welded_beam_balanced ---
Analysis Complete for welded_beam_balanced. Total info preserved in 2D PCA: 55.86%


===== SUMMARY OF INFORMATI