In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from typing import Dict, List, Tuple, Optional
from pathlib import Path

## Phase 1. Data Preprocessing
1. Data Overview
    - Load datasets, summarize data structure and information
    - Check for missing and duplicated values
    - Plot distributions of the datasets
    - Inspect biological structure: Top features and PCA plots per omic layer
2. Normalization (data has already been normalized)
3. Transformation (data has already been transformed)
4. Imputation

In [None]:
# Load datasets
# Each table has a 'sample' column holding the row identifier; the row whose
# identifier is 'label' carries the class label of every remaining column.
proteins = pd.read_csv('data/proteins.csv')
lipids = pd.read_csv('data/lipids.csv')
metabolites = pd.read_csv('data/metabolites.csv')

# Since MOFA+ is an unsupervised method, we will not be using the labels for training the model.
# However, we will keep them separately for PCA.


def split_label_row(
    df: pd.DataFrame,
    id_col: str = 'sample',
    label_key: str = 'label',
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Separate the class-label row from the measurement rows of one omics table.

    Parameters
    ----------
    df : pd.DataFrame
        Raw table containing an identifier column `id_col` and one row whose
        identifier equals `label_key`.
    id_col : str
        Name of the identifier column; it becomes the index of the returned frame.
    label_key : str
        Identifier value that marks the label row.

    Returns
    -------
    labels : np.ndarray
        Values of the (first) label row, one per non-identifier column, in column order.
    features : pd.DataFrame
        All rows except the label row(s), indexed by `id_col`.

    Raises
    ------
    ValueError
        If no row is marked with `label_key`.
    """
    is_label = df[id_col] == label_key
    if not is_label.any():
        raise ValueError(f"No row with {id_col} == {label_key!r} found")

    # Drop the identifier column by name (not by position) so the labels always
    # line up with the columns of `features`, wherever `id_col` sits in the file.
    labels = df.loc[is_label].drop(columns=id_col).iloc[0].values
    features = df.loc[~is_label].set_index(id_col)
    return labels, features


# Split labels from the measurements; the *_nolabel frames are indexed by 'sample'
pro_labels, pro_nolabel = split_label_row(proteins)
lipid_labels, lipid_nolabel = split_label_row(lipids)
meta_labels, meta_nolabel = split_label_row(metabolites)

In [None]:
# Inspect the first few rows of the raw proteomics table (as loaded, before the label row is removed)
proteins.head()

In [None]:
# Inspect the first few rows of the raw lipidomics table (as loaded, before the label row is removed)
lipids.head()

In [None]:
# Inspect the first few rows of the raw metabolomics table (as loaded, before the label row is removed)
metabolites.head()

In [None]:
# Summarize column dtypes, non-null counts and memory usage of the raw proteomics table
proteins.info()

In [None]:
# Summarize column dtypes, non-null counts and memory usage of the raw lipidomics table
lipids.info()

In [None]:
# Summarize column dtypes, non-null counts and memory usage of the raw metabolomics table
metabolites.info()

In [None]:
# Audit the raw tables: total missing cells and presence of duplicated rows per omics layer
layer_names = ['Proteomics', 'Lipidomics', 'Metabolomics']

# Total number of NaN cells in each table (summed over all columns)
missing_pro, missing_lipid, missing_meta = (
    frame.isnull().sum().sum() for frame in (proteins, lipids, metabolites)
)

missing_values = pd.DataFrame({
    'Omics layers': layer_names,
    'Missing values count': [missing_pro, missing_lipid, missing_meta],
})

print('=== MISSING VALUES CHECK FOR EACH OMICS LAYER ===')
print(missing_values)
print('')

# True when at least one row is an exact repeat of an earlier row
boolean_pro, boolean_lipids, boolean_meta = (
    frame.duplicated().any() for frame in (proteins, lipids, metabolites)
)

boolean_1 = pd.DataFrame({
    'Omics layers': layer_names,
    'Duplicates?': [boolean_pro, boolean_lipids, boolean_meta],
})
print('=== DUPLICATES CHECK FOR EACH OMICS LAYER ===')
print(boolean_1)

In [None]:
# Plot distributions and variances for each omic layer
# Directory that receives the figures saved by the plotting calls in this notebook
output_dir = Path('plots/eda')
# parents=True creates 'plots/' if needed; exist_ok=True makes re-running this cell a no-op
output_dir.mkdir(parents=True, exist_ok=True)

def eda_plot(
    df: pd.DataFrame, 
    omic_name: str, 
    save_path: Optional[str] = None
):
    """
    Draw a four-panel overview figure for one omics layer.

    Panels, left to right:
    1. Histogram of all values pooled together (overall distribution)
    2. One box plot per column, for comparing sample-to-sample consistency and spotting outliers
    3. Kernel density estimates of the first five columns
    4. Per-row variances sorted in descending order, on a log y-axis

    Missing or unparseable values are excluded from panels 1-3 instead of
    being passed to matplotlib, which would otherwise draw empty boxes for
    any column containing NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose columns are plotted as samples and whose rows are treated
        as features. Values are coerced with `pd.to_numeric(errors='coerce')`,
        so anything non-numeric becomes NaN.
    omic_name : str
        Layer name used as the prefix of every panel title.
    save_path : str, optional
        If given, the figure is written to this path (300 dpi) before being shown.

    Returns
    -------
    None
    """
    fig, axes = plt.subplots(1, 4, figsize=(16, 3))

    df = df.apply(pd.to_numeric, errors='coerce')

    # Overall distribution: keep only finite values so the bin range is well defined
    all_values = df.to_numpy(dtype=float).ravel()
    all_values = all_values[np.isfinite(all_values)]
    axes[0].hist(all_values, bins=50, edgecolor='black')
    axes[0].set_title(f'{omic_name}: Overall Distribution')
    axes[0].set_xlabel('Value')
    axes[0].set_ylabel('Frequency')

    # Box plot across samples: quartiles of a column containing NaN are NaN,
    # which renders as an empty box, so NaNs are dropped column by column
    axes[1].boxplot([column.dropna().to_numpy() for _, column in df.items()])
    axes[1].set_title(f'{omic_name}: Sample Distributions')
    axes[1].set_xlabel('Samples')
    axes[1].set_ylabel('Value')
    axes[1].tick_params(axis='x', rotation=90)

    # Density plot of the first 5 samples
    for _, column in df.iloc[:, :5].items():
        observed = column.dropna()
        # A KDE needs at least two distinct values; skip degenerate columns
        # rather than aborting the whole figure
        if observed.nunique() < 2:
            continue
        observed.plot(kind='density', ax=axes[2], alpha=0.5)
    axes[2].set_title(f'{omic_name}: Sample Densities')
    axes[2].set_xlabel('Value')

    # Variance across features (rows), largest first; pandas skips NaN here by default
    feature_vars = df.var(axis=1).sort_values(ascending=False)
    axes[3].plot(feature_vars.values)
    axes[3].set_title(f'{omic_name}: Feature Variances')
    axes[3].set_xlabel('Feature Rank')
    axes[3].set_ylabel('Variance')
    axes[3].set_yscale('log')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to {save_path}")

    plt.show()

# Render and save the overview figure of every omics layer, in a fixed order
for layer_df, layer_name, file_name in (
    (pro_nolabel, "Proteomics", 'proteomics_distribution.png'),
    (lipid_nolabel, "Lipidomics", 'lipidomics_distribution.png'),
    (meta_nolabel, "Metabolomics", 'metabolomics_distribution.png'),
):
    eda_plot(layer_df, layer_name, save_path=str(output_dir / file_name))

In [None]:
# Identify and plot top variable features
def top_variable_features(
    df: pd.DataFrame, 
    omic_name: str, 
    id_col: str = 'sample', 
    n_top: int = 20,
    save_path: str = None,
):
    """
    Rank the rows of one omics layer by variance and plot the highest-ranked ones.

    The figure has two panels: a horizontal bar chart of the `n_top` largest
    row variances, and a heatmap of the corresponding rows across all columns.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose rows are ranked; values are coerced to numeric and
        anything unparseable becomes NaN.
    omic_name : str
        Layer name used as the prefix of both panel titles.
    id_col : str
        Accepted for interface compatibility; not used by the function body.
    n_top : int
        Number of highest-variance rows to show.
    save_path : str
        If given, the figure is written to this path (300 dpi) before being shown.

    Returns
    -------
    pd.Series
        Variances of the selected rows, indexed by row label, in descending order.
    """
    numeric = df.apply(pd.to_numeric, errors='coerce')

    # Row-wise variance, largest first, truncated to the requested number of rows
    ranked_variance = numeric.var(axis=1).sort_values(ascending=False)
    top_features = ranked_variance.head(n_top)
    bar_positions = range(len(top_features))

    fig, (bar_ax, heat_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: bar chart, inverted so the most variable row sits at the top
    bar_ax.barh(bar_positions, top_features.values)
    bar_ax.set_yticks(bar_positions)
    bar_ax.set_yticklabels(top_features.index)
    bar_ax.set(
        xlabel='Variance',
        ylabel='Features',
        title=f'{omic_name}: Top {n_top} Variable Features',
    )
    bar_ax.invert_yaxis()

    # Right panel: values of the selected rows, colour scale centred on zero
    sns.heatmap(
        numeric.loc[top_features.index],
        center=0,
        cbar_kws={'label': 'Expression'},
        ax=heat_ax,
    )
    heat_ax.set(
        title=f'{omic_name}: Top Variable Features Heatmap',
        xlabel='Samples',
        ylabel='Features',
    )

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to {save_path}")

    plt.show()

    return top_features

# Rank and plot the most variable features of each layer; keep the returned variance Series
top_pro = top_variable_features(
    df=pro_nolabel,
    omic_name="Proteomics",
    save_path=str(output_dir / 'proteomics_top_features.png'),
)
top_lipid = top_variable_features(
    df=lipid_nolabel,
    omic_name="Lipidomics",
    save_path=str(output_dir / 'lipidomics_top_features.png'),
)
top_metab = top_variable_features(
    df=meta_nolabel,
    omic_name="Metabolomics",
    save_path=str(output_dir / 'metabolomics_top_features.png'),
)

In [None]:
# Plot PCA with confidence ellipses to visualize sample clustering prior to MOFA+ integration
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib.patches import Ellipse

def confidence_ellipse(x, y, ax, n_std=2.4477, facecolor='none', edgecolor='black', linestyle='--', **kwargs):
    """
    Draw a covariance-based confidence ellipse around a 2D point cloud.

    The ellipse is centred on the mean of the points, oriented along the
    principal axes of their covariance matrix, and each semi-axis is
    `n_std` standard deviations long.
    n_std=2.4477 corresponds to ~95% confidence for 2D
    (the square root of the 95% quantile of a chi-square with 2 d.o.f.).

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points; must have the same number of elements.
        Pairs in which either coordinate is NaN or infinite are ignored.
    ax : matplotlib Axes
        Axes the ellipse patch is added to.
    n_std : float
        Semi-axis length in standard deviations.
    facecolor, edgecolor, linestyle
        Forwarded to `matplotlib.patches.Ellipse`.
    **kwargs
        Extra `Ellipse` properties; `linewidth` defaults to 2 when not given.

    Returns
    -------
    The patch returned by `ax.add_patch`, or None when fewer than three
    finite points are available (no ellipse is drawn in that case).

    Raises
    ------
    ValueError
        If `x` and `y` do not contain the same number of elements.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if x.shape != y.shape:
        raise ValueError("x and y must contain the same number of points")

    # A single non-finite coordinate would turn the whole covariance into NaN
    finite = np.isfinite(x) & np.isfinite(y)
    x, y = x[finite], y[finite]

    if len(x) < 3:
        return None

    cov = np.cov(x, y)
    mean_x, mean_y = np.mean(x), np.mean(y)

    # Eigen-decomposition of the symmetric covariance, largest eigenvalue first
    eigvals, eigvecs = np.linalg.eigh(cov)
    order = eigvals.argsort()[::-1]
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]

    # Round-off can make a zero eigenvalue slightly negative (collinear points);
    # clip so the square root below cannot produce NaN
    eigvals = np.clip(eigvals, 0.0, None)

    # Orientation of the major axis and full axis lengths
    angle = np.degrees(np.arctan2(eigvecs[1, 0], eigvecs[0, 0]))
    width, height = 2 * n_std * np.sqrt(eigvals)

    # Default line width that callers may override without a duplicate-keyword error
    kwargs.setdefault('linewidth', 2)

    # Create ellipse
    ellipse = Ellipse(
        (mean_x, mean_y),
        width=width,
        height=height,
        angle=angle,
        facecolor=facecolor,
        edgecolor=edgecolor,
        linestyle=linestyle,
        **kwargs
    )

    return ax.add_patch(ellipse)


def plot_pca_with_ellipses(
    df: pd.DataFrame,
    labels: np.ndarray,
    omic_name: str,
    save_path: Optional[str] = None,
    figsize: tuple = (10, 8),
    annotate_samples: bool = False,
    random_state: Optional[int] = 42
) -> Dict:
    """
    Plot PCA for each omics layer with 95% confidence ellipses
    
    The data are transposed to samples × features, coerced to numeric,
    mean-imputed per feature, standardized, and projected onto the first
    two principal components. Each label group is drawn in its own colour
    and, when it has at least three samples, outlined with an ellipse.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Data in format features × samples (as indexed)
    labels : np.ndarray
        Sample class labels, one per column of `df`, in column order
    omic_name : str
        Name of the omics layer (e.g., 'Proteomics')
    save_path : str
        Path to save the plot (optional)
    figsize : tuple
        Figure size (width, height)
    annotate_samples : bool
        Whether to add sample name labels
    random_state : int, optional
        Seed passed to `PCA` so that the scores are reproducible when
        scikit-learn selects a randomized SVD solver
    
    Returns:
    --------
    pca_results : dict
        Dictionary containing PCA scores, model, and variance explained
        (keys: 'scores', 'pca', 'explained_variance', 'scaler')
    
    Raises:
    -------
    ValueError
        If the number of labels differs from the number of samples
    """
    
    # Transpose to samples × features and ensure numeric
    df_numeric = df.T.apply(pd.to_numeric, errors='coerce')
    
    # Fail fast, before any fitting, when labels cannot be matched to samples
    n_samples = df_numeric.shape[0]
    if len(labels) != n_samples:
        raise ValueError(
            f"{omic_name}: received {len(labels)} labels for {n_samples} samples"
        )
    
    # Drop features (columns) that are entirely NaN; every sample (row) is kept
    df_numeric = df_numeric.dropna(axis=1, how='all')
    
    # Fill remaining NaN with the per-feature (column) mean
    df_numeric = df_numeric.fillna(df_numeric.mean())
    
    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_numeric)
    
    # PCA (seeded: the default solver choice may be a randomized one)
    pca = PCA(n_components=2, random_state=random_state)
    components = pca.fit_transform(X_scaled)
    
    # Build PCA DataFrame
    pca_df = pd.DataFrame(
        components,
        columns=['PC1', 'PC2'],
        index=df_numeric.index
    )
    pca_df['Group'] = labels
    
    # Create plot
    fig, ax = plt.subplots(figsize=figsize)
    
    # Get unique groups and colors
    unique_groups = np.unique(labels)
    colors = sns.color_palette("husl", len(unique_groups))
    color_map = dict(zip(unique_groups, colors))
    
    # Plot scatter points for each group
    for group in unique_groups:
        group_data = pca_df[pca_df['Group'] == group]
        ax.scatter(
            group_data['PC1'],
            group_data['PC2'],
            c=[color_map[group]],
            label=group,
            s=100,
            alpha=0.7,
            edgecolors='black',
            linewidth=0.5
        )
        
        # Add confidence ellipse (needs at least three points to be meaningful)
        if len(group_data) >= 3:
            confidence_ellipse(
                group_data['PC1'].values,
                group_data['PC2'].values,
                ax,
                edgecolor=color_map[group],
                linestyle='--',
                alpha=0.5
            )
    
    # Annotate samples if requested
    if annotate_samples:
        for sample_name, pc1, pc2 in zip(pca_df.index, pca_df['PC1'], pca_df['PC2']):
            ax.annotate(
                f"S{sample_name}",
                (pc1, pc2),
                xytext=(3, 3),
                textcoords='offset points',
                ha='left'
            )
    
    # Labels and title
    var1 = pca.explained_variance_ratio_[0] * 100
    var2 = pca.explained_variance_ratio_[1] * 100
    
    ax.set_xlabel(f'PC1 ({var1:.1f}%)', fontsize=12, fontweight='bold')
    ax.set_ylabel(f'PC2 ({var2:.1f}%)', fontsize=12, fontweight='bold')
    ax.set_title(f'{omic_name} PCA with 95% Confidence Ellipses', 
                 fontsize=14, fontweight='bold')
    
    # Legend
    ax.legend(loc='best', frameon=True, shadow=True)
    
    # Grid
    ax.grid(True, alpha=0.3, linestyle='--')
    
    # Tight layout
    plt.tight_layout()
    
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to {save_path}")
    
    plt.show()
    
    # Return results
    pca_results = {
        'scores': pca_df,
        'pca': pca,
        'explained_variance': pca.explained_variance_ratio_,
        'scaler': scaler
    }
    
    return pca_results


In [None]:
# Plot PCA for each omics layer and collect the results keyed by layer name
pca_results = {}

pca_inputs = (
    ('Proteomics', pro_nolabel, pro_labels, 'pca_proteomics.png'),
    ('Lipidomics', lipid_nolabel, lipid_labels, 'pca_lipidomics.png'),
    ('Metabolomics', meta_nolabel, meta_labels, 'pca_metabolomics.png'),
)

for layer_name, layer_df, layer_labels, file_name in pca_inputs:
    pca_results[layer_name] = plot_pca_with_ellipses(
        df=layer_df,
        labels=layer_labels,
        omic_name=layer_name,
        save_path=str(output_dir / file_name),
        annotate_samples=True
    )

# Report the share of variance captured by the two plotted components
print("\nVariance Explained by PC1 and PC2:")
for omic, results in pca_results.items():
    var = results['explained_variance']
    print(f"{omic}: PC1={var[0]:.2%}, PC2={var[1]:.2%}, Total={var.sum():.2%}")