# In [5]:
# ==============================================================================
# Phase 2: Tumor Microenvironment (TME) Analysis
# This script focuses on analyzing the tumor microenvironment using xCell
# deconvolution data and performing Gene Set Enrichment Analysis (GSEA).
# It now includes statistical tests (ANOVA) to assess differences in TME
# composition across various cancer types, identifies TME clusters,
# performs Random Forest classification on these clusters, and analyzes
# immune checkpoint gene expression correlations.
#
# Before running:
# 1. Ensure you have pandas, numpy, matplotlib, seaborn, scipy, scikit-learn,
#    gseapy, statsmodels, and umap-learn installed.
#    Install scikit-learn: pip install scikit-learn
#    Install statsmodels: pip install statsmodels
#    Install gseapy: pip install gseapy
#    Install umap-learn: pip install umap-learn
# 2. Make sure 'xcell_deconvoluted.csv', 'TCGA_phenotype_denseDataOnlyDownload.tsv',
#    and 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'
#    are in the same directory as this script.
# 3. Download MSigDB gene sets (e.g., 'h.all.v7.5.1.symbols.gmt' for Hallmark,
#    and 'c2.cp.kegg.v2023.1.Hs.entrez.gmt' or similar for KEGG pathways)
#    and place them in a 'results/gsea_gene_sets' directory.
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, f_oneway, levene, kruskal, ttest_ind # Added ttest_ind
import os
import gseapy as gp
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from umap import UMAP
from statsmodels.stats.multicomp import pairwise_tukeyhsd # For post-hoc test
import itertools # Added for cycling through markers
from sklearn.cluster import KMeans # For K-Means clustering
from sklearn.model_selection import train_test_split # For ML model splitting
from sklearn.ensemble import RandomForestClassifier # For Random Forest
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # For ML evaluation
import scipy.stats as stats # For spearmanr

# --- Configuration ---
# Directory the input files are read from; '.' resolves against the current
# working directory of the process.
DATA_DIR = '.'
RESULTS_DIR = 'results'
GRAPHS_DIR = 'graphs' # Output directory for the general TME plots (PNG/PDF)
PLOTS_CHECKPOINTS_DIR = os.path.join('plots', 'checkpoints') # Output directory for checkpoint plots
GSEA_GENE_SETS_DIR = os.path.join(RESULTS_DIR, 'gsea_gene_sets')

# Input file names; each is joined onto DATA_DIR when loaded
XCELL_FILE = 'xcell_deconvoluted.csv'
PHENOTYPE_FILE = 'TCGA_phenotype_denseDataOnlyDownload.tsv'
EXPRESSION_FILE = 'EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena'

# Create output directories up front (exist_ok=True makes re-runs harmless)
os.makedirs(os.path.join(RESULTS_DIR, 'tme_analysis'), exist_ok=True)
os.makedirs(os.path.join(RESULTS_DIR, 'gsea'), exist_ok=True)
os.makedirs(GRAPHS_DIR, exist_ok=True) # Create graphs directory
os.makedirs(PLOTS_CHECKPOINTS_DIR, exist_ok=True) # Create plots/checkpoints directory
os.makedirs(GSEA_GENE_SETS_DIR, exist_ok=True) # Ensure this exists if you're placing GMT files here
os.makedirs(os.path.join(DATA_DIR, 'data'), exist_ok=True) # NOTE(review): perform_tme_clustering writes cluster_labels.csv to DATA_DIR itself, not DATA_DIR/data - confirm this directory is still needed

print("Starting Phase 2: Tumor Microenvironment Analysis...")

# --- Global DataFrames to be loaded and used across sections ---
# Placeholders; load_and_explore_data() assigns all of them.
df_xcell = None # xCell scores, transposed on load (samples as rows, cell types as columns)
df_pheno = None # Phenotype data ('_primary_disease' is renamed to 'cancer_type' when present)
df_tme = None # Merged xCell and phenotype (samples as rows, cell types + cancer_type as columns)
df_expr = None # Gene expression table as read from EXPRESSION_FILE (presumably genes as rows, samples as columns - confirm)
df_xcell_aligned_for_gsea = None # xCell rows restricted to samples that have a cancer_type annotation
df_pheno_aligned_for_gsea = None # Phenotype rows for the same samples as df_xcell_aligned_for_gsea
df_annotated = None # xCell scores plus 'cancer_type'; perform_tme_clustering adds cluster columns to it

# ==============================================================================
# 2.1 Data Loading and Initial Exploration
# This section is responsible for loading all necessary data files and
# performing initial data alignment and basic checks.
# ==============================================================================
print("\n--- 2.1 Data Loading and Initial Exploration ---")

def load_and_explore_data():
    """
    Load the xCell, phenotype and gene-expression tables and align them by sample ID.

    Files are read from DATA_DIR using XCELL_FILE, PHENOTYPE_FILE and
    EXPRESSION_FILE. The xCell table is transposed after reading, so the
    labels that were columns in the file become the row index (samples) and
    the file's row labels become columns (cell types). A phenotype column
    named '_primary_disease' is renamed to 'cancer_type'.

    All aligned outputs (df_tme and the two *_aligned frames) list samples in
    the order they appear in the xCell table. Building the intersection with
    Python sets, as was done previously, yields an arbitrary order that can
    change between interpreter runs and makes order-sensitive downstream
    steps irreproducible.

    Side effects:
        Assigns the module-level globals df_xcell, df_pheno, df_tme, df_expr,
        df_xcell_aligned_for_gsea, df_pheno_aligned_for_gsea and df_annotated,
        and prints progress/diagnostic messages.

    Returns:
        tuple: (df_xcell, df_pheno, df_tme, df_expr, df_xcell_aligned, df_pheno_aligned, df_annotated)
               Returns (None, None, None, None, None, None, None) if a file is
               missing, the cancer-type column is absent, or any other error occurs.
    """
    global df_xcell, df_pheno, df_tme, df_expr, df_xcell_aligned_for_gsea, df_pheno_aligned_for_gsea, df_annotated

    failure = (None, None, None, None, None, None, None)

    try:
        # Load the xCell deconvolution output and transpose it so that
        # samples are rows and cell types are columns.
        df_xcell_raw = pd.read_csv(os.path.join(DATA_DIR, XCELL_FILE), index_col=0)
        df_xcell = df_xcell_raw.T

        # Clean column names (remove leading/trailing whitespace)
        df_xcell.columns = df_xcell.columns.str.strip()

        # Confirm
        print(f"Shape of xCell matrix (samples x cell types): {df_xcell.shape}")
        print("First 5 rows of xCell data:")
        print(df_xcell.head())

        # Basic overview
        print(f"\nShape of xCell data (samples × cell types): {df_xcell.shape}")
        print(f"Number of cell types: {df_xcell.shape[1]}")
        print(f"Number of samples: {df_xcell.shape[0]}")
        print("Any missing values in xCell data:", df_xcell.isnull().sum().sum())

        # Load phenotype data (tab-separated) to merge with the xCell data
        df_pheno = pd.read_csv(os.path.join(DATA_DIR, PHENOTYPE_FILE), index_col=0, sep='\t')

        # Normalise the cancer-type column name; without it nothing downstream can run.
        if '_primary_disease' in df_pheno.columns:
            df_pheno = df_pheno.rename(columns={'_primary_disease': 'cancer_type'})
            print("Renamed '_primary_disease' to 'cancer_type' in phenotype data.")
        elif 'cancer_type' not in df_pheno.columns:
            print(f"Warning: 'cancer_type' or '_primary_disease' column not found in {PHENOTYPE_FILE}.")
            print("Please check your phenotype file and ensure it contains a column for cancer types.")
            return failure

        # Sample IDs present in both tables, kept in xCell row order so the
        # result is identical on every run.
        common_samples = df_xcell.index[df_xcell.index.isin(df_pheno.index)].unique()
        if common_samples.empty:
            print("Warning: no sample IDs are shared between the xCell and phenotype data; check the ID formats.")

        df_xcell_aligned_for_gsea = df_xcell.loc[common_samples]
        df_pheno_aligned_for_gsea = df_pheno.loc[common_samples]

        # Merge xCell data with the cancer type of each sample
        df_tme = pd.merge(
            df_xcell_aligned_for_gsea,
            df_pheno_aligned_for_gsea[['cancer_type']],
            left_index=True,
            right_index=True,
        )

        # df_annotated: every xCell sample that has a non-missing cancer type
        df_annotated = df_xcell.copy()
        df_annotated["cancer_type"] = df_pheno["cancer_type"].reindex(df_xcell.index)
        df_annotated.dropna(subset=['cancer_type'], inplace=True)

        # Restrict the aligned frames to the samples that survived the
        # cancer_type filter, again preserving xCell order.
        common_samples_final = common_samples[common_samples.isin(df_annotated.index)]
        df_xcell_aligned_for_gsea = df_xcell.loc[common_samples_final]
        df_pheno_aligned_for_gsea = df_pheno.loc[common_samples_final]
        df_tme = df_tme.loc[common_samples_final]

        # Load the gene expression table (tab-separated)
        df_expr = pd.read_csv(os.path.join(DATA_DIR, EXPRESSION_FILE), index_col=0, sep='\t')

        print(f"Shape of merged TME data (samples x cell types + phenotype): {df_tme.shape}")
        print("First 5 rows of merged TME data:")
        print(df_tme.head())
        print(f"Shape of loaded expression data: {df_expr.shape}")
        print(f"Shape of df_annotated (for clustering/RF): {df_annotated.shape}")

        return df_xcell, df_pheno, df_tme, df_expr, df_xcell_aligned_for_gsea, df_pheno_aligned_for_gsea, df_annotated

    except FileNotFoundError as e:
        print(f"Error loading data: {e}. Please ensure all necessary files are in the '{DATA_DIR}' directory.")
        return failure
    except Exception as e:
        # Top-level boundary for the loading step: report with a full
        # traceback and signal failure to the caller instead of crashing.
        print(f"An error occurred during data loading: {e}")
        import traceback
        traceback.print_exc()
        return failure

# --- Main execution flow for loading data ---
df_xcell, df_pheno, df_tme, df_expr, df_xcell_aligned_for_gsea, df_pheno_aligned_for_gsea, df_annotated = load_and_explore_data()

# Check if dataframes are loaded successfully before proceeding with analysis functions
if df_xcell is None or df_pheno is None or df_tme is None or df_expr is None or df_annotated is None:
    print("Essential dataframes not loaded due to errors. Cannot proceed with analysis functions in Phase 2.")
else:
    # ==============================================================================
    # 2.2 Tumor Microenvironment Composition Analysis (Visualizations)
    # This section focuses on visualizing the overall TME composition and
    # distributions of TME scores across cancer types.
    # ==============================================================================
    print("\n--- 2.2 Tumor Microenvironment Composition Analysis (Visualizations) ---")

    def analyze_tme_composition(df_xcell_data, df_tme_data):
        """
        Plot the overall TME composition and pick the most variable cell types.

        Saves three kinds of figures under RESULTS_DIR/tme_analysis:
        a bar plot of the 20 cell types with the highest mean score, one box
        plot per composite score (ImmuneScore, StromaScore,
        MicroenvironmentScore) across cancer types, and a heatmap of the
        per-cancer-type mean of the 10 cell types with the largest standard
        deviation.

        Args:
            df_xcell_data: xCell scores with samples as rows and cell types
                (plus optional composite score columns) as columns.
            df_tme_data: the same scores with an added 'cancer_type' column.

        Returns:
            tuple: (score_columns, top_variable_cell_types) where
                score_columns is the list of composite score names and
                top_variable_cell_types is an Index of up to 10 cell types.
                The Index is empty when variability could not be computed
                (no cell-type columns, or fewer than two samples).
        """
        score_columns = ['ImmuneScore', 'StromaScore', 'MicroenvironmentScore']

        # Mean abundance of each cell type across all samples (composite scores excluded)
        mean_cell_abundance = df_xcell_data.drop(columns=score_columns, errors='ignore').mean().sort_values(ascending=False)
        print("\nTop 10 most abundant cell types:")
        print(mean_cell_abundance.head(10))

        # Visualize overall TME composition (top N cell types).
        # hue mirrors the categorical axis so that `palette` is applied without
        # seaborn's "palette without hue" deprecation warning.
        top_abundant = mean_cell_abundance.head(20)
        plt.figure(figsize=(12, 8))
        sns.barplot(x=top_abundant.values, y=top_abundant.index, hue=top_abundant.index, palette='viridis', legend=False)
        plt.title('Overall Abundance of Top 20 Immune and Stromal Cell Types', fontsize=16, weight='bold')
        plt.xlabel('Mean Abundance Score', fontsize=12)
        plt.ylabel('Cell Type', fontsize=12)
        plt.grid(axis='x', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR, 'tme_analysis', 'overall_cell_abundance.png'), dpi=300)
        plt.close()
        print("Overall cell abundance plot saved.")

        # Distribution of each composite score across cancer types
        for score in score_columns:
            if score not in df_tme_data.columns:
                print(f"Warning: {score} not found in xCell data for distribution plot.")
                continue
            plt.figure(figsize=(14, 7))
            sns.boxplot(x='cancer_type', y=score, hue='cancer_type', data=df_tme_data, palette='coolwarm', legend=False)
            plt.title(f'{score} Distribution Across Cancer Types', fontsize=16, weight='bold')
            plt.xlabel('Cancer Type', fontsize=12)
            plt.ylabel(score, fontsize=12)
            plt.xticks(rotation=90, fontsize=10)
            plt.yticks(fontsize=10)
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig(os.path.join(RESULTS_DIR, 'tme_analysis', f'{score.lower()}_distribution.png'), dpi=300)
            plt.close()
            print(f"{score} distribution plot saved.")

        # Heatmap of the 10 most variable cell types across cancer types.
        # Bound up front so the return below is valid on every path.
        top_variable_cell_types = pd.Index([])
        cell_type_cols = df_xcell_data.columns.drop(score_columns, errors='ignore')

        if cell_type_cols.empty:
            print("No individual cell types found for variability analysis heatmap.")
        elif df_xcell_data.shape[0] <= 1:
            # A standard deviation needs at least two samples
            print("Not enough samples in df_xcell_data to compute standard deviation for variability analysis heatmap.")
        else:
            top_variable_cell_types = df_xcell_data[cell_type_cols].std().sort_values(ascending=False).head(10).index
            # Only plot cell types that are actually present in the merged frame
            valid_top_variable_cell_types = [col for col in top_variable_cell_types if col in df_tme_data.columns]

            if valid_top_variable_cell_types:
                # Rows: cell types, columns: cancer types
                df_heatmap = df_tme_data[['cancer_type'] + valid_top_variable_cell_types].groupby('cancer_type').mean().T

                plt.figure(figsize=(14, 10))
                sns.heatmap(df_heatmap, cmap='YlGnBu', annot=True, fmt=".2f", linewidths=.5, linecolor='lightgray')
                plt.title('Mean Abundance of Top 10 Variable Cell Types Across Cancer Types', fontsize=16, weight='bold')
                plt.xlabel('Cancer Type', fontsize=12)
                plt.ylabel('Cell Type', fontsize=12)
                plt.xticks(rotation=45, ha='right', fontsize=10)
                plt.yticks(fontsize=10)
                plt.tight_layout()
                plt.savefig(os.path.join(RESULTS_DIR, 'tme_analysis', 'top_variable_cell_types_heatmap.png'), dpi=300)
                plt.close()
                print("Heatmap of top variable cell types saved.")
            else:
                print("No valid individual cell types found for variability analysis heatmap after filtering.")

        return score_columns, top_variable_cell_types # Consumed by the statistical tests

    score_cols_returned, top_var_cells_returned = analyze_tme_composition(df_xcell, df_tme)


    # ==============================================================================
    # 2.3 Statistical Tests: ANOVA for TME Scores and Cell Types across Cancer Types
    # This section performs ANOVA and post-hoc Tukey HSD tests to identify
    # statistically significant differences in TME features between cancer types.
    # ==============================================================================
    print("\n--- 2.3 Statistical Tests: ANOVA for TME Scores and Cell Types ---")

    def perform_statistical_tests(df_tme_data, score_columns_list, top_variable_cell_types_list):
        """
        Test whether TME features differ between cancer types.

        For every requested feature present in df_tme_data this runs Levene's
        test (reported only), a one-way ANOVA, and - when the ANOVA p-value is
        below 0.05 - a Tukey HSD post-hoc comparison. Cancer types with fewer
        than two non-missing observations are excluded, and the same filtered
        data is used for all three tests so the post-hoc comparison covers
        exactly the groups the ANOVA saw.

        The ANOVA summary is printed and written to
        RESULTS_DIR/tme_analysis/anova_results_tme.csv. p-values are reported
        as computed, without multiple-testing correction.

        Args:
            df_tme_data: samples x features frame containing a 'cancer_type' column.
            score_columns_list: composite score column names to test.
            top_variable_cell_types_list: cell-type column names to test
                (any iterable, e.g. a list or a pandas Index).

        Returns:
            None
        """
        # Features to test: TME scores followed by the selected cell types,
        # limited to columns that exist in the data.
        candidate_features = list(score_columns_list) + list(top_variable_cell_types_list)
        features_to_test = [f for f in candidate_features if f in df_tme_data.columns]

        if not features_to_test:
            print("No valid TME features to perform ANOVA. Skipping statistical tests.")
            return

        # ANOVA needs at least 2 cancer types
        if len(df_tme_data['cancer_type'].unique()) < 2:
            print("Not enough cancer types (less than 2) to perform ANOVA. Skipping statistical tests.")
            return

        anova_results = []

        for feature in features_to_test:
            # Rows with a value for this feature and a known cancer type
            feature_data = df_tme_data[[feature, 'cancer_type']].dropna()

            # Keep only cancer types with at least 2 observations (needed for a variance)
            group_sizes = feature_data['cancer_type'].value_counts()
            usable_types = group_sizes.index[group_sizes > 1]
            feature_data = feature_data[feature_data['cancer_type'].isin(usable_types)]

            # One Series per cancer type; the length check also discards
            # empty groups that a categorical dtype could produce.
            groups = [values for _, values in feature_data.groupby('cancer_type')[feature] if len(values) > 1]

            if len(groups) < 2:
                print(f"Skipping ANOVA for '{feature}': Less than 2 valid groups with enough data after dropping NaNs.")
                continue

            # Levene's test for homogeneity of variances (informational only)
            try:
                levene_stat, levene_p = levene(*groups)
                print(f"\nLevene's Test for {feature}: Statistic={levene_stat:.3f}, p={levene_p:.3e}")
                if levene_p < 0.05:
                    print(f"  Warning: Variances are not equal for {feature} (p < 0.05).")
            except Exception as e:
                print(f"Could not perform Levene's test for {feature}: {e}")

            # One-way ANOVA
            try:
                f_stat, p_val = f_oneway(*groups)
            except Exception as e:
                print(f"Error performing ANOVA for {feature}: {e}")
                continue

            anova_results.append({'Feature': feature, 'F-statistic': f_stat, 'p-value': p_val})
            print(f"ANOVA for '{feature}': F-statistic={f_stat:.3f}, p-value={p_val:.3e}")

            if not p_val < 0.05:
                print(f"  No significant differences found for {feature} (p >= 0.05). No post-hoc test performed.")
                continue

            # Significant ANOVA: Tukey HSD on the same groups used above
            print(f"  Significant differences found for {feature} (p < 0.05). Performing Tukey HSD post-hoc test...")
            try:
                tukey_hsd = pairwise_tukeyhsd(endog=feature_data[feature],
                                              groups=feature_data['cancer_type'],
                                              alpha=0.05)
                print(tukey_hsd) # Print the summary table
                print(f"  Tukey HSD results for {feature} printed.")
            except Exception as e:
                print(f"  Error performing Tukey HSD for {feature}: {e}")

        df_anova_results = pd.DataFrame(anova_results)
        if df_anova_results.empty:
            print("No ANOVA results to display or save.")
            return

        df_anova_results = df_anova_results.sort_values(by='p-value')
        print("\nSummary of ANOVA Results (sorted by p-value):")
        print(df_anova_results)
        df_anova_results.to_csv(os.path.join(RESULTS_DIR, 'tme_analysis', 'anova_results_tme.csv'), index=False)
        print("ANOVA results saved to 'anova_results_tme.csv'.")

    perform_statistical_tests(df_tme, score_cols_returned, top_var_cells_returned)


    # ==============================================================================
    # 2.4 Dimension Reduction (PCA & UMAP) on TME Data
    # This section applies dimensionality reduction techniques to visualize
    # the TME landscape based on cell type abundance.
    # ==============================================================================
    print("\n--- 2.4 Dimension Reduction (PCA & UMAP) on TME Data ---")

    def perform_dimension_reduction(df_tme_data, score_columns_list):
        """
        Embed the standardized TME features in 2-D with PCA and UMAP and save both scatter plots.

        The feature matrix is df_tme_data without the composite score columns
        and without 'cancer_type'. Points are coloured and marked by cancer
        type. Figures are written to RESULTS_DIR/tme_analysis as
        tme_pca_plot.png and tme_umap_plot.png.

        Args:
            df_tme_data: samples x features frame containing a 'cancer_type' column.
            score_columns_list: list of composite score columns to leave out.

        Returns:
            None
        """
        # Everything except the composite scores and the label column is a feature
        feature_matrix = df_tme_data.drop(columns=score_columns_list + ['cancer_type'], errors='ignore')
        if feature_matrix.empty:
            print("No TME features available for dimensionality reduction. Skipping PCA/UMAP.")
            return

        # z-score each feature
        standardized = pd.DataFrame(
            StandardScaler().fit_transform(feature_matrix),
            index=feature_matrix.index,
            columns=feature_matrix.columns,
        )

        # One filled marker per cancer type; the marker list repeats when
        # there are more cancer types than markers.
        filled_markers = ['o', 's', '^', 'D', 'v', 'P', '*', 'X', 'h', 'p', '8', 'H']
        cancer_type_markers = dict(zip(df_tme_data['cancer_type'].unique(), itertools.cycle(filled_markers)))

        def _labelled_embedding(coordinates, axis_names):
            """Wrap 2-D coordinates in a frame and attach each sample's cancer type."""
            frame = pd.DataFrame(data=coordinates, index=df_tme_data.index, columns=axis_names)
            return pd.merge(frame, df_tme_data[['cancer_type']], left_index=True, right_index=True)

        def _save_scatter(embedding, x_col, y_col, title, x_label, y_label, file_name):
            """Draw one cancer-type scatter plot and write it to the TME results folder."""
            plt.figure(figsize=(12, 10))
            sns.scatterplot(
                data=embedding,
                x=x_col,
                y=y_col,
                hue='cancer_type',
                style='cancer_type',
                palette='tab20',
                s=70,
                alpha=0.8,
                edgecolor='black',
                markers=cancer_type_markers,
            )
            plt.title(title, fontsize=18, weight='bold')
            plt.xlabel(x_label, fontsize=14)
            plt.ylabel(y_label, fontsize=14)
            plt.legend(title='Cancer Type', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., fontsize=10, title_fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.6)
            sns.despine()
            plt.tight_layout()
            plt.savefig(os.path.join(RESULTS_DIR, 'tme_analysis', file_name), dpi=600)
            plt.close()

        # --- PCA ---
        pca = PCA(n_components=2)
        pca_embedding = _labelled_embedding(pca.fit_transform(standardized), ['PC1', 'PC2'])
        _save_scatter(
            pca_embedding,
            'PC1',
            'PC2',
            'PCA of Tumor Microenvironment Cell Abundance',
            f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}%)',
            f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}%)',
            'tme_pca_plot.png',
        )
        print("TME PCA plot saved with improved aesthetics.")

        # --- UMAP ---
        umap_reducer = UMAP(n_components=2, random_state=42)
        umap_embedding = _labelled_embedding(umap_reducer.fit_transform(standardized), ['UMAP1', 'UMAP2'])
        _save_scatter(
            umap_embedding,
            'UMAP1',
            'UMAP2',
            'UMAP of Tumor Microenvironment Cell Abundance',
            'UMAP Component 1',
            'UMAP Component 2',
            'tme_umap_plot.png',
        )
        print("TME UMAP plot saved with improved aesthetics.")

    perform_dimension_reduction(df_tme, score_cols_returned)


    # ==============================================================================
    # 2.5 Additional TME Visualizations (Violin, Variability, Presence)
    # These plots provide further insights into the distribution and
    # characteristics of xCell-inferred cell types.
    # ==============================================================================
    print("\n--- 2.5 Additional TME Visualizations (Violin, Variability, Presence) ---")

    def plot_additional_tme_visualizations(df_xcell_data, df_tme_data):
        """
        Save four overview figures of the xCell scores to GRAPHS_DIR (PNG at 600 dpi plus PDF).

        1. Violin plot of every column's score distribution, ordered by median.
        2. Bar plot of the 10 columns with the largest standard deviation.
        3. Bar plot of how many samples have a score > 0 for each column.
        4. Heatmap of the per-cancer-type mean of every numeric column.

        Args:
            df_xcell_data: xCell scores, samples as rows (used for plots 1-3).
            df_tme_data: the scores plus a 'cancer_type' column (used for plot 4).

        Returns:
            None
        """
        def _write_png_and_pdf(png_name, pdf_name):
            """Tighten the layout, write the current figure in both formats, then close it."""
            plt.tight_layout()
            plt.savefig(os.path.join(GRAPHS_DIR, png_name), dpi=600)
            plt.savefig(os.path.join(GRAPHS_DIR, pdf_name))
            plt.close()

        # ---- 1. Violin plot of all score distributions ----
        # Long format: one row per (sample, cell type) pair
        long_scores = (
            df_xcell_data
            .reset_index()
            .melt(id_vars='index')
            .set_axis(['Sample', 'Cell_Type', 'Abundance'], axis=1)
        )

        # Highest median first on the x axis
        median_order = (
            long_scores
            .groupby("Cell_Type")["Abundance"]
            .median()
            .sort_values(ascending=False)
            .index
        )

        plt.figure(figsize=(24, 6))
        sns.violinplot(
            data=long_scores,
            x='Cell_Type',
            y='Abundance',
            hue='Cell_Type',
            order=median_order,
            palette='husl',
            density_norm='width',
            linewidth=0.8,
            legend=False
        )
        plt.title("Distribution of xCell-Inferred Cell Types Across TCGA Tumors", fontsize=16, weight='bold')
        plt.xlabel("Cell Type", fontsize=13)
        plt.ylabel("xCell Score", fontsize=13)
        plt.xticks(rotation=90, fontsize=10)
        plt.yticks(fontsize=11)
        plt.grid(axis='y', linestyle='--', alpha=0.4)
        sns.despine()
        _write_png_and_pdf("TME_Violin_All_Celltypes_HQ.png", "TME_Violin_All_Celltypes_HQ.pdf")
        print("Violin plot of all cell types saved.")

        # ---- 2. Ten most variable columns ----
        most_variable = (
            df_xcell_data
            .std()
            .sort_values(ascending=False)
            .head(10)
            .reset_index()
            .set_axis(['Cell_Type', 'Standard_Deviation'], axis=1)
        )

        plt.figure(figsize=(10, 5))
        sns.barplot(
            data=most_variable,
            x='Cell_Type',
            y='Standard_Deviation',
            hue='Cell_Type',
            palette='viridis',
            dodge=False,
            legend=False
        )
        plt.title("Top 10 Most Variable Cell Types (xCell Scores)", fontsize=16, weight='bold')
        plt.ylabel("Standard Deviation", fontsize=13)
        plt.xlabel("Cell Type", fontsize=13)
        plt.xticks(rotation=45, ha='right', fontsize=11)
        plt.yticks(fontsize=11)
        plt.grid(axis='y', linestyle='--', alpha=0.4)
        sns.despine()
        _write_png_and_pdf("TME_Top10_Variable_Celltypes_Barplot.png", "TME_Top10_Variable_Celltypes_Barplot.pdf")
        print("Bar plot of top 10 variable cell types saved.")

        # ---- 3. Detection frequency (number of samples with a score above zero) ----
        detection_counts = (
            df_xcell_data
            .gt(0)
            .sum()
            .sort_values(ascending=False)
            .reset_index()
            .set_axis(['Cell_Type', 'Presence_Count'], axis=1)
        )

        plt.figure(figsize=(14, 5))
        sns.barplot(
            data=detection_counts,
            x='Cell_Type',
            y='Presence_Count',
            hue='Cell_Type',
            palette='cubehelix',
            dodge=False,
            legend=False
        )
        plt.title("Frequency of Cell Type Detection Across Samples", fontsize=14, weight='bold')
        plt.ylabel("Samples\nwith Non-Zero xCell Score", fontsize=10, labelpad=15)
        plt.xlabel("Cell Type", fontsize=12)
        plt.xticks(rotation=90, fontsize=10)
        plt.yticks(fontsize=11)
        plt.grid(axis='y', linestyle='--', alpha=0.4)
        sns.despine()
        _write_png_and_pdf("TME_Celltype_Presence_Counts.png", "TME_Celltype_Presence_Counts.pdf")
        print("Cell type presence count plot saved.")

        # ---- 4. Heatmap: mean score per cancer type ----
        # Rows become cell types and columns cancer types after the transpose;
        # numeric_only skips any non-numeric columns in the merged frame.
        mean_by_cancer_type = df_tme_data.groupby("cancer_type").mean(numeric_only=True).T

        plt.figure(figsize=(18, 10))
        sns.heatmap(mean_by_cancer_type, cmap="coolwarm", linewidths=0.3, linecolor='gray')
        plt.title("TME Landscape: All Cell Types vs TCGA Cancer Types", fontsize=15, weight='bold')
        plt.xlabel("Cancer Type", fontsize=12)
        plt.ylabel("Cell Type", fontsize=12)
        plt.xticks(rotation=90, fontsize=10)
        plt.yticks(fontsize=10)
        _write_png_and_pdf("TME_Heatmap_AllCells_vs_CancerTypes.png", "TME_Heatmap_AllCells_vs_CancerTypes.pdf")
        print("Heatmap of TME landscape saved.")

    plot_additional_tme_visualizations(df_xcell, df_tme) # Updated function call


    # ==============================================================================
    # 2.6 TME Clustering and Visualization (KMeans & UMAP)
    # This section identifies distinct TME clusters and visualizes them.
    # ==============================================================================
    print("\n--- 2.6 TME Clustering and Visualization (KMeans & UMAP) ---")

    def perform_tme_clustering(df_annotated_data):
        """Cluster samples on their TME features and save the cluster outputs.

        Standardizes the numeric columns of ``df_annotated_data`` (minus the
        xCell summary scores and any existing ``TME_Cluster`` column), runs
        KMeans with 3 clusters, writes the assignments to
        ``cluster_labels.csv`` in ``DATA_DIR``, and saves a UMAP scatter plot
        and a top-marker heatmap to ``GRAPHS_DIR``.

        Parameters
        ----------
        df_annotated_data : pandas.DataFrame
            Table whose rows are clustered. It is modified in place: the
            columns ``TME_Cluster`` and ``Cluster_Label`` are added (or
            overwritten).

        Returns
        -------
        pandas.DataFrame or None
            The same DataFrame with the cluster columns added, or ``None``
            when no numeric feature is available for clustering.
        """
        # Cluster purely on numeric features; non-numeric columns such as the
        # cancer type are dropped by the dtype filter.
        X_clustering = df_annotated_data.select_dtypes(include=np.number).copy()

        # Leave out the summary scores, and also a TME_Cluster column left over
        # from a previous call on the same frame, so that re-running the
        # function never clusters on its own earlier output.
        columns_to_exclude = ['ImmuneScore', 'StromaScore', 'MicroenvironmentScore', 'TME_Cluster']
        X_clustering = X_clustering.drop(
            columns=[col for col in columns_to_exclude if col in X_clustering.columns],
            errors='ignore'
        )

        # DataFrame.empty is True when either axis has length zero.
        if X_clustering.empty:
            print("No numeric TME features available for clustering. Skipping KMeans and UMAP for clusters.")
            return None

        # Standardize the features so every column contributes on the same scale.
        X_scaled = StandardScaler().fit_transform(X_clustering)

        # KMeans with 3 clusters; n_init is set explicitly.
        kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
        df_annotated_data["TME_Cluster"] = kmeans.fit_predict(X_scaled)

        # NOTE(review): KMeans cluster indices are arbitrary. This fixed mapping
        # presumably reflects an earlier inspection of the cluster profiles;
        # confirm it still holds whenever the input data or the seed change.
        cluster_labels = {
            0: "Stromal-Immune Mixed",
            1: "Stromal-rich",
            2: "Immune-active"
        }
        df_annotated_data["Cluster_Label"] = df_annotated_data["TME_Cluster"].map(cluster_labels)
        print("TME Clusters identified and labeled.")

        # Save cluster labels to CSV (index written out as 'sample_id').
        cluster_df_to_save = df_annotated_data[['TME_Cluster', 'Cluster_Label']].copy()
        cluster_df_to_save.index.name = 'sample_id'
        cluster_labels_path = os.path.join(DATA_DIR, "cluster_labels.csv")
        cluster_df_to_save.to_csv(cluster_labels_path, index=True)
        print(f"Cluster labels saved to {cluster_labels_path}")

        # Features shared by the UMAP and the marker heatmap: every numeric
        # column except the cluster index. Unlike the clustering input, this
        # keeps any summary score columns that are present.
        feature_columns = df_annotated_data.select_dtypes(include=np.number).columns.drop(
            ['TME_Cluster'], errors='ignore'
        )
        df_features = df_annotated_data[feature_columns]

        # Standardize and embed with UMAP.
        X_scaled_umap = StandardScaler().fit_transform(df_features)
        reducer = UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
        embedding = reducer.fit_transform(X_scaled_umap)

        plt.figure(figsize=(10, 7))
        sns.scatterplot(
            x=embedding[:, 0],
            y=embedding[:, 1],
            hue=df_annotated_data["Cluster_Label"],
            palette="Set2",
            s=60,
            edgecolor="black",
            style=df_annotated_data["Cluster_Label"]  # distinct marker shape per cluster
        )
        plt.title("UMAP of Tumor Microenvironment Clusters (Annotated)", fontsize=14, weight='bold')
        plt.xlabel("UMAP 1")
        plt.ylabel("UMAP 2")
        plt.legend(title="TME Cluster", loc="best", fontsize=10)
        plt.grid(True, linestyle='--', alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(GRAPHS_DIR, "TME_UMAP_Annotated_Clusters.png"), dpi=600)
        plt.close()
        print("UMAP plot of TME clusters saved.")

        # Mean feature value per cluster (rows = features, columns = clusters).
        cluster_profiles = df_features.groupby(df_annotated_data["TME_Cluster"]).mean().T

        if not cluster_profiles.empty:
            # Union of the top 5 features of each cluster, collected in a list
            # (cluster by cluster, highest mean first) so the heatmap row order
            # is the same on every run; iterating a set of strings is not.
            top_unique = []
            for cluster_id in cluster_profiles.columns:
                ranked = cluster_profiles[cluster_id].sort_values(ascending=False)
                for cell_type in ranked.head(5).index:
                    if cell_type not in top_unique:
                        top_unique.append(cell_type)

            plt.figure(figsize=(10, 6))
            sns.heatmap(cluster_profiles.loc[top_unique], cmap="YlGnBu", annot=True, fmt=".2f")
            plt.title("Top Marker Cell Types per TME Cluster", fontsize=14, weight='bold')
            plt.xlabel("TME Cluster")
            plt.ylabel("Cell Type")
            plt.tight_layout()
            plt.savefig(os.path.join(GRAPHS_DIR, "TME_TopMarkers_per_Cluster.png"), dpi=600)
            plt.close()
            print("Heatmap of top marker cell types per cluster saved.")
        else:
            print("No cluster profiles to generate heatmap for top markers.")

        return df_annotated_data

    df_annotated = perform_tme_clustering(df_annotated)


    # ==============================================================================
    # 2.7 Random Forest Classification for TME Cluster Prediction
    # This section trains a Random Forest model to predict TME clusters
    # and evaluates its performance.
    # ==============================================================================
    print("\n--- 2.7 Random Forest Classification for TME Cluster Prediction ---")

    def perform_rf_classification(df_annotated_data):
        """Train a Random Forest to predict ``TME_Cluster`` and report on it.

        Uses every numeric column except ``TME_Cluster`` as features, holds out
        20% of the rows (stratified by cluster) for evaluation, prints accuracy
        and a classification report, and saves two bar plots of the 20 most
        important features to ``GRAPHS_DIR`` plus an annotated CSV to
        ``RESULTS_DIR``.

        Parameters
        ----------
        df_annotated_data : pandas.DataFrame or None
            Annotated table carrying a ``TME_Cluster`` column. ``None`` is
            accepted because the clustering step returns ``None`` when it is
            skipped.

        Returns
        -------
        tuple
            ``(clf, X_rf)``: the fitted classifier and the full feature matrix,
            or ``(None, None)`` when classification is skipped.
        """
        # The clustering step hands back None on its skip path; treat that the
        # same as a missing cluster column instead of failing on `.columns`.
        if df_annotated_data is None or "TME_Cluster" not in df_annotated_data.columns:
            print("TME_Cluster column not found. Skipping Random Forest Classification.")
            return None, None

        # 1. Features and labels. 'Cluster_Label' is non-numeric, so the dtype
        #    filter already keeps it out of the feature matrix.
        X_rf = df_annotated_data.select_dtypes(include=np.number).drop(columns=["TME_Cluster"], errors='ignore')
        y_rf = df_annotated_data["TME_Cluster"]

        if X_rf.empty or y_rf.empty:
            print("Insufficient data for Random Forest Classification. Skipping.")
            return None, None

        # 2. Train/test split, stratified so each cluster appears in both parts.
        X_train, X_test, y_train, y_test = train_test_split(
            X_rf, y_rf, test_size=0.2, random_state=42, stratify=y_rf
        )

        # 3. Train Random Forest
        clf = RandomForestClassifier(n_estimators=200, random_state=42)
        clf.fit(X_train, y_train)
        print("Random Forest Classifier trained.")

        # 4. Predict and evaluate on the held-out rows
        y_pred = clf.predict(X_test)
        print("\nRandom Forest Classification Report:")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))

        # Keep the 20 most important features; copy so that adding the 'Type'
        # column below works on an independent frame.
        feature_df = (
            pd.DataFrame({'Feature': X_rf.columns, 'Importance': clf.feature_importances_})
            .sort_values(by='Importance', ascending=False)
            .head(20)
            .copy()
        )

        # Plot Top 20 Predictive Cell Types for TME Cluster.
        # hue is bound to the y variable (legend suppressed) because passing a
        # palette without hue is deprecated in seaborn 0.13+.
        plt.figure(figsize=(10, 6))
        sns.barplot(
            data=feature_df, x='Importance', y='Feature',
            hue='Feature', palette='viridis', legend=False
        )
        plt.title("Top 20 Predictive Cell Types for TME Cluster", fontsize=14, weight='bold')
        plt.xlabel("Random Forest Feature Importance", fontsize=12)
        plt.ylabel("Cell Type", fontsize=12)
        plt.tight_layout()
        plt.savefig(os.path.join(GRAPHS_DIR, "TME_RF_Top20_Features.png"), dpi=600)
        plt.close()
        print("Top 20 RF features bar plot saved.")

        # Feature groups used to color the categorized importance plot.
        immune_cell_types = [
            'aDC', 'B-cells', 'Basophils', 'CD4+ memory T-cells', 'CD4+ naive T-cells',
            'CD4+ T-cells', 'CD4+ Tcm', 'CD4+ Tem', 'CD8+ naive T-cells',
            'CD8+ T-cells', 'CD8+ Tcm', 'CD8+ Tem', 'Class-switched memory B-cells',
            'cDC', 'DC', 'Eosinophils', 'iDC', 'Macrophages', 'Macrophages M1',
            'Macrophages M2', 'Mast cells', 'Memory B-cells', 'Monocytes', 'naive B-cells',
            'Neutrophils', 'NK cells', 'NKT', 'pDC', 'Plasma cells', 'pro B-cells',
            'Tgd cells', 'Th1 cells', 'Th2 cells', 'Tregs', 'CLP', 'CMP'
        ]

        stromal_cell_types = [
            'Adipocytes', 'Astrocytes', 'Chondrocytes', 'Endothelial cells',
            'Epithelial cells', 'Erythrocytes', 'Fibroblasts', 'Hepatocytes',
            'Keratinocytes', 'ly Endothelial cells', 'Megakaryocytes', 'Melanocytes',
            'Mesangial cells', 'MSC', 'mv Endothelial cells', 'Myocytes',
            'Neurons', 'Osteoblast', 'Pericytes', 'Preadipocytes',
            'Sebocytes', 'Smooth muscle'
        ]

        progenitor_cell_types = [
            'GMP', 'HSC', 'MEP', 'MPP', 'Platelets'
        ]

        xcell_scores = [
            'ImmuneScore', 'StromaScore', 'MicroenvironmentScore'
        ]

        # Single feature -> category lookup. Groups are inserted from lowest to
        # highest precedence so that, should a name ever appear in two lists,
        # the Immune > Stromal > Progenitor > Score order wins.
        category_by_feature = {}
        for category, members in (
            ('Score', xcell_scores),
            ('Progenitor', progenitor_cell_types),
            ('Stromal', stromal_cell_types),
            ('Immune', immune_cell_types),
        ):
            category_by_feature.update(dict.fromkeys(members, category))

        # Anything not listed above is reported as 'Other'.
        feature_df['Type'] = feature_df['Feature'].apply(
            lambda feature: category_by_feature.get(feature, 'Other')
        )

        # Plot: Top 20 feature importances with categories.
        # NOTE: sns.set changes seaborn/matplotlib defaults globally, so it also
        # affects figures drawn after this function returns.
        sns.set(style="whitegrid")
        plt.figure(figsize=(10, 6))

        palette = {
            'Immune': 'orange',
            'Stromal': 'slategray',
            'Progenitor': 'purple',
            'Score': 'teal',
            'Other': 'grey'
        }

        ax = sns.barplot(
            data=feature_df,
            x='Importance', y='Feature',
            hue='Type', dodge=False,
            palette=palette
        )

        # Print the exact importance just past the end of each bar; bar i sits
        # at y position i because the frame is already sorted for plotting.
        for position, importance in enumerate(feature_df['Importance']):
            ax.text(importance + 0.001, position, f"{importance:.3f}", va='center', fontsize=9)

        plt.title("Top 20 Predictive Cell Types for TME Cluster (Categorized)", fontsize=14, weight='bold')
        plt.xlabel("Random Forest Feature Importance", fontsize=12)
        plt.ylabel("Cell Type", fontsize=12)
        plt.xticks(fontsize=10)
        plt.yticks(fontsize=10)
        plt.legend(title='Cell Category', loc='lower right')
        plt.tight_layout()

        plt.savefig(os.path.join(GRAPHS_DIR, "TME_RF_Top20_FeatureCategories.png"), dpi=600)
        feature_df.to_csv(os.path.join(RESULTS_DIR, "TME_RF_Top20_Features_Annotated.csv"), index=False)
        plt.close()
        print("Top 20 RF features with categories bar plot and CSV saved.")

        return clf, X_rf  # trained classifier and feature matrix for later use

    clf_rf, X_rf_features = perform_rf_classification(df_annotated)


    # ==============================================================================
    # 2.8 Immune Checkpoint Gene Expression Analysis
    # This section analyzes the expression of immune checkpoint genes
    # across TME clusters and their correlation with immune cell types.
    # ==============================================================================
    print("\n--- 2.8 Immune Checkpoint Gene Expression Analysis ---")

    def analyze_immune_checkpoints(df_expr_data, df_annotated_data, df_xcell_data):
        """Analyze immune checkpoint gene expression across TME clusters.

        For the checkpoint genes found in the expression matrix this:
        saves one violin/strip plot per gene grouped by ``Cluster_Label``,
        runs a Kruskal-Wallis test per gene across clusters (CSV in
        ``RESULTS_DIR``), saves a heatmap of per-cluster mean expression, and
        saves a heatmap of Spearman correlations between gene expression and
        the immune cell columns of the xCell table. Plots go to
        ``PLOTS_CHECKPOINTS_DIR``. None of the inputs is modified.

        Parameters
        ----------
        df_expr_data : pandas.DataFrame
            Expression matrix indexed by gene, with one column per sample.
        df_annotated_data : pandas.DataFrame
            Sample-indexed table with ``TME_Cluster`` and ``Cluster_Label``.
        df_xcell_data : pandas.DataFrame
            Sample-indexed table whose columns include xCell cell types.

        Returns
        -------
        None
        """
        # Function-scope import: the file imports individual names from
        # scipy.stats, so neither `stats` nor `spearmanr` is in scope otherwise.
        from scipy.stats import spearmanr

        # Define Immune Checkpoint Genes
        checkpoint_genes = [
            'PDCD1',    # PD-1
            'CD274',    # PD-L1
            'PDCD1LG2', # PD-L2
            'CTLA4',
            'LAG3',
            'TIGIT',
            'HAVCR2',   # TIM-3
            'ICOS',
            'TNFRSF9',  # 4-1BB
            'BTLA'
        ]

        # Filter genes present in expression data
        valid_genes = [g for g in checkpoint_genes if g in df_expr_data.index]
        print(f"✅ Valid checkpoint genes found in data: {valid_genes}")

        if not valid_genes:
            print("No valid immune checkpoint genes found in expression data. Skipping checkpoint analysis.")
            return

        cluster_columns = ['TME_Cluster', 'Cluster_Label']
        if any(col not in df_annotated_data.columns for col in cluster_columns):
            print("TME cluster columns not found in annotated data. Skipping checkpoint analysis.")
            return

        # Slice the handful of checkpoint genes BEFORE transposing, so the full
        # expression matrix is never copied or transposed.
        # Result: rows = samples, columns = checkpoint genes.
        expr_t_ic = df_expr_data.loc[valid_genes].T

        # Attach cluster assignments for the samples present in both tables.
        common_samples_ic = expr_t_ic.index.intersection(df_annotated_data.index)
        merged_df_ic = expr_t_ic.loc[common_samples_ic].merge(
            df_annotated_data.loc[common_samples_ic, cluster_columns],
            left_index=True, right_index=True
        )

        if merged_df_ic.empty:
            print("No common samples between expression data and annotated data. Skipping checkpoint analysis.")
            return

        # Long format: one row per (sample, gene). Naming the index explicitly
        # keeps this working whether or not the sample index already has a name.
        expr_long_ic = merged_df_ic.rename_axis(index='sample_id').reset_index().melt(
            id_vars=['sample_id', 'TME_Cluster', 'Cluster_Label'],
            var_name='gene', value_name='expression'
        )

        # Per-gene slices, computed once and reused by the plots and the tests.
        frames_by_gene = {
            gene: expr_long_ic[expr_long_ic['gene'] == gene] for gene in valid_genes
        }

        # 3. Plot Violin + Strip Plots for Checkpoint Genes Across TME Clusters
        for gene in valid_genes:
            gene_df = frames_by_gene[gene]
            plt.figure(figsize=(8, 5))
            sns.violinplot(
                data=gene_df,
                x='Cluster_Label', y='expression',
                hue='Cluster_Label', palette='Set2', dodge=False, legend=False, cut=0, inner='box'
            )
            sns.stripplot(
                data=gene_df,
                x='Cluster_Label', y='expression',
                color='black', size=1.5, alpha=0.3
            )
            plt.title(f"{gene} Expression Across TME Clusters", fontsize=14, weight='bold')
            plt.xlabel("TME Cluster", fontsize=12)
            plt.ylabel("Expression (log TPM or normalized)", fontsize=12)
            plt.xticks(rotation=15)
            plt.tight_layout()
            plt.savefig(os.path.join(PLOTS_CHECKPOINTS_DIR, f"{gene}_violin.png"), dpi=300)
            plt.close()
        print(f"✅ Violin plots saved to: {PLOTS_CHECKPOINTS_DIR}/")

        # 4. Kruskal–Wallis Test (Non-parametric ANOVA for multiple groups)
        print("\n🔬 Kruskal–Wallis Test Results (per gene):\n")
        kruskal_results = []
        for gene in valid_genes:
            groups = [
                grp['expression'].values
                for _, grp in frames_by_gene[gene].groupby('Cluster_Label')
                if len(grp) > 0
            ]

            # The test needs at least 2 non-empty groups.
            if len(groups) < 2:
                print(f"Skipping Kruskal-Wallis for {gene}: Not enough groups with data.")
                continue

            try:
                stat, pval = kruskal(*groups)
            except ValueError as err:
                # scipy raises when every value is identical; skip that gene
                # rather than aborting the remaining analysis.
                print(f"Skipping Kruskal-Wallis for {gene}: {err}")
                continue

            print(f"{gene}: H = {stat:.2f}, p = {pval:.4g}")
            kruskal_results.append({'Gene': gene, 'H-statistic': stat, 'p-value': pval})

        # Save to CSV
        if kruskal_results:
            kruskal_path = os.path.join(RESULTS_DIR, "checkpoint_Kruskal_test_results.csv")
            pd.DataFrame(kruskal_results).to_csv(kruskal_path, index=False)
            print(f"\n📄 Kruskal–Wallis results saved to: {kruskal_path}")
        else:
            print("\n⚠️ No Kruskal-Wallis results to save.")

        # 5. Heatmap: Mean Immune Checkpoint Gene Expression by TME Cluster
        # Pivot so each row = cluster, each column = gene, values = avg expression
        mean_expr = expr_long_ic.groupby(['Cluster_Label', 'gene'])['expression'].mean().reset_index()
        heatmap_df_ic = mean_expr.pivot(index='Cluster_Label', columns='gene', values='expression')

        # Z-score each gene (column) across the clusters, for visualization only.
        gene_std = heatmap_df_ic.std()
        if not heatmap_df_ic.empty and gene_std.sum() > 0:
            heatmap_scaled_ic = (heatmap_df_ic - heatmap_df_ic.mean()) / gene_std
        else:
            heatmap_scaled_ic = heatmap_df_ic  # no variability: plot the raw means
            print("Warning: No variability in gene expression for Z-score scaling in checkpoint heatmap.")

        plt.figure(figsize=(10, 5))
        sns.heatmap(
            heatmap_scaled_ic,
            annot=True,
            cmap='coolwarm',
            center=0,
            linewidths=0.5,
            cbar_kws={"label": "Z-score (by gene)"}
        )

        plt.title("Mean Immune Checkpoint Gene Expression by TME Cluster", fontsize=14, weight='bold')
        plt.ylabel("TME Cluster", fontsize=12)
        plt.xlabel("Gene", fontsize=12)
        plt.xticks(rotation=30)
        plt.tight_layout()

        plt.savefig(os.path.join(PLOTS_CHECKPOINTS_DIR, "checkpoint_cluster_heatmap.png"), dpi=300)
        plt.close()
        print("Mean immune checkpoint expression heatmap saved.")

        # 6. Correlation: Immune Checkpoint Expression vs Immune Cell Abundance
        immune_cells = [
            'aDC', 'B-cells', 'Basophils', 'CD4+ memory T-cells', 'CD4+ naive T-cells',
            'CD4+ T-cells', 'CD4+ Tcm', 'CD4+ Tem', 'CD8+ naive T-cells', 'CD8+ T-cells',
            'CD8+ Tcm', 'CD8+ Tem', 'Class-switched memory B-cells', 'cDC', 'DC',
            'Eosinophils', 'iDC', 'Macrophages', 'Macrophages M1', 'Macrophages M2',
            'Mast cells', 'Memory B-cells', 'Monocytes', 'naive B-cells',
            'Neutrophils', 'NK cells', 'NKT', 'pDC', 'Plasma cells', 'pro B-cells',
            'Tgd cells', 'Th1 cells', 'Th2 cells', 'Tregs', 'CLP', 'CMP'
        ]

        # Keep only the immune cell types that are columns of the xCell table.
        immune_cells_valid = [cell for cell in immune_cells if cell in df_xcell_data.columns]

        if not immune_cells_valid:
            print("No valid immune cell types found in xCell data for correlation. Skipping correlation heatmap.")
            return

        xcell_immune = df_xcell_data[immune_cells_valid]

        # Align both tables on the samples they share.
        common_samples_corr = expr_t_ic.index.intersection(xcell_immune.index)
        expr_t_corr = expr_t_ic.loc[common_samples_corr]
        xcell_immune_corr = xcell_immune.loc[common_samples_corr]

        if expr_t_corr.empty or xcell_immune_corr.empty:
            print("Insufficient common samples for correlation analysis. Skipping correlation heatmap.")
            return

        # Spearman correlation matrix (rows = cell types, columns = genes).
        # A pair stays NaN when either variable has no variability; the
        # variability flags are computed once rather than per pair.
        gene_varies = expr_t_corr.std() > 0
        cell_varies = xcell_immune_corr.std() > 0
        corr_matrix = pd.DataFrame(np.nan, index=immune_cells_valid, columns=valid_genes, dtype=float)

        for gene in valid_genes:
            if not gene_varies[gene]:
                continue
            for cell in immune_cells_valid:
                if cell_varies[cell]:
                    rho, _ = spearmanr(expr_t_corr[gene], xcell_immune_corr[cell])
                    corr_matrix.loc[cell, gene] = rho

        # Plot Heatmap
        plt.figure(figsize=(12, 14))
        sns.heatmap(
            corr_matrix,
            cmap="vlag",
            center=0,
            linewidths=0.5,
            cbar_kws={"label": "Spearman Correlation"},
            annot=True,
            fmt=".2f"
        )

        plt.title("Correlation: Immune Checkpoint Expression vs Immune Cell Abundance", fontsize=14, weight='bold')
        plt.xlabel("Immune Checkpoint Genes", fontsize=12)
        plt.ylabel("Immune Cell Types", fontsize=12)
        plt.xticks(rotation=30, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()

        correlation_path = os.path.join(PLOTS_CHECKPOINTS_DIR, "checkpoint_cell_correlation_heatmap.png")
        plt.savefig(correlation_path, dpi=300)
        plt.close()
        print(f"✅ Correlation heatmap saved to: {correlation_path}")

    if df_annotated is not None: # Only run if clustering was successful and df_annotated has cluster info
        analyze_immune_checkpoints(df_expr, df_annotated, df_xcell)
    else:
        print("Skipping Immune Checkpoint Analysis as TME clustering was not successful.")


    # ==============================================================================
    # 2.9 Gene Set Enrichment Analysis (GSEA) for TME Cluster Comparisons
    # This section performs differential expression analysis and GSEA between
    # identified TME clusters.
    # ==============================================================================
    print("\n--- 2.9 Gene Set Enrichment Analysis (GSEA) for TME Cluster Comparisons ---")

    def perform_gsea_cluster_comparisons(df_expr_data, df_annotated_data, gsea_gene_sets_dir):
        if "TME_Cluster" not in df_annotated_data.columns:
            print("TME_Cluster column not found in df_annotated. Skipping GSEA for cluster comparisons.")
            return

        # Ensure expression data and cluster labels are aligned
        # Transpose expr_df_data to have samples as rows, genes as columns
        expr_t_gsea = df_expr_data.T

        common_samples_gsea = expr_t_gsea.index.intersection(df_annotated_data.index)
        merged_gsea = expr_t_gsea.loc[common_samples_gsea].merge(
            df_annotated_data.loc[common_samples_gsea, ['Cluster_Label']],
            left_index=True, right_index=True
        )

        if merged_gsea.empty:
            print("No common samples between expression data and annotated data for GSEA. Skipping.")
            return

        # Define cluster comparisons
        # Get unique cluster labels from the data
        unique_cluster_labels = merged_gsea['Cluster_Label'].unique()
        if len(unique_cluster_labels) < 2:
            print("Less than 2 unique TME clusters found. Cannot perform GSEA comparisons. Skipping.")
            return

        # Create all possible unique pairs for comparison
        cluster_pairs = list(itertools.combinations(unique_cluster_labels, 2))

        os.makedirs(os.path.join(RESULTS_DIR, 'gsea'), exist_ok=True)
        all_top10_gsea_results = []

        # Define gene set file
        gene_sets_file_kegg = os.path.join(gsea_gene_sets_dir, 'c2.cp.kegg.v2023.1.Hs.entrez.gmt') # Example KEGG gene set
        gene_sets_file_hallmark = os.path.join(gsea_gene_sets_dir, 'h.all.v7.5.1.symbols.gmt') # Example Hallmark gene set

        if not os.path.exists(gene_sets_file_kegg) and not os.path.exists(gene_sets_file_hallmark):
            print(f"Warning: Neither KEGG nor Hallmark gene set files found in '{gsea_gene_sets_dir}'. Skipping GSEA.")
            print("Please download gene sets from MSigDB and place them in 'results/gsea_gene_sets/'.")
            return

        gene_sets_to_use = []
        if os.path.exists(gene_sets_file_kegg):
            gene_sets_to_use.append(('KEGG', gene_sets_file_kegg))
        if os.path.exists(gene_sets_file_hallmark):
            gene_sets_to_use.append(('Hallmark', gene_sets_file_hallmark))
        
        if not gene_sets_to_use:
            print("No valid gene set files found. Skipping GSEA.")
            return


        for group1_label, group2_label in cluster_pairs:
            print(f"\n🔍 Running GSEA: {group1_label} vs {group2_label}")

            g1 = merged_gsea[merged_gsea["Cluster_Label"] == group1_label].drop(columns=["Cluster_Label"])
            g2 = merged_gsea[merged_gsea["Cluster_Label"] == group2_label].drop(columns=["Cluster_Label"])

            # Ensure groups are not empty and have more than one sample
            if g1.empty or g2.empty or g1.shape[0] < 2 or g2.shape[0] < 2:
                print(f"Skipping comparison {group1_label} vs {group2_label}: Insufficient samples in one or both groups.")
                continue

            # Add a small constant to avoid log(0)
            g1_mean = g1.mean(numeric_only=True) + 1e-5
            g2_mean = g2.mean(numeric_only=True) + 1e-5
            
            # Filter for genes present in both means
            common_genes = g1_mean.index.intersection(g2_mean.index)
            g1_mean = g1_mean[common_genes]
            g2_mean = g2_mean[common_genes]

            if g1_mean.empty or g2_mean.empty:
                print(f"Skipping comparison {group1_label} vs {group2_label}: No common genes with expression data.")
                continue

            log2fc = np.log2(g1_mean) - np.log2(g2_mean)

            # Prepare data for t-test, ensuring only genes with sufficient non-NaN values and variance are considered
            g1_aligned = g1[common_genes]
            g2_aligned = g2[common_genes]

            # Initialize lists to store results for valid genes
            valid_genes_for_ttest = []
            log2fc_values = []
            p_values = []
            
            # Iterate through genes to perform t-test and collect results
            for gene in common_genes:
                data_g1 = g1_aligned[gene].dropna()
                data_g2 = g2_aligned[gene].dropna()

                # Get standard deviations
                std_g1 = data_g1.std()
                std_g2 = data_g2.std()

                # Explicitly check if std_g1 or std_g2 are Series and extract scalar if so
                if isinstance(std_g1, pd.Series):
                    std_g1 = std_g1.iloc[0] if not std_g1.empty else np.nan
                if isinstance(std_g2, pd.Series):
                    std_g2 = std_g2.iloc[0] if not std_g2.empty else np.nan

                # Only perform t-test if both groups have at least 2 non-NaN values and non-zero variance
                if len(data_g1) >= 2 and len(data_g2) >= 2 and pd.notna(std_g1) and std_g1 > 0 and pd.notna(std_g2) and std_g2 > 0:
                    try:
                        t, p = ttest_ind(data_g1, data_g2, equal_var=False, nan_policy='omit')
                        valid_genes_for_ttest.append(gene)
                        log2fc_values.append(log2fc.loc[gene]) # Get log2fc for this specific gene
                        p_values.append(p)
                    except Exception as e:
                        # print(f"Warning: t-test failed for gene {gene} in {group1_label} vs {group2_label}: {e}")
                        pass # Skip this gene if t-test fails

            # Create a DataFrame from these collected lists
            if not valid_genes_for_ttest:
                print(f"Skipping comparison {group1_label} vs {group2_label}: No valid genes for t-test after filtering.")
                continue

            ranked_genes_df = pd.DataFrame({
                "gene": valid_genes_for_ttest,
                "log2fc": pd.to_numeric(log2fc_values, errors='coerce'), # Ensure numeric type, coerce errors to NaN
                "pval": pd.to_numeric(p_values, errors='coerce') # Ensure numeric type, coerce errors to NaN
            })

            # Explicitly handle inf/-inf and then drop NaNs
            # Using np.isinf directly on the Series and np.where for replacement
            # Ensure to operate on the .values to get a numpy array for np.isinf
            ranked_genes_df['log2fc'] = np.where(np.isinf(ranked_genes_df['log2fc'].values), np.nan, ranked_genes_df['log2fc'].values)
            ranked_genes_df['pval'] = np.where(np.isinf(ranked_genes_df['pval'].values), np.nan, ranked_genes_df['pval'].values)
            
            ranked_genes = ranked_genes_df.dropna()
            ranked_genes = ranked_genes.drop_duplicates(subset="gene").sort_values("log2fc", ascending=False)

            if ranked_genes.empty:
                print(f"Skipping comparison {group1_label} vs {group2_label}: No valid ranked genes after filtering.")
                continue

            comparison_name = f"{group1_label.replace(' ', '_')}_vs_{group2_label.replace(' ', '_')}"
            rank_file = os.path.join(RESULTS_DIR, 'gsea', f"ranks_{comparison_name}.rnk")

            ranked_genes['gene'] = ranked_genes['gene'].astype(str).str.upper()
            ranked_genes[["gene", "log2fc"]].to_csv(rank_file, sep="\t", index=False, header=False)
            print(f"Ranked gene list saved to {rank_file}")

            for gs_name, gs_file in gene_sets_to_use:
                try:
                    pre_res = gp.prerank(
                        rnk=rank_file,
                        gene_sets=gs_file,
                        outdir=os.path.join(RESULTS_DIR, 'gsea', comparison_name, gs_name), # Subfolder for each gene set
                        format='png',
                        min_size=5,
                        max_size=2000,
                        permutation_num=100,
                        seed=42,
                        verbose=False # Set to True for more detailed output
                    )

                    if pre_res.res2d is not None and not pre_res.res2d.empty:
                        # Rename columns to lowercase expected by plotting code
                        res_df = pre_res.res2d.reset_index(drop=True)
                        res_df = res_df.rename(columns={'Term': 'Pathway', 'NES': 'nes', 'FDR q-val': 'fdr'})

                        if 'nes' in res_df.columns and 'fdr' in res_df.columns:
                            res_top10 = res_df.sort_values(by='nes', ascending=False).head(10).copy() # Use .copy() to avoid SettingWithCopyWarning
                            res_top10['comparison'] = comparison_name
                            res_top10['gene_set'] = gs_name
                            all_top10_gsea_results.append(res_top10)

                            # Plot barplot for top pathways - REMOVED GENE SET NAME FROM TITLE
                            plt.figure(figsize=(12, max(6, 0.5 * len(res_top10))))
                            sns.barplot(
                                data=res_top10,
                                y='Pathway',
                                x='nes',
                                hue=res_top10['fdr'] < 0.05,
                                palette={True: 'green', False: 'gray'},
                                dodge=False
                            )
                            plt.axvline(0, color='black', linestyle='--', alpha=0.7) # Added alpha for better visibility
                            plt.title(f"Top Pathways: {comparison_name}", fontsize=14, weight='bold') # Modified title
                            plt.xlabel("Normalized Enrichment Score (NES)")
                            plt.ylabel("Pathway Term")
                            plt.legend(title='FDR < 0.05', loc='lower right')
                            plt.tight_layout()
                            plt.savefig(os.path.join(RESULTS_DIR, 'gsea', comparison_name, gs_name, f"{comparison_name}_{gs_name}_top10_barplot.png"), dpi=300)
                            plt.close()
                            print(f"Top 10 {gs_name} pathways bar plot saved for {comparison_name}.")
                        else:
                            print(f"⚠️ Columns 'nes' or 'fdr' missing for {gs_name} in {comparison_name}. Skipping plot.")
                    else:
                        print(f"⚠️ No enrichment results found for {gs_name} in {comparison_name}. Skipping.")

                except Exception as e:
                    print(f"❌ GSEA failed for {gs_name} in {comparison_name}: {e}")
                    # import traceback
                    # traceback.print_exc()

        # Save Summary Table
        if all_top10_gsea_results:
            summary_df_gsea = pd.concat(all_top10_gsea_results)
            summary_df_gsea.to_csv(os.path.join(RESULTS_DIR, 'gsea', "top10_pathways_all_comparisons_summary.csv"), index=False)
            print("\n✅ GSEA completed successfully.")
            print(f"📁 Summary saved: {os.path.join(RESULTS_DIR, 'gsea', 'top10_pathways_all_comparisons_summary.csv')}")

            # Plot Heatmap for NES scores across comparisons (only for significant pathways)
            sig_pathways_for_heatmap = summary_df_gsea[summary_df_gsea['fdr'] < 0.05].copy()

            if not sig_pathways_for_heatmap.empty:
                # Create a combined comparison and gene set column for pivoting
                sig_pathways_for_heatmap['comparison_gene_set'] = sig_pathways_for_heatmap['comparison'] + " (" + sig_pathways_for_heatmap['gene_set'] + ")"
                
                pivot_nes = sig_pathways_for_heatmap.pivot_table(index='Pathway', columns='comparison_gene_set', values='nes').fillna(0)

                # Sort pathways by their max NES score across all comparisons (descending)
                if not pivot_nes.empty:
                    pivot_nes['max_nes'] = pivot_nes.max(axis=1)
                    pivot_nes = pivot_nes.sort_values('max_nes', ascending=False).drop(columns='max_nes')

                    print("\nTop 20 significant pathways across all comparisons (NES scores):")
                    print(pivot_nes.head(20))

                    plt.figure(figsize=(12, max(6, 0.4 * len(pivot_nes))))
                    sns.heatmap(pivot_nes, annot=True, cmap='RdBu_r', center=0, linewidths=0.5, fmt=".2f")
                    plt.title("NES Scores of Significant Pathways Across Comparisons", fontsize=16, weight='bold')
                    plt.ylabel("Pathways")
                    plt.xlabel("Cluster Comparisons") # Modified x-axis label
                    plt.xticks(rotation=45, ha='right')
                    plt.tight_layout()

                    plt.savefig(os.path.join(RESULTS_DIR, 'gsea', "significant_pathways_comparison_heatmap.png"), dpi=300)
                    plt.close()
                    print("\n✅ Heatmap of significant pathways NES scores plotted and saved.")
                    pivot_nes.to_csv(os.path.join(RESULTS_DIR, 'gsea', "significant_pathways_comparison.csv"))
                    print("📄 Pivot table of significant pathways NES scores saved.")
                else:
                    print("No significant pathways to plot heatmap for after filtering.")
            else:
                print("\n⚠️ No significant pathways (FDR < 0.05) found across any comparisons to plot heatmap.")
        else:
            print("\n⚠️ No valid GSEA results were found. Try adjusting gene sets or min_size/max_size.")

    # Run the GSEA step defined just above. Its visible tail writes the top-10
    # pathway summary CSV and, when any pathway has FDR < 0.05, the NES heatmap
    # and pivot table under RESULTS_DIR/gsea.
    # NOTE(review): the arguments are presumably the gene-expression matrix,
    # the cluster-annotated sample table, and the directory holding the .gmt
    # gene-set files -- confirm against the function signature.
    perform_gsea_cluster_comparisons(df_expr, df_annotated, GSEA_GENE_SETS_DIR)


    # Final status message; printed after the GSEA call returns.
    print("\nPhase 2: Tumor Microenvironment Analysis complete.")


Starting Phase 2: Tumor Microenvironment Analysis...

--- 2.1 Data Loading and Initial Exploration ---
Shape of xCell matrix (samples x cell types): (11069, 67)
First 5 rows of xCell data:
                      aDC    Adipocytes    Astrocytes   B-cells  Basophils  \
TCGA-OR-A5J1-01  0.165006  2.785005e-18  0.000000e+00  0.006513   0.051572   
TCGA-OR-A5J2-01  0.067948  0.000000e+00  0.000000e+00  0.000000   0.027080   
TCGA-OR-A5J3-01  0.021901  3.827430e-19  5.143443e-18  0.023271   0.035671   
TCGA-OR-A5J5-01  0.052409  0.000000e+00  0.000000e+00  0.052157   0.090499   
TCGA-OR-A5J6-01  0.066328  0.000000e+00  0.000000e+00  0.038889   0.024355   

                 CD4+ memory T-cells  CD4+ naive T-cells  CD4+ T-cells  \
TCGA-OR-A5J1-01         2.837257e-02            0.059759      0.034159   
TCGA-OR-A5J2-01         2.390861e-02            0.038295      0.041047   
TCGA-OR-A5J3-01         9.020576e-22            0.018377      0.024566   
TCGA-OR-A5J5-01         0.000000e+00          


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=mean_cell_abundance.head(20).values, y=mean_cell_abundance.head(20).index, palette='viridis')


Overall cell abundance plot saved.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='cancer_type', y=score, data=df_tme_data, palette='coolwarm')


ImmuneScore distribution plot saved.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='cancer_type', y=score, data=df_tme_data, palette='coolwarm')


StromaScore distribution plot saved.



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='cancer_type', y=score, data=df_tme_data, palette='coolwarm')


MicroenvironmentScore distribution plot saved.
Heatmap of top variable cell types saved.

--- 2.3 Statistical Tests: ANOVA for TME Scores and Cell Types ---

Levene's Test for ImmuneScore: Statistic=43.267, p=3.293e-254
ANOVA for 'ImmuneScore': F-statistic=266.587, p-value=0.000e+00
  Significant differences found for ImmuneScore (p < 0.05). Performing Tukey HSD post-hoc test...
                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer   0.0108    1.0  -0.061  0.0826  False
               acute myeloid leukemia          bladder urothelial carcinoma   0.0079    1.0 -0.0397  0.0556  False
               acute myeloid leukemia      

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer   0.1385    0.0  0.0937  0.1833   True
               acute myeloid leukemia          bladder urothelial carcinoma    0.097    0.0  0.0672  0.1267   True
               acute myeloid leukemia              brain lower grade glioma   0.0804    0.0  0.0515  0.1093   True
               acute myeloid leukemia             breast invasive carcinoma   0.1601    0.0  0.1333   0.187   True
               acute myeloid leukemia        cervical & endocervical cancer   0.0473    0.0  0.0159  0.0786   True
               acute myeloid leukemia                    cholangiocarcinoma   0.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer   0.1493    0.0  0.0637   0.235   True
               acute myeloid leukemia          bladder urothelial carcinoma   0.1049    0.0   0.048  0.1617   True
               acute myeloid leukemia              brain lower grade glioma   0.0129    1.0 -0.0424  0.0681  False
               acute myeloid leukemia             breast invasive carcinoma   0.1661    0.0  0.1149  0.2173   True
               acute myeloid leukemia        cervical & endocervical cancer   0.0628 0.0256  0.0029  0.1226   True
               acute myeloid leukemia                    cholangiocarcinoma   0.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer  -1.7915    0.0 -1.8981 -1.6849   True
               acute myeloid leukemia          bladder urothelial carcinoma  -1.7986    0.0 -1.8693 -1.7278   True
               acute myeloid leukemia              brain lower grade glioma   -1.806    0.0 -1.8748 -1.7372   True
               acute myeloid leukemia             breast invasive carcinoma  -1.7354    0.0 -1.7992 -1.6716   True
               acute myeloid leukemia        cervical & endocervical cancer  -1.7561    0.0 -1.8306 -1.6816   True
               acute myeloid leukemia                    cholangiocarcinoma  -1.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer  -0.8442    0.0 -0.9491 -0.7394   True
               acute myeloid leukemia          bladder urothelial carcinoma   -0.936    0.0 -1.0055 -0.8664   True
               acute myeloid leukemia              brain lower grade glioma  -1.0899    0.0 -1.1575 -1.0223   True
               acute myeloid leukemia             breast invasive carcinoma  -0.9272    0.0 -0.9899 -0.8645   True
               acute myeloid leukemia        cervical & endocervical cancer  -1.0598    0.0  -1.133 -0.9865   True
               acute myeloid leukemia                    cholangiocarcinoma  -0.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer   0.0274    1.0 -0.0438  0.0986  False
               acute myeloid leukemia          bladder urothelial carcinoma    0.749    0.0  0.7018  0.7963   True
               acute myeloid leukemia              brain lower grade glioma  -0.0049    1.0 -0.0508   0.041  False
               acute myeloid leukemia             breast invasive carcinoma   0.4786    0.0   0.436  0.5212   True
               acute myeloid leukemia        cervical & endocervical cancer   0.8598    0.0    0.81  0.9095   True
               acute myeloid leukemia                    cholangiocarcinoma   0.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer  -0.3722    0.0 -0.4597 -0.2847   True
               acute myeloid leukemia          bladder urothelial carcinoma  -0.2312    0.0 -0.2892 -0.1731   True
               acute myeloid leukemia              brain lower grade glioma  -0.4734    0.0 -0.5298  -0.417   True
               acute myeloid leukemia             breast invasive carcinoma  -0.3206    0.0 -0.3729 -0.2682   True
               acute myeloid leukemia        cervical & endocervical cancer  -0.3113    0.0 -0.3724 -0.2502   True
               acute myeloid leukemia                    cholangiocarcinoma  -0.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer   0.2003    0.0  0.1116   0.289   True
               acute myeloid leukemia          bladder urothelial carcinoma    0.165    0.0  0.1062  0.2239   True
               acute myeloid leukemia              brain lower grade glioma   0.2304    0.0  0.1732  0.2876   True
               acute myeloid leukemia             breast invasive carcinoma   0.3072    0.0  0.2541  0.3603   True
               acute myeloid leukemia        cervical & endocervical cancer   0.1097    0.0  0.0477  0.1717   True
               acute myeloid leukemia                    cholangiocarcinoma   0.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer  -0.6121    0.0 -0.6814 -0.5427   True
               acute myeloid leukemia          bladder urothelial carcinoma  -0.6132    0.0 -0.6592 -0.5672   True
               acute myeloid leukemia              brain lower grade glioma  -0.6188    0.0 -0.6636 -0.5741   True
               acute myeloid leukemia             breast invasive carcinoma  -0.6168    0.0 -0.6583 -0.5753   True
               acute myeloid leukemia        cervical & endocervical cancer  -0.6076    0.0  -0.656 -0.5591   True
               acute myeloid leukemia                    cholangiocarcinoma  -0.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer   0.0436 0.8414 -0.0235  0.1107  False
               acute myeloid leukemia          bladder urothelial carcinoma   0.1568    0.0  0.1123  0.2014   True
               acute myeloid leukemia              brain lower grade glioma  -0.0067    1.0 -0.0499  0.0366  False
               acute myeloid leukemia             breast invasive carcinoma   0.0823    0.0  0.0421  0.1224   True
               acute myeloid leukemia        cervical & endocervical cancer   0.2475    0.0  0.2006  0.2944   True
               acute myeloid leukemia                    cholangiocarcinoma   0.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer   -0.081   0.02  -0.157 -0.0051   True
               acute myeloid leukemia          bladder urothelial carcinoma  -0.0149    1.0 -0.0653  0.0356  False
               acute myeloid leukemia              brain lower grade glioma  -0.1051    0.0 -0.1541 -0.0561   True
               acute myeloid leukemia             breast invasive carcinoma   0.0182 0.9999 -0.0273  0.0637  False
               acute myeloid leukemia        cervical & endocervical cancer   0.0342 0.8542 -0.0189  0.0873  False
               acute myeloid leukemia                    cholangiocarcinoma  -0.

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


                               Multiple Comparison of Means - Tukey HSD, FWER=0.05                                
                group1                                group2                meandiff p-adj   lower   upper  reject
------------------------------------------------------------------------------------------------------------------
               acute myeloid leukemia                 adrenocortical cancer   0.2289    0.0  0.1669  0.2909   True
               acute myeloid leukemia          bladder urothelial carcinoma   0.2802    0.0  0.2391  0.3214   True
               acute myeloid leukemia              brain lower grade glioma   0.1447    0.0  0.1048  0.1847   True
               acute myeloid leukemia             breast invasive carcinoma   0.1669    0.0  0.1298   0.204   True
               acute myeloid leukemia        cervical & endocervical cancer   0.3467    0.0  0.3034    0.39   True
               acute myeloid leukemia                    cholangiocarcinoma   0.

  warn(


TME UMAP plot saved with improved aesthetics.

--- 2.5 Additional TME Visualizations (Violin, Variability, Presence) ---
Violin plot of all cell types saved.
Bar plot of top 10 variable cell types saved.
Cell type presence count plot saved.
Heatmap of TME landscape saved.

--- 2.6 TME Clustering and Visualization (KMeans & UMAP) ---
TME Clusters identified and labeled.
Cluster labels saved to .\cluster_labels.csv


  warn(


UMAP plot of TME clusters saved.
Heatmap of top marker cell types per cluster saved.

--- 2.7 Random Forest Classification for TME Cluster Prediction ---
Random Forest Classifier trained.

Random Forest Classification Report:
Accuracy: 0.9927667269439421
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1887
           1       0.99      0.99      0.99       164
           2       0.98      0.93      0.96       161

    accuracy                           0.99      2212
   macro avg       0.99      0.97      0.98      2212
weighted avg       0.99      0.99      0.99      2212




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=feature_df, x='Importance', y='Feature', palette='viridis')


Top 20 RF features bar plot saved.
Top 20 RF features with categories bar plot and CSV saved.

--- 2.8 Immune Checkpoint Gene Expression Analysis ---
✅ Valid checkpoint genes found in data: ['PDCD1', 'CD274', 'PDCD1LG2', 'CTLA4', 'LAG3', 'TIGIT', 'HAVCR2', 'ICOS', 'TNFRSF9', 'BTLA']
✅ Violin plots saved to: plots\checkpoints/

🔬 Kruskal–Wallis Test Results (per gene):

PDCD1: H = 2046.84, p = 0
CD274: H = 1172.29, p = 2.757e-255
PDCD1LG2: H = 1583.53, p = 0
CTLA4: H = 1411.53, p = 3.086e-307
LAG3: H = 1374.84, p = 2.863e-299
TIGIT: H = 1666.28, p = 0
HAVCR2: H = 994.01, p = 1.427e-216
ICOS: H = 1912.07, p = 0
TNFRSF9: H = 1312.43, p = 1.02e-285
BTLA: H = 1854.22, p = 0

📄 Kruskal–Wallis results saved to: results\checkpoint_Kruskal_test_results.csv
Mean immune checkpoint expression heatmap saved.
✅ Correlation heatmap saved to: plots\checkpoints\checkpoint_cell_correlation_heatmap.png

--- 2.9 Gene Set Enrichment Analysis (GSEA) for TME Cluster Comparisons ---

🔍 Running GSEA: Stromal-I

  result = getattr(ufunc, method)(*inputs, **kwargs)


Ranked gene list saved to results\gsea\ranks_Stromal-Immune_Mixed_vs_Immune-active.rnk
Top 10 Hallmark pathways bar plot saved for Stromal-Immune_Mixed_vs_Immune-active.

🔍 Running GSEA: Stromal-Immune Mixed vs Stromal-rich


  result = getattr(ufunc, method)(*inputs, **kwargs)


Ranked gene list saved to results\gsea\ranks_Stromal-Immune_Mixed_vs_Stromal-rich.rnk
Top 10 Hallmark pathways bar plot saved for Stromal-Immune_Mixed_vs_Stromal-rich.

🔍 Running GSEA: Immune-active vs Stromal-rich


  result = getattr(ufunc, method)(*inputs, **kwargs)


Ranked gene list saved to results\gsea\ranks_Immune-active_vs_Stromal-rich.rnk
Top 10 Hallmark pathways bar plot saved for Immune-active_vs_Stromal-rich.

✅ GSEA completed successfully.
📁 Summary saved: results\gsea\top10_pathways_all_comparisons_summary.csv

Top 20 significant pathways across all comparisons (NES scores):
comparison_gene_set                         Immune-active_vs_Stromal-rich (Hallmark)  \
Pathway                                                                                
HALLMARK_INTERFERON_GAMMA_RESPONSE                                          2.048418   
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION                                  1.832252   
HALLMARK_PANCREAS_BETA_CELLS                                                0.000000   
HALLMARK_ALLOGRAFT_REJECTION                                                1.871149   
HALLMARK_INTERFERON_ALPHA_RESPONSE                                          1.804994   
HALLMARK_IL2_STAT5_SIGNALING                               

  pivot_nes = sig_pathways_for_heatmap.pivot_table(index='Pathway', columns='comparison_gene_set', values='nes').fillna(0)



✅ Heatmap of significant pathways NES scores plotted and saved.
📄 Pivot table of significant pathways NES scores saved.

Phase 2: Tumor Microenvironment Analysis complete.
