# In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

# NOTE(review): a function with this same name is defined again later in this
# file; when the file is run top to bottom, that later definition is the one in effect.
def load_stage_labels(cancer: str, labels_dir: str):
    """
    Return the stage labels of one cancer type for the train and test splits.

    Reads "train_stage.csv", "train_cancer_type.csv", "test_stage.csv" and
    "test_cancer_type.csv" from `labels_dir`, using the first column of each
    file as the index.

    Args:
        cancer: Value to match against the 'cancertype' column.
        labels_dir: Directory containing the four label files.

    Returns:
        A `(train_labels, test_labels)` tuple of Series holding the 'stage'
        column for rows whose 'cancertype' equals `cancer`, with missing
        stages removed.
    """
    # Read every label table up front, in a fixed order.
    table_names = ("train_stage", "train_cancer_type", "test_stage", "test_cancer_type")
    tables = {
        name: pd.read_csv(os.path.join(labels_dir, f"{name}.csv"), index_col=0)
        for name in table_names
    }

    # Attach the cancer type to every stage row by joining on the index.
    merged_train = tables["train_stage"].join(tables["train_cancer_type"])
    merged_test = tables["test_stage"].join(tables["test_cancer_type"])

    # Keep only the rows that belong to the requested cancer type.
    train_rows = merged_train[merged_train.cancertype == cancer]
    test_rows = merged_test[merged_test.cancertype == cancer]

    # The label is the 'stage' column; rows without a stage are discarded.
    return train_rows['stage'].dropna(), test_rows['stage'].dropna()

def load_feature_data_for_classification(data_dir: str):
    """
    Load every training feature table and the test feature table.

    Args:
        data_dir: Directory containing "real_data_train.csv",
            "imputed_knn.csv", "imputed_multi.csv", "imputed_coherent.csv"
            and "test_data.csv".

    Returns:
        A `(train_data_variants, test_data)` tuple. `train_data_variants` maps
        the variant names 'original', 'knn', 'multi' and 'coherent' to
        DataFrames; `test_data` is a DataFrame. Every table uses the first
        CSV column as its index.
    """
    # Variant name -> file name. Plain strings: none of the names needs formatting.
    train_files = {
        'original': "real_data_train.csv",
        'knn': "imputed_knn.csv",
        'multi': "imputed_multi.csv",
        'coherent': "imputed_coherent.csv",
    }
    train_data_variants = {
        variant: pd.read_csv(os.path.join(data_dir, filename), index_col=0)
        for variant, filename in train_files.items()
    }
    test_data = pd.read_csv(os.path.join(data_dir, "test_data.csv"), index_col=0)
    return train_data_variants, test_data

# NOTE(review): a later definition of the same name in this file (which also
# reports train/test sizes) replaces this one when the file is run top to bottom.
def run_classification_pipeline(cancer: str, labels_dir: str, data_dir: str):
    """
    Train and evaluate one random-forest stage classifier per training-data variant.

    Labels come from `load_stage_labels`, features from
    `load_feature_data_for_classification`. Every variant is trained on the
    same samples and evaluated on the same test samples.

    Args:
        cancer: Cancer type whose samples are used.
        labels_dir: Directory with the stage / cancer-type label files.
        data_dir: Directory with the feature tables.

    Returns:
        A DataFrame with one row per data variant and the columns
        'data_variant', 'accuracy', 'balanced_accuracy' and 'macro_f1_score',
        or None when fewer than 20 training samples, fewer than 10 test
        samples, or fewer than 2 distinct training stages remain after
        aligning labels with features.
    """
    train_labels, test_labels = load_stage_labels(cancer, labels_dir)
    train_data_variants, test_data = load_feature_data_for_classification(data_dir)

    # Keep only samples present in the labels AND in every training variant.
    # Intersecting with a single variant would make `.loc` raise KeyError as
    # soon as another variant lacks one of those samples.
    train_common_idx = train_labels.index
    for variant_df in train_data_variants.values():
        train_common_idx = train_common_idx.intersection(variant_df.index)
    test_common_idx = test_labels.index.intersection(test_data.index)

    # Sorting by index keeps labels and feature rows in the same order.
    y_train = train_labels.loc[train_common_idx].sort_index()
    y_test = test_labels.loc[test_common_idx].sort_index()

    # Too little data (or a single class) gives no meaningful evaluation.
    if len(y_train) < 20 or len(y_test) < 10 or y_train.nunique() < 2:
        return None

    X_test = test_data.loc[test_common_idx].sort_index()
    X_train_variants = {
        name: df.loc[train_common_idx].sort_index()
        for name, df in train_data_variants.items()
    }

    results = []
    for variant_name, X_train in X_train_variants.items():
        # Fixed random_state so the variants are compared under the same seed.
        model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        results.append({
            'data_variant': variant_name,
            'accuracy': accuracy_score(y_test, y_pred),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
            'macro_f1_score': report['macro avg']['f1-score'],
        })
    return pd.DataFrame(results)

# NOTE(review): a later definition of the same name in this file replaces this
# one when the file is run top to bottom.
def run_for_all_cancers_classification(labels_dir: str, data_dir: str):
    """
    Run the classification pipeline for every cancer type and collect the results.

    Cancer types are discovered from the 'cancertype' column of
    "train_cancer_type.csv" in `labels_dir`. Rows without a cancer type are
    ignored; otherwise sorting a mix of strings and NaN would raise TypeError.

    Args:
        labels_dir: Directory with the label files.
        data_dir: Directory with the feature tables.

    Returns:
        A DataFrame with the columns 'cancer', 'data_variant', 'accuracy',
        'balanced_accuracy' and 'macro_f1_score', or None when no cancer type
        produced results.
    """
    train_type_path = os.path.join(labels_dir, "train_cancer_type.csv")
    all_cancers = pd.read_csv(train_type_path)['cancertype'].dropna().unique()
    print(f"Discovered {len(all_cancers)} cancer types.")

    all_results_dfs = []
    for cancer in sorted(all_cancers):
        print(f"\n----- Running classification for: {cancer} -----")
        # Best effort per cancer type: a failure is reported and the loop
        # continues with the remaining cancer types.
        try:
            results_df = run_classification_pipeline(cancer, labels_dir, data_dir)
            if results_df is not None:
                print(f"Successfully completed analysis for {cancer}.")
                results_df['cancer'] = cancer
                all_results_dfs.append(results_df)
            else:
                print(f"Skipping {cancer} due to insufficient data or classes.")
        except Exception as e:
            print(f"!!! An error occurred while processing {cancer}: {e}")

    if not all_results_dfs:
        print("No results were generated.")
        return None

    final_summary = pd.concat(all_results_dfs, ignore_index=True)
    cols = ['cancer', 'data_variant', 'accuracy', 'balanced_accuracy', 'macro_f1_score']
    return final_summary.reindex(columns=cols)

# --- Main Execution ---
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

def load_stage_labels(cancer: str, labels_dir: str):
    """
    Load the train and test stage labels belonging to a single cancer type.

    For each split, the stage table ("<split>_stage.csv") is joined with the
    cancer-type table ("<split>_cancer_type.csv") on the index taken from the
    first CSV column. The rows whose 'cancertype' equals `cancer` are kept.

    Args:
        cancer: Cancer type to select.
        labels_dir: Directory holding the four label CSV files.

    Returns:
        `(train_labels, test_labels)`: two Series with the 'stage' values of
        the selected rows, excluding rows whose stage is missing.
    """
    def _read_indexed(filename):
        # The first CSV column becomes the index used for the join below.
        return pd.read_csv(os.path.join(labels_dir, filename), index_col=0)

    stage_train = _read_indexed("train_stage.csv")
    type_train = _read_indexed("train_cancer_type.csv")
    stage_test = _read_indexed("test_stage.csv")
    type_test = _read_indexed("test_cancer_type.csv")

    # One table per split with both the 'stage' and 'cancertype' columns.
    joined_splits = [stage_train.join(type_train), stage_test.join(type_test)]

    # Select the requested cancer type, then drop rows without a stage label.
    train_labels, test_labels = (
        split[split.cancertype == cancer]['stage'].dropna()
        for split in joined_splits
    )
    return train_labels, test_labels

def load_feature_data_for_classification(data_dir: str):
    """
    Load all training feature sets (original plus imputed) and the test set.

    Args:
        data_dir: Directory containing "real_data_train.csv",
            "imputed_knn.csv", "imputed_multi.csv", "imputed_coherent.csv"
            and "test_data.csv".

    Returns:
        `(train_data_variants, test_data)`, where `train_data_variants` is a
        dict keyed by 'original', 'knn', 'multi' and 'coherent' (in that
        order) and `test_data` is a DataFrame. The first CSV column of every
        file is used as the index.
    """
    def _read_table(filename: str) -> pd.DataFrame:
        return pd.read_csv(os.path.join(data_dir, filename), index_col=0)

    # File names are constants, so plain string literals are used (no f-strings).
    variant_files = (
        ('original', "real_data_train.csv"),
        ('knn', "imputed_knn.csv"),
        ('multi', "imputed_multi.csv"),
        ('coherent', "imputed_coherent.csv"),
    )
    train_data_variants = {name: _read_table(filename) for name, filename in variant_files}
    test_data = _read_table("test_data.csv")
    return train_data_variants, test_data

def run_classification_pipeline(cancer: str, labels_dir: str, data_dir: str):
    """
    Train and evaluate one random-forest stage classifier per training-data variant.

    Labels come from `load_stage_labels`, features from
    `load_feature_data_for_classification`. Every variant is trained on the
    same samples and evaluated on the same test samples, so the reported
    sizes are identical across variants.

    Args:
        cancer: Cancer type whose samples are used.
        labels_dir: Directory with the stage / cancer-type label files.
        data_dir: Directory with the feature tables.

    Returns:
        A DataFrame with one row per data variant and the columns
        'data_variant', 'train_size', 'test_size', 'accuracy',
        'balanced_accuracy' and 'macro_f1_score', or None when fewer than 20
        training samples, fewer than 10 test samples, or fewer than 2 distinct
        training stages remain after aligning labels with features.
    """
    train_labels, test_labels = load_stage_labels(cancer, labels_dir)
    train_data_variants, test_data = load_feature_data_for_classification(data_dir)

    # Keep only samples present in the labels AND in every training variant.
    # Intersecting with a single variant would make `.loc` raise KeyError as
    # soon as another variant lacks one of those samples, and the caller would
    # then lose the whole cancer type.
    train_common_idx = train_labels.index
    for variant_df in train_data_variants.values():
        train_common_idx = train_common_idx.intersection(variant_df.index)
    test_common_idx = test_labels.index.intersection(test_data.index)

    # Sorting by index keeps labels and feature rows in the same order.
    y_train = train_labels.loc[train_common_idx].sort_index()
    y_test = test_labels.loc[test_common_idx].sort_index()

    # Too little data (or a single class) gives no meaningful evaluation.
    if len(y_train) < 20 or len(y_test) < 10 or y_train.nunique() < 2:
        return None

    X_test = test_data.loc[test_common_idx].sort_index()
    X_train_variants = {
        name: df.loc[train_common_idx].sort_index()
        for name, df in train_data_variants.items()
    }

    # Sample counts after alignment, reported alongside the metrics.
    train_size = len(y_train)
    test_size = len(y_test)

    results = []
    for variant_name, X_train in X_train_variants.items():
        # Fixed random_state so the variants are compared under the same seed.
        model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        results.append({
            'data_variant': variant_name,
            'train_size': train_size,
            'test_size': test_size,
            'accuracy': accuracy_score(y_test, y_pred),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
            'macro_f1_score': report['macro avg']['f1-score'],
        })

    return pd.DataFrame(results)

def run_for_all_cancers_classification(labels_dir: str, data_dir: str):
    """
    Run the classification pipeline for every cancer type and collect the results.

    Cancer types are discovered from the 'cancertype' column of
    "train_cancer_type.csv" in `labels_dir`. Rows without a cancer type are
    ignored; otherwise sorting a mix of strings and NaN would raise TypeError.

    Args:
        labels_dir: Directory with the label files.
        data_dir: Directory with the feature tables.

    Returns:
        A DataFrame with the columns 'cancer', 'data_variant', 'train_size',
        'test_size', 'accuracy', 'balanced_accuracy' and 'macro_f1_score', or
        None when no cancer type produced results.
    """
    train_type_path = os.path.join(labels_dir, "train_cancer_type.csv")
    all_cancers = pd.read_csv(train_type_path)['cancertype'].dropna().unique()
    print(f"Discovered {len(all_cancers)} cancer types.")

    all_results_dfs = []
    for cancer in sorted(all_cancers):
        print(f"\n----- Running classification for: {cancer} -----")
        # Best effort per cancer type: a failure is reported and the loop
        # continues with the remaining cancer types.
        try:
            results_df = run_classification_pipeline(cancer, labels_dir, data_dir)
            if results_df is not None:
                print(f"Successfully completed analysis for {cancer}.")
                results_df['cancer'] = cancer
                all_results_dfs.append(results_df)
            else:
                print(f"Skipping {cancer} due to insufficient data or classes.")
        except Exception as e:
            print(f"!!! An error occurred while processing {cancer}: {e}")

    if not all_results_dfs:
        print("No results were generated.")
        return None

    final_summary = pd.concat(all_results_dfs, ignore_index=True)
    # Fixed column order, with the sample sizes next to the variant name.
    cols = ['cancer', 'data_variant', 'train_size', 'test_size', 'accuracy', 'balanced_accuracy', 'macro_f1_score']
    return final_summary.reindex(columns=cols)

# --- Main Execution ---
# The pipeline is run exactly once: every run re-reads all CSV files and
# re-trains one model per cancer type and data variant.
if __name__ == '__main__':
    LABELS_DIR = "../../datasets_TCGA/downstream_labels"
    DATA_DIR = "./data_task_02"

    classification_results = run_for_all_cancers_classification(
        labels_dir=LABELS_DIR,
        data_dir=DATA_DIR
    )

    # The pipeline returns None when no cancer type produced results.
    if classification_results is not None:
        print("\n\n===== FINAL SUMMARY OF CLASSIFICATION RESULTS =====")
        # Group rows by cancer, best balanced accuracy first within each cancer.
        sorted_results = classification_results.sort_values(
            by=['cancer', 'balanced_accuracy'], ascending=[True, False]
        )
        print(sorted_results.to_string())
        print("\n\n===== BEST DATA VARIANT PER CANCER (based on Balanced Accuracy) =====")
        # idxmax yields, per cancer, the row label of its highest balanced accuracy.
        best_variants = sorted_results.loc[sorted_results.groupby('cancer')['balanced_accuracy'].idxmax()]
        print(best_variants)

# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import friedmanchisquare, wilcoxon, spearmanr
from itertools import combinations
import warnings

# Suppress the UserWarning emitted by the Wilcoxon test on small samples with
# ties, when it switches from the exact p-value to the normal approximation.
warnings.filterwarnings('ignore', category=UserWarning, message='Exact p-value calculation does not work if there are ties. Switching to normal approximation.')

def _plot_variant_distribution(results_df: pd.DataFrame, metric: str, variant_order: list):
    """Show a box plot with an overlaid swarm plot of `metric` for each data variant."""
    plt.figure(figsize=(10, 7))
    sns.boxplot(data=results_df, x='data_variant', y=metric, order=variant_order)
    sns.swarmplot(data=results_df, x='data_variant', y=metric, order=variant_order, color='0.25')
    plt.title('Performance Distribution by Data Variant', fontsize=16)
    plt.ylabel('Balanced Accuracy Score', fontsize=12)
    plt.xlabel('Data Variant', fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()


def _compare_variants_statistically(results_df: pd.DataFrame, metric: str, variant_order: list):
    """Print a Friedman test and, if it is significant, Bonferroni-corrected pairwise Wilcoxon tests."""
    # One row per cancer, one column per data variant; only cancers with a
    # result for every variant can enter the paired tests.
    pivot_df = results_df.pivot_table(index='cancer', columns='data_variant', values=metric)
    pivot_df = pivot_df.dropna()

    print(f"\nPerforming statistical tests on {len(pivot_df)} cancer types with complete data.")

    # Friedman test: is there any difference among all the variants?
    stat, p_friedman = friedmanchisquare(*(pivot_df[variant] for variant in variant_order))
    print("\n--- Friedman Test ---")
    print(f"Statistic: {stat:.4f}, p-value: {p_friedman:.4f}")

    if p_friedman >= 0.05:
        print("The test is not significant (p >= 0.05). There is no strong evidence of a difference among the data variants.")
        return

    print("✅ The test is significant (p < 0.05). There are meaningful differences between the data variants.")

    print("\n--- Post-Hoc Pairwise Wilcoxon Tests (comparing each pair) ---")
    alpha = 0.05
    # Every unordered pair of variants; Bonferroni divides alpha by their count.
    pairs = list(combinations(pivot_df.columns, 2))
    corrected_alpha = alpha / len(pairs)
    print(f"Using Bonferroni corrected alpha = {corrected_alpha:.4f} for significance.")

    for method1, method2 in pairs:
        _, p_wilcox = wilcoxon(pivot_df[method1], pivot_df[method2])

        print(f"\nComparison: {method1.upper()} vs {method2.upper()}")
        print(f"  p-value: {p_wilcox:.4f}")
        if p_wilcox < corrected_alpha:
            # The "winner" is the variant with the higher mean score.
            winner = method1 if pivot_df[method1].mean() > pivot_df[method2].mean() else method2
            print(f"  -> Statistically SIGNIFICANT difference found. {winner.upper()} was better on average.")
        else:
            print("  -> No significant difference found.")


def _report_sample_size_effect(results_df: pd.DataFrame, metric: str):
    """Plot `metric` against training-set size and print the Spearman correlation per variant."""
    # Scatter plot with one regression line per data variant.
    g = sns.lmplot(
        data=results_df, x='train_size', y=metric, hue='data_variant',
        height=6, aspect=1.5, ci=None, legend_out=False
    )
    g.fig.suptitle('Role of Training Set Size on Model Performance', y=1.02, fontsize=16)
    g.set_axis_labels('Training Set Size', 'Balanced Accuracy Score', fontsize=12)
    plt.grid(linestyle='--', alpha=0.7)
    plt.show()

    print("\n--- Spearman Correlation between Train Size and Performance ---")
    print("(Spearman's Rho: +1 is perfect positive correlation, -1 is perfect negative, 0 is no correlation)\n")
    for variant in results_df['data_variant'].unique():
        subset = results_df[results_df['data_variant'] == variant]
        corr, p_corr = spearmanr(subset['train_size'], subset[metric])

        print(f"Variant: {variant.upper()}")
        print(f"  Spearman's Rho: {corr:.4f}")
        print(f"  p-value: {p_corr:.4f}")
        if p_corr < 0.05:
            print("  -> The correlation is statistically significant.")
        else:
            print("  -> The correlation is not statistically significant.")


def analyze_classification_results(results_df: pd.DataFrame):
    """
    Compare the data variants and examine the role of training-set size.

    Part 1 plots the distribution of balanced accuracy per data variant and
    prints a Friedman test followed, if significant, by pairwise Wilcoxon
    tests. Part 2 plots balanced accuracy against training-set size and
    prints a Spearman correlation for each variant.

    Args:
        results_df: Classification results with the columns 'cancer',
            'data_variant', 'train_size' and 'balanced_accuracy'. The variants
            'original', 'knn', 'multi' and 'coherent' are expected.

    Returns:
        None. Output is printed and shown as plots.
    """
    # Balanced accuracy is the primary performance metric.
    metric = 'balanced_accuracy'
    variant_order = ['original', 'knn', 'multi', 'coherent']

    print("==========================================================")
    print(" Part 1: Which Data Variant is Better? ")
    print("==========================================================")
    _plot_variant_distribution(results_df, metric, variant_order)
    _compare_variants_statistically(results_df, metric, variant_order)

    print("\n\n==========================================================")
    print(" Part 2: Does Sample Size Play a Role? ")
    print("==========================================================")
    _report_sample_size_effect(results_df, metric)



# Run the analysis. The pipeline returns None when it produced no results,
# and the analysis cannot run on None.
if classification_results is not None:
    analyze_classification_results(classification_results)
else:
    print("No classification results available; skipping analysis.")