# Statistical Comparison of Prediction Values (AOU vs. UKB)

## Functions and libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as patches
import matplotlib.colors as mcolors
import os
import time
from scipy.stats import sem, t
from scipy import stats
import matplotlib.font_manager as fm
from statsmodels.stats.multitest import multipletests

# Global matplotlib defaults for every figure created after this cell runs.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['DejaVu Sans'],  # switch to a known standard font
    "font.size": 16,
    "axes.labelsize": 16,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "legend.fontsize": 16,
    "figure.titlesize": 14,
})

In [None]:
def _predictions_for_status(df, status_value):
    """Return the non-missing ``y_pred`` values of *df* for one status group.

    ``status_value=None`` selects every row; otherwise only rows whose
    ``status`` equals *status_value* are kept.
    """
    if status_value is None:
        selected = df['y_pred']
    else:
        selected = df.loc[df['status'] == status_value, 'y_pred']
    # mean()/std() skip NaN by default while mannwhitneyu propagates it, so
    # drop missing values once to run the test on the values being summarised.
    return selected.dropna()


def compare_predictions(df_aou, df_ukb, sheet_name):
    """Compare the ``y_pred`` distributions of the AOU and UKB frames.

    For all rows and for each ``status`` subgroup, a two-sided Mann-Whitney U
    test is run between the two frames' ``y_pred`` values, and the mean and
    standard deviation of each side are collected.

    Args:
        df_aou: DataFrame holding the AOU predictions; needs the columns
            ``status`` and ``y_pred``.
        df_ukb: DataFrame holding the UKB predictions; needs the columns
            ``status`` and ``y_pred``.
        sheet_name: Sheet label, used only in the missing-column warning.

    Returns:
        ``None`` if either frame lacks a required column. Otherwise a dict
        keyed by ``'All'``, ``'No HCC'`` and ``'HCC'``; each value is a dict
        with the keys ``'AOU Mean'``, ``'AOU Std'``, ``'UKB Mean'``,
        ``'UKB Std'`` and ``'P-value'``. ``'P-value'`` is NaN when either
        group has no non-missing predictions.
    """
    # Check both frames up front so that a warning is printed for each one.
    aou_valid = check_required_columns(df_aou, sheet_name, "AOU")
    ukb_valid = check_required_columns(df_ukb, sheet_name, "UKB")

    if not (aou_valid and ukb_valid):
        return None  # Skip this comparison if required columns are missing

    # NOTE(review): 'No HCC' is mapped to status == 1 and 'HCC' to
    # status == 0 -- confirm this matches the status encoding of the inputs.
    conditions = [('All', None), ('No HCC', 1), ('HCC', 0)]

    results = {}
    for condition, status_value in conditions:
        aou_pred = _predictions_for_status(df_aou, status_value)
        ukb_pred = _predictions_for_status(df_ukb, status_value)

        if aou_pred.empty or ukb_pred.empty:
            # The test is undefined without observations on both sides; how
            # SciPy reacts (exception vs. NaN) depends on its version, so
            # report NaN explicitly.
            p_value = float('nan')
        else:
            # Name the alternative explicitly: the default has differed
            # between SciPy releases.
            _, p_value = stats.mannwhitneyu(aou_pred, ukb_pred,
                                            alternative='two-sided')

        results[condition] = {
            'AOU Mean': aou_pred.mean(),
            'AOU Std': aou_pred.std(),
            'UKB Mean': ukb_pred.mean(),
            'UKB Std': ukb_pred.std(),
            'P-value': p_value
        }

    return results


def check_required_columns(df, sheet_name, dataset_name):
    """Report whether *df* contains the columns ``status`` and ``y_pred``.

    Prints a warning naming the dataset, the sheet and the absent columns
    when at least one is missing.

    Returns:
        ``True`` if both columns are present, ``False`` otherwise.
    """
    absent = [name for name in ('status', 'y_pred') if name not in df.columns]
    if not absent:
        return True

    listing = ', '.join(absent)
    print(f"Warning: {dataset_name} dataset, sheet '{sheet_name}' is missing columns: {listing}")
    return False

## Data import

In [None]:
# Workspace root; every input and output location below is derived from it.
path = "/home/jupyter/workspaces/machinelearningforlivercancerriskprediction"
fig_path = f"{path}/HCC/visuals"


# Read every sheet of the combined prediction workbook into a dict keyed by
# sheet name. The context manager closes the workbook once all sheets are read.
with pd.ExcelFile(path+'/combined_output/val/Prediction_values_combined.xlsx') as excel_file:
    sheet_names = excel_file.sheet_names

    # Create DataFrames
    dataframes = {}
    for sheet_name in sheet_names:
        dataframes[sheet_name] = pd.read_excel(excel_file, sheet_name=sheet_name)
        print(f"Sheet {sheet_name} read and saved to dataframes dictionary")

# Discard the sheet named 'Sheet' (presumably an empty default sheet -- TODO
# confirm). The default keeps this from raising KeyError when it is absent.
dataframes.pop('Sheet', None)


# Load UKB prediction data the same way.
with pd.ExcelFile(path+'/HCC/prediction_values_ukb.xlsx') as excel_file_ukb:
    sheet_names_ukb = excel_file_ukb.sheet_names
    dataframes_ukb = {}
    for sheet_name in sheet_names_ukb:
        dataframes_ukb[sheet_name] = pd.read_excel(excel_file_ukb, sheet_name=sheet_name)
        print(f"Sheet {sheet_name} read and saved to dataframes dictionary")

# Last expression of the cell: the notebook renders the UKB sheet names.
dataframes_ukb.keys()



In [None]:
# Show the DataFrame stored under the 'all_Model_TOP30' key of `dataframes`;
# as the last expression of a notebook cell it is rendered as the cell output.
dataframes["all_Model_TOP30"]

In [None]:


# Perform comparisons for all sheets present in both workbooks. Sorting the
# shared names makes the processing order reproducible between runs.
all_results = {}
for sheet_name in sorted(set(dataframes) & set(dataframes_ukb)):
    result = compare_predictions(dataframes[sheet_name], dataframes_ukb[sheet_name], sheet_name)
    if result is not None:
        all_results[sheet_name] = result

if not all_results:
    print("No valid comparisons could be made due to missing columns.")
else:
    # One flat list of (sheet, condition, summary) entries so that each raw
    # p-value and its corrected counterpart stay paired by position.
    comparisons = [
        (sheet_name, condition, summary)
        for sheet_name, result in all_results.items()
        for condition, summary in result.items()
    ]
    all_p_values = np.array([summary['P-value'] for _, _, summary in comparisons],
                            dtype=float)

    # Apply Bonferroni correction over the tests that actually produced a
    # p-value. NaN p-values stay NaN and are not counted as tests, so they do
    # not inflate the correction factor.
    testable = ~np.isnan(all_p_values)
    rejected = np.zeros(all_p_values.shape, dtype=bool)
    corrected_p_values = np.full(all_p_values.shape, np.nan)
    if testable.any():
        rejected[testable], corrected_p_values[testable], _, _ = multipletests(
            all_p_values[testable], method='bonferroni')

    # Update results with corrected p-values (summary is the same dict object
    # that is stored in all_results, so the update is visible there too).
    for (_, _, summary), corrected in zip(comparisons, corrected_p_values):
        summary['Corrected P-value'] = corrected

    # Create a DataFrame to display the results
    results_df = pd.DataFrame([
        {
            'Sheet': sheet_name,
            'Condition': condition,
            'AOU Mean (SD)': f"{summary['AOU Mean']:.4f} ({summary['AOU Std']:.4f})",
            'UKB Mean (SD)': f"{summary['UKB Mean']:.4f} ({summary['UKB Std']:.4f})",
            'P-value': summary['P-value'],
            'Corrected P-value': summary['Corrected P-value']
        }
        for sheet_name, condition, summary in comparisons
    ])

    # Sort the DataFrame
    results_df = results_df.sort_values(['Sheet', 'Condition'])

    # Display the results
    print(results_df.to_string(index=False))

    # Save to Excel; create the output directory first so that a missing
    # folder does not abort the cell after all comparisons have run.
    os.makedirs(fig_path, exist_ok=True)
    output_file = f"{fig_path}/prediction_comparison_results.xlsx"
    results_df.to_excel(output_file, index=False)
    print(f"\nResults saved to {output_file}")

# Print column names for each dataframe
print("\nColumn names in each dataframe:")
for dataset_name, dataset in [("AOU", dataframes), ("UKB", dataframes_ukb)]:
    print(f"\n{dataset_name} dataset:")
    for sheet_name, df in dataset.items():
        # Column labels read from Excel need not be strings (e.g. numeric
        # headers) and str.join rejects non-string items, so convert first.
        print(f"  Sheet '{sheet_name}': {', '.join(map(str, df.columns))}")