In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from morphomics.io.io import load_obj, save_obj
from kxa_analysis import dimreduction_runner, bootstrap_runner
import numpy as np
from kxa_analysis import plot_2d, plot_pi, plot_dist_matrix, mask_pi, get_base, inverse_function, get_2d, plot_vae_dist
import plotly.express as px
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import torch as th
from morphomics.nn_models import train_test

import pandas as pd
# Directory prefix for the VAE-analysis outputs. The figure paths used later in
# this notebook start with the same prefix but are spelled out literally rather
# than being built from this variable.
base_path = "results/vae_analysis/"


In [None]:
import numpy as np
import pandas as pd
from morphomics import utils
from morphomics.pipeline import Pipeline
import torch as th

import os

from morphomics.nn_models import train_test

In [None]:
# Locations of the two serialized objects consumed by this notebook.
dimreducer_path = "results/dim_reduction/Morphomics.PID_v1_l.pi_pca_vae_1_fitted_dimreducer"
reduced_path = "results/dim_reduction/Morphomics.PID_v1_l.pi_pca_vae_1_reduced_data"

# vae_pip is indexed by key further down ('pixes_tokeep', 'standardizer',
# 'fitted_pca_vae').
vae_pip = load_obj(dimreducer_path)

# Turn the previous index into an ordinary column called 'old_idcs' so the
# frame carries a fresh positional index.
mf = (
    load_obj(reduced_path)
    .reset_index()
    .rename(columns={'index': 'old_idcs'})
)

# Column 'pi' holds one image per row; the first one is kept as a template.
pis = mf['pi']
pi_example = pis.iloc[0]

In [None]:
def get_base(pi, pixes_tokeep, template=None):
    """Scatter a filtered image back into a zero-filled, full-size array.

    Note: this definition shadows the ``get_base`` imported from
    ``kxa_analysis`` at the top of the notebook.

    Parameters
    ----------
    pi : array-like
        Values to write at the positions selected by ``pixes_tokeep``.
    pixes_tokeep : index array or mask
        Positions of the full-size array that receive ``pi``.
    template : array-like, optional
        Array whose shape and dtype define the full-size output. When omitted,
        the notebook-level ``pi_example`` is used, which is the original
        behaviour.

    Returns
    -------
    numpy.ndarray
        New array shaped like ``template``, zero everywhere except at
        ``pixes_tokeep``. The template itself is never modified.
    """
    if template is None:
        # Backward-compatible default: rely on the notebook global.
        template = pi_example
    pi_full = np.zeros_like(template)
    pi_full[pixes_tokeep] = pi
    return pi_full

In [None]:
# Label every row with its experimental condition (Model + Sex).
mf['Condition'] = mf['Model'] + "-" + mf['Sex']
# Copy of the frame ordered by condition, with a fresh index.
mf_sorted = mf.sort_values(by='Condition').reset_index(drop=True)

# Re-apply the fitted preprocessing to every image.
pixes_tokeep = vae_pip['pixes_tokeep']
# mask_pi is evaluated once per image and its two outputs are split afterwards
# (previously it was run twice over the whole column).
masked = pis.apply(lambda pi: mask_pi(pi, pixes_tokeep))
pis_threshold = masked.apply(lambda out: out[0])
pis_filtered = masked.apply(lambda out: out[1])

# Stack the filtered images into one 2-D array, then standardize and project
# with the already-fitted transformers stored in the pipeline object.
standardizer = vae_pip['standardizer']
pis_filtered_arr = np.vstack(pis_filtered)
pis_scaled = standardizer.transform(pis_filtered_arr)
pca = vae_pip['fitted_pca_vae'][0]
pis_pca = pca.transform(pis_scaled)

# Run the fitted VAE on the PCA-reduced images.
vae = vae_pip['fitted_pca_vae'][1]
pis_pca_torch = th.tensor(pis_pca, dtype=th.float32)
pred, z_mean, z_log_var, mse = train_test.vae_test(
    data=pis_pca_torch,
    model=vae,
    sample_size=3,
)
print('mse:', mse)
# The label used to read 'sample size:' although the value printed is the
# shape of z_mean.
print('z_mean shape:', z_mean.shape)
print('pred shape:', pred.shape)

# Average over the first dimension of pred (presumably the sample_size draws --
# TODO confirm against train_test.vae_test) and keep one prediction per row.
# The statement was duplicated in the original cell; a single call suffices.
pred_processed_pi_mean = pred.mean(dim=0)
mf['pred'] = list(pred_processed_pi_mean)


In [None]:
# Build the "<Model>-<Sex>" label for every row of mf, store it as a column,
# and keep the distinct labels in sorted order.
model_sex = mf[['Model', 'Sex']].apply(
    lambda row: '-'.join((row['Model'], row['Sex'])),
    axis=1,
)
mf['Model_Sex'] = model_sex
model_sex_sorted = sorted(set(model_sex))

In [None]:
# Latent coordinates: the first two columns of z_mean.
# NOTE(review): rows are assumed to be in the same order as mf; both frames use
# a default 0..n-1 index, which is what the column assignments below align on.
df = pd.DataFrame(z_mean[:, [0, 1]], columns=['zx', 'zy'])

# Direction in the (zx, zy) plane onto which every point is projected,
# rescaled to unit length so the projection is a signed distance.
axis_direction = np.array([-1/3, -1/6])
axis_direction_normalized = axis_direction / np.linalg.norm(axis_direction)

# Scalar projection of each latent point onto the unit axis.
projections = df[['zx', 'zy']].dot(axis_direction_normalized)

# Attach the projection and the metadata needed for grouping.
df['projection'] = projections
df['Model_Sex'] = model_sex
df['Layer'] = mf['Layer']
df['Model'] = mf['Model']
df['Sex'] = mf['Sex']

# Restrict the analysis to the two 4h models. The stray `df.keys()` that sat
# here produced no output mid-cell and has been removed.
df = df[df['Model'].isin(['1xSaline_4h', '1xKXA_4h'])].copy()


In [None]:
# One group of rows per Model-Sex condition.
groups = df.groupby(by='Model_Sex')


In [None]:
import numpy as np
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

fig = go.Figure()

# Fixed line colour for each Model-Sex condition.
base_colors = {
    '1xKXA_4h-F': 'rgb(255, 50, 255)',
    '1xKXA_4h-M': 'rgb(50, 255, 255)',
    '1xSaline_4h-F': 'rgb(130, 130, 130)',
    '1xSaline_4h-M': 'rgb(20, 20, 20)',
}

# Draw one kernel-density curve per condition.
for cat in df['Model_Sex'].unique():
    # The projection is sign-flipped for plotting only; df is not modified.
    data = -df[df['Model_Sex'] == cat]['projection']
    kde = gaussian_kde(data)
    # Evaluate the density on 200 points spanning the observed range.
    x_range = np.linspace(data.min(), data.max(), 200)
    y_vals = kde(x_range)

    fig.add_trace(go.Scatter(
        x=x_range,
        y=y_vals,
        mode='lines',
        name=cat,
        line=dict(
            width=2,
            color=base_colors.get(cat, 'black')  # Default to black if key not found
        )
    ))

fig.update_layout(
    title='Density Distribution of Persistence Images in 1D VAE Latent Space',
    xaxis_title='Interpolation Axis',
    yaxis_title='Density',
    width=800,
    height=600
)

# Save as PDF using Kaleido (make sure it's installed: pip install -U kaleido)
save_filepath = "results/vae_analysis/pi_vae_1d.pdf"
# Create the output folder first, as the Dunn heatmap cell does before saving,
# so a fresh checkout does not fail on a missing directory.
os.makedirs(os.path.dirname(save_filepath), exist_ok=True)
fig.write_image(save_filepath)

fig.show()


In [None]:
import pandas as pd
from scipy.stats import shapiro, levene, bartlett
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

# Group the projections by Model-Sex condition.
groups = df.groupby('Model_Sex')

# 1. Normality of each group (Shapiro-Wilk).
for name, group in groups:
    stat, p_value = shapiro(group['projection'])
    print(f"Shapiro-Wilk Test for {name}: p-value = {p_value}")
    if p_value > 0.05:
        print(f"{name} follows a normal distribution")
    else:
        print(f"{name} does not follow a normal distribution")

# 2. Q-Q plot of each group against the normal distribution.
for name, group in groups:
    # fit=True standardizes the sample with the fitted location and scale
    # before computing quantiles. Without it, line='45' compares raw values
    # against N(0, 1) quantiles and the reference line is not meaningful.
    sm.qqplot(group['projection'], fit=True, line='45')
    plt.title(f"Q-Q Plot for {name}")
    plt.show()

# The per-group samples are shared by both variance tests.
samples = [group['projection'].values for name, group in groups]

# 3. Equality of variances across all groups (Levene).
stat, p_value = levene(*samples)
print(f"Levene's Test for equal variance: p-value = {p_value}")
if p_value > 0.05:
    print("Variances are equal across groups")
else:
    print("Variances are not equal across groups")

# 4. Bartlett's test, an alternative that assumes normality.
stat, p_value = bartlett(*samples)
print(f"Bartlett's Test for equal variance: p-value = {p_value}")
if p_value > 0.05:
    print("Variances are equal across groups (Bartlett's Test)")
else:
    print("Variances are not equal across groups (Bartlett's Test)")


In [None]:
import scikit_posthocs as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors
import os  # To handle file saving

from scipy.stats import kruskal

# Group the data by 'Model_Sex'
groups = df.groupby('Model_Sex')

# Omnibus Kruskal-Wallis test on 'projection' across all groups.
stat, p_value = kruskal(*[group['projection'].values for _, group in groups])

print(f"Kruskal-Wallis Test: p-value = {p_value}")
if p_value > 0.05:
    print("No statistically significant difference between the groups")
else:
    print("Statistically significant difference between the groups")

# Pairwise Dunn post-hoc test with Bonferroni-adjusted p-values.
dunn_results = sp.posthoc_dunn(df, val_col='projection', group_col='Model_Sex', p_adjust='bonferroni')

# Ensure dunn_results is a DataFrame
if not isinstance(dunn_results, pd.DataFrame):
    dunn_results = pd.DataFrame(dunn_results)

# Significance level
alpha = 0.05

# True on and below the diagonal; these are the cells that get drawn.
lower_triangle_mask = np.tril(np.ones(dunn_results.shape, dtype=bool), k=0)

# Two-colour map: red for significant (p < alpha), blue otherwise.
# BoundaryNorm sends the first interval [0, alpha) to the FIRST listed colour,
# so red must come first. The previous ['blue', 'red'] order painted the
# significant cells blue, the opposite of what was intended.
cmap = mcolors.ListedColormap(['red', 'blue'])
bounds = [0, alpha, 1]
norm = mcolors.BoundaryNorm(bounds, cmap.N)

# Cell labels: p-values in scientific notation with one decimal.
# Column-wise Series.map is used instead of DataFrame.applymap, which recent
# pandas releases deprecate.
annot_labels = dunn_results.apply(lambda col: col.map(lambda p: f"{p:.1e}"))

# Create the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(dunn_results,
            annot=annot_labels,
            fmt='',
            cmap=cmap,
            cbar=False,
            mask=~lower_triangle_mask,  # Hide the upper triangle
            linewidths=0.5,
            norm=norm)

# Invert y-axis
plt.gca().invert_yaxis()

# Set labels and title
plt.xlabel('Group')
plt.ylabel('Group')
plt.title('Dunn Posthoc Test P-Values')

# Set y-tick labels to be horizontal
plt.yticks(rotation=0)

# Save the plot as a PDF, creating the output folder if needed.
save_filepath = "results/vae_analysis/pi_vae_kruskal_posthoc_pval.pdf"
os.makedirs(os.path.dirname(save_filepath), exist_ok=True)
plt.savefig(save_filepath, format='pdf')

plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import rankdata, chi2

# Define the Scheirer-Ray-Hare function
def scheirer_ray_hare(df, dv, factor1, factor2):
    """
    Perform the Scheirer-Ray-Hare test (rank-based two-way ANOVA analogue).

    The dependent variable is ranked over all observations. For each effect the
    sum of squares of the ranks (SS) is divided by the total mean square
    ``MS_total = SS_total / (N - 1)`` to give ``H = SS / MS_total``, which is
    compared against a chi-square distribution with the effect's degrees of
    freedom. (An earlier version divided by ``SS_total - SS`` instead, which
    yields much smaller statistics and inflated p-values.)

    The input frame is not modified: ranks are kept in a local Series, so an
    existing ``'rank'`` column in ``df`` is left untouched.

    Parameters:
        df : pandas.DataFrame
            DataFrame containing the data.
        dv : str
            Column name for the dependent variable.
        factor1 : str
            Column name for the first factor.
        factor2 : str
            Column name for the second factor.

    Returns:
        results : dict
            Keys are ``factor1``, ``factor2`` and ``'Interaction'``; each value
            is a dict with the statistic ``'H'``, the degrees of freedom
            ``'df'`` and the p-value ``'p'``.

    Raises:
        ValueError
            If ``df`` has fewer than two rows.

    Notes:
        The interaction SS is the between-cell SS minus both main-effect SS.
        NOTE(review): that decomposition is the textbook one for balanced
        designs; for strongly unbalanced data interpret the interaction term
        with care.
    """
    n_obs = df.shape[0]
    if n_obs < 2:
        raise ValueError("Scheirer-Ray-Hare test needs at least two observations")

    # Work positionally so duplicated or unordered index labels cannot
    # misalign ranks and factor levels.
    ranks = pd.Series(rankdata(df[dv].to_numpy()))
    levels_f1 = df[factor1].to_numpy()
    levels_f2 = df[factor2].to_numpy()

    # Correction term shared by every between-group sum of squares.
    correction = ranks.sum() ** 2 / n_obs

    # Total SS and mean square of the ranks. Computing it from the actual
    # ranks also accounts for ties.
    ss_total = ((ranks - ranks.mean()) ** 2).sum()
    ms_total = ss_total / (n_obs - 1)

    def _ss_between(keys):
        # Sum over groups of (rank sum)^2 / group size, minus the correction.
        grouped = ranks.groupby(keys)
        return (grouped.sum() ** 2 / grouped.count()).sum() - correction

    ss_f1 = _ss_between(levels_f1)
    ss_f2 = _ss_between(levels_f2)
    # Only cells that actually occur in the data contribute.
    ss_cells = _ss_between([levels_f1, levels_f2])
    ss_interaction = ss_cells - ss_f1 - ss_f2

    # Degrees of freedom
    df_f1 = df[factor1].nunique() - 1
    df_f2 = df[factor2].nunique() - 1
    df_interaction = df_f1 * df_f2

    # H statistics: effect SS scaled by the total mean square of the ranks.
    H_f1 = ss_f1 / ms_total
    H_f2 = ss_f2 / ms_total
    H_interaction = ss_interaction / ms_total

    # Upper-tail chi-square probabilities. sf() is more accurate than 1 - cdf()
    # for very small p-values.
    p_f1 = chi2.sf(H_f1, df_f1)
    p_f2 = chi2.sf(H_f2, df_f2)
    p_interaction = chi2.sf(H_interaction, df_interaction)

    results = {
        factor1: {'H': H_f1, 'df': df_f1, 'p': p_f1},
        factor2: {'H': H_f2, 'df': df_f2, 'p': p_f2},
        'Interaction': {'H': H_interaction, 'df': df_interaction, 'p': p_interaction}
    }

    return results

# Run the Scheirer-Ray-Hare test on the real projections.
# `df` is the frame built earlier in the notebook and already carries the
# 'projection', 'Sex' and 'Model' columns. It must NOT be replaced by synthetic
# data here: the ANOVA, Tukey and count cells below all read `df`, and the
# random demo frame that used to be created at this point silently made every
# downstream statistic describe noise instead of the experiment.
results = scheirer_ray_hare(df, dv='projection', factor1='Sex', factor2='Model')

# One row per effect, with columns in a fixed order.
results_df = pd.DataFrame(results).T
results_df = results_df[['H', 'df', 'p']]

# Add an interpretation column based on the p-value
results_df['Significance'] = np.where(results_df['p'] < 0.05, 'Significant', 'Not Significant')

# Format the p-values for display (scientific notation)
results_df['p'] = results_df['p'].apply(lambda x: f"{x:.1e}")

# Render the table as a matplotlib figure.
fig, ax = plt.subplots(figsize=(8, 2))
ax.axis('tight')
ax.axis('off')
table = ax.table(cellText=results_df.values,
                 colLabels=results_df.columns,
                 rowLabels=results_df.index,
                 loc='center')
ax.set_title('Scheirer-Ray-Hare Test Results', fontweight='bold')
plt.show()

# Plain-text version of the same table.
print(results_df)


In [None]:
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Two-way ANOVA on 'projection' with 'Sex', 'Model' and their interaction as
# categorical predictors.
ols_spec = smf.ols(formula='projection ~ Sex * Model', data=df)
model = ols_spec.fit()

# Type-II sums of squares for the fitted model.
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Box plot of the projection per model, split by sex, to eyeball the interaction.
ax = sns.boxplot(data=df, x='Model', y='projection', hue='Sex')
ax.set_title("Interaction between Model and Sex")
plt.show()


In [None]:
# Show the column labels currently present in df.
df.columns

In [None]:
# (Re)build the "<Model>-<Sex>" label for the rows of the current df.
model_sex = df[['Model', 'Sex']].apply(
    lambda row: '-'.join((row['Model'], row['Sex'])),
    axis=1,
)
df['Model_Sex'] = model_sex

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Tukey HSD pairwise comparison of 'projection' between Model-Sex groups.
tukey = pairwise_tukeyhsd(
    endog=df['projection'],
    groups=df['Model_Sex'],
    alpha=0.05,
)
print(tukey.summary())


In [None]:
# Check the count for each unique combination of 'Model_Sex'
counts = df['Model_Sex'].value_counts()

# Print the result
print(counts)
