# Basic Statistics

## Set Up

### Imports

In [1]:
from warnings import simplefilter

import pandas as pd

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action="ignore", category=DeprecationWarning)
simplefilter(action="ignore", category=FutureWarning)

import re
from pathlib import Path
from pyprocessmacro import Process

import numpy as np
import pyperclip
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy import stats
from IPython.display import Markdown, HTML, display
from io import StringIO
import matplotlib.pyplot as plt
from collections import defaultdict

from reload_recursive import reload_recursive
import sys
import os
from my_python_utils import smart_title

sys.path.insert(0, "/home/srs-9/Projects/ms_mri/analysis/thalamus/helpers")

import helpers
import utils
import regression_utils as regutils
import my_namespace


#### Reload modules

In [2]:
reload_recursive(regutils)
reload_recursive(helpers)
reload_recursive(utils)
reload_recursive(my_namespace)

from utils import zscore
from my_namespace import presentation_cols, colors

### Load Data

#### Clinical and Volumes

In [3]:
fig_path = Path(
    "/home/srs-9/Projects/ms_mri/analysis/thalamus/results/figures_tables/0-Basic_statistics"
)
if not fig_path.exists():
    os.makedirs(fig_path)

data = utils.load_data("/home/srs-9/Projects/ms_mri/analysis/thalamus/results/data.csv")


MS_patients = data["dz_type2"] == "MS"
NONMS_patients = data["dz_type2"] == "!MS"
NIND_patients = data["dz_type5"] == "NIND"
OIND_patients = data["dz_type5"] == "OIND"
RMS_patients = data["dz_type5"] == "RMS"
PMS_patients = data["dz_type5"] == "PMS"

p = "<p style='font-family: Mononoki Nerd Font Mono;line-height: 1.3;font-size: 14px;'>"

## Analysis

### Demographic and Clinical Data

### MRI Measures

In [4]:
model_data = data

patient_group_names1 = {
    "MS": MS_patients,
    "NIND": NIND_patients,
    "OIND": RMS_patients,
}

patient_group_names2 = {
    "RMS": RMS_patients,
    "PMS": PMS_patients,
    "NIND": NIND_patients,
    "OIND": RMS_patients,
}

units = "cm"

patient_group_names = patient_group_names1

vars = ["WBV", "WMV", "GMV", "WTV"]

display_frame = pd.DataFrame({}, columns=patient_group_names.keys(), index=vars)
for var in vars:
    for group_name, group in patient_group_names.items():
        if units == "cm":
            volume_factor = 1000
        else:
            volume_factor = 1
        display_frame.loc[var, group_name] = f"{model_data.loc[group, var].mean()/volume_factor:0.2f} " \
            f"± {model_data.loc[group, var].std()/volume_factor:0.2f}"

# max_width = max(len(outcome) for outcome in vars)
# display_text = ""
# display_text += f"{p}<u>MRI Volumes:</u></p>"
# display_volumes = [
#     f"{var:<{max_width + 1}}: {model_data[var].mean():0.2f} ± {model_data[var].std():0.2f}"
#     for var in vars
# ]
# display_text += f"{p}{'<br>'.join(display_volumes)}</p>"
# display(HTML(display_text))

display(Markdown(display_frame.to_markdown()))

|     | MS               | NIND             | OIND             |
|:----|:-----------------|:-----------------|:-----------------|
| WBV | 1097.16 ± 119.01 | 1144.80 ± 128.29 | 1107.00 ± 117.65 |
| WMV | 612.50 ± 64.42   | 631.97 ± 69.36   | 619.80 ± 63.64   |
| GMV | 484.66 ± 61.96   | 512.83 ± 68.13   | 487.20 ± 60.75   |
| WTV | 8.96 ± 1.64      | 10.03 ± 1.22     | 9.23 ± 1.50      |

In [46]:
mri_volumes = {
    "WBV": "WBV (mm3)",
    "WMV": "WMV (mm3)",
    "GMV": "GMV (mm3)",
    "WTV": "WTV (mm3)",
    "cortical_thickness": "cortical_thickness (mm)",
}
inflammation_measures = {
    "CP": "CP (mm3)",
    "T2LV": "T2LV (mm3)",
    "PRL": "PRL count",
}
csf_compartments = {
    "LV": "LV (mm<sup>3</sup>)",
    "interCSF": "interthalamic CSF (mm<sup>3</sup>)",
    "thirdV": "third ventricle (mm<sup>3</sup>)",
    "fourthV": "fourth ventricle (mm<sup>3</sup>)",
    "periCSF": "subarachnoid CSF (mm<sup>3</sup>)",
    "CCR2": "CCR",
}

conversion_factor = defaultdict(lambda: 1)
set_conversion_factor = {
    "WBV": 1000,
    "WMV": 1000,
    "GMV": 1000,
    "WTV": 1000
}
for k,v in set_conversion_factor.items():
    conversion_factor[k] = v
    
new_units = defaultdict(lambda: False)
set_units = {
    "WBV": "cm<sup>3</sup>",
    "WMV": "cm<sup>3</sup>",
    "GMV": "cm<sup>3</sup>",
    "WTV": "cm<sup>3</sup>",
    "CP": "mm<sup>3</sup>",
    "T2LV": "mm<sup>3</sup>",
}
for k,v in set_units.items():
    new_units[k] = v

In [48]:
patient_group_names = patient_group_names1


# * MRI Volumes
display(Markdown("### MRI Volumes:"))
display_frame = pd.DataFrame({}, index=patient_group_names.keys())
for var, var_disp in mri_volumes.items():
    if new_units[var]:
        var_disp = re.sub("mm3", new_units[var], var_disp)
    display_frame.loc[:, var_disp] = [f"{model_data.loc[group, var].mean()/conversion_factor[var]:0.2f} ± {(model_data.loc[group, var]/conversion_factor[var]).std():0.2f}"
                                 for group_name, group in patient_group_names.items()]
display_frame.index = patient_group_names.keys()
display(Markdown(display_frame.T.to_markdown()))


# * Measures of Inflammation
display(Markdown("### Measures of Inflammation:"))
display_frame = pd.DataFrame({}, index=patient_group_names.keys())
for var, var_disp in inflammation_measures.items():
    if new_units[var]:
        var_disp = re.sub("mm3", new_units[var], var_disp)
    display_frame.loc[:, var_disp] = [f"{model_data.loc[group, var].mean()/conversion_factor[var]:0.2f} ± {(model_data.loc[group, var]/conversion_factor[var]).std():0.2f}"
                                 for group_name, group in patient_group_names.items()]
display_frame.index = patient_group_names.keys()
display(Markdown(display_frame.T.to_markdown()))

# * CSF Compartments
display(Markdown("### CSF Compartments:"))
display_frame = pd.DataFrame({}, index=patient_group_names.keys())
for var, var_disp in csf_compartments.items():
    if new_units[var]:
        var_disp = re.sub("mm3", new_units[var], var_disp)
    display_frame.loc[:, var_disp] = [f"{model_data.loc[group, var].mean()/conversion_factor[var]:0.2f} ± {(model_data.loc[group, var]/conversion_factor[var]).std():0.2f}"
                                 for group_name, group in patient_group_names.items()]
display_frame.index = patient_group_names.keys()
display(Markdown(display_frame.T.to_markdown()))

### MRI Volumes:

|                         | MS               | NIND             | OIND             |
|:------------------------|:-----------------|:-----------------|:-----------------|
| WBV (cm<sup>3</sup>)    | 1097.16 ± 119.01 | 1144.80 ± 128.29 | 1107.00 ± 117.65 |
| WMV (cm<sup>3</sup>)    | 612.50 ± 64.42   | 631.97 ± 69.36   | 619.80 ± 63.64   |
| GMV (cm<sup>3</sup>)    | 484.66 ± 61.96   | 512.83 ± 68.13   | 487.20 ± 60.75   |
| WTV (cm<sup>3</sup>)    | 8.96 ± 1.64      | 10.03 ± 1.22     | 9.23 ± 1.50      |
| cortical_thickness (mm) | 2.28 ± 0.13      | 2.32 ± 0.12      | 2.30 ± 0.11      |

### Measures of Inflammation:

|                       | MS                 | NIND             | OIND              |
|:----------------------|:-------------------|:-----------------|:------------------|
| CP (mm<sup>3</sup>)   | 1718.20 ± 511.13   | 1643.54 ± 502.81 | 1647.04 ± 505.99  |
| T2LV (mm<sup>3</sup>) | 7748.56 ± 10695.12 | nan ± nan        | 5944.14 ± 7571.57 |
| PRL count             | 0.89 ± 1.79        | 0.02 ± 0.14      | 0.96 ± 1.91       |

### CSF Compartments:

|                                    | MS                   | NIND                 | OIND                 |
|:-----------------------------------|:---------------------|:---------------------|:---------------------|
| LV (mm<sup>3</sup>)                | 19492.70 ± 14859.00  | 17149.93 ± 13565.71  | 16653.86 ± 11439.53  |
| interthalamic CSF (mm<sup>3</sup>) | 964.45 ± 348.45      | 816.29 ± 263.58      | 918.65 ± 328.18      |
| third ventricle (mm<sup>3</sup>)   | 955.76 ± 483.74      | 854.88 ± 416.28      | 865.97 ± 421.00      |
| fourth ventricle (mm<sup>3</sup>)  | 1749.45 ± 512.52     | 1771.89 ± 641.84     | 1695.34 ± 469.62     |
| subarachnoid CSF (mm<sup>3</sup>)  | 345018.14 ± 34930.85 | 348921.28 ± 33356.71 | 342615.82 ± 33917.63 |
| CCR                                | 0.06 ± 0.04          | 0.05 ± 0.04          | 0.05 ± 0.03          |

In [6]:
lesion_volumes = {
    "T2LV": "total T2LV",
    "juxcort_T2LV": "juxtacortical T2LV",
    "periV_T2LV": "periventricular T2LV",
    "subcort_T2LV": "subcortical T2LV",
    "infraT_T2LV": "infratentorial T2LV"
}

max_width = max(
    len(var_disp)
    for var_disp in (lesion_volumes).values()
)

display_volumes = [
    f"{var_disp:<{max_width + 1}}: {model_data[var].mean():0.2f} ± {model_data[var].std():0.2f}"
    for var, var_disp in lesion_volumes.items()
]
print("Lesion Volumes (mm3):\n-----------------------------------------------")
print("\n".join(display_volumes))


Lesion Volumes (mm3):
-----------------------------------------------
total T2LV           : 7748.56 ± 10695.12
juxtacortical T2LV   : 1540.48 ± 2437.15
periventricular T2LV : 5618.05 ± 9514.87
subcortical T2LV     : 482.48 ± 1234.86
infratentorial T2LV  : 114.20 ± 265.07


## Assumption Checks

### Functions

In [4]:
raw_ms_data = data[MS_patients]

def zscore2(df, skip_vars=None):
    """Z-score numeric columns, preserving correlation structure.
    
    WARNING: This drops rows with ANY missing values in numeric columns
    to maintain proper covariance structure.
    """
    if skip_vars is None:
        skip_vars = []
    
    df_z = df.copy()
    numeric_cols = df_z.select_dtypes(include="number").columns
    numeric_cols = numeric_cols[~numeric_cols.isin(skip_vars)]
    
    # Drop rows with any NaN in numeric columns FIRST
    df_z = df_z.dropna(subset=numeric_cols)
    
    # Now z-score (no NaNs remain, so no need for nan_policy)
    df_z[numeric_cols] = (df_z[numeric_cols] - df_z[numeric_cols].mean()) / df_z[numeric_cols].std()
    
    return df_z


def check_vif(raw_data, vars):
    raw_data = raw_data[vars]
    data = zscore2(raw_data)
    X = data[vars]
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data.set_index("Variable")


def quick_print_vif(raw_data, variables, print_vars=None):
    if print_vars is None:
        print_vars = variables
    vif = check_vif(raw_data, variables)
    print_vif = vif.loc[print_vars, "VIF"]
    print(", ".join(variables))
    print("-"*40)
    display(Markdown(print_vif.to_markdown()))
    print("\n")

### VIF

In [5]:
variables = ["CP", "T2LV_log1p", "tiv", "WBV", "age", "Female"]
quick_print_vif(raw_ms_data, variables)

variables = ["CP", "T2LV_log1p", "tiv", "WTV", "CT", "age", "Female"]
quick_print_vif(raw_ms_data, variables)

CP, T2LV_log1p, tiv, WBV, age, Female
----------------------------------------


| Variable   |     VIF |
|:-----------|--------:|
| CP         | 1.80293 |
| T2LV_log1p | 1.44527 |
| tiv        | 8.83604 |
| WBV        | 9.56641 |
| age        | 1.48304 |
| Female     | 1.0763  |



CP, T2LV_log1p, tiv, WTV, CT, age, Female
----------------------------------------


| Variable   |     VIF |
|:-----------|--------:|
| CP         | 1.56571 |
| T2LV_log1p | 1.93046 |
| tiv        | 1.67956 |
| WTV        | 3.30999 |
| CT         | 2.2393  |
| age        | 1.36064 |
| Female     | 1.08016 |





#### VIF for models with WTV and additional thalamic nuclei

In [6]:
for nucleus in ["medial", "posterior", "ventral", "anterior"]:
    variables = ["CP", "WTV", nucleus, "age", "Female", "tiv",]
    quick_print_vif(raw_ms_data, variables, print_vars=["WTV", nucleus, "CP"])

CP, WTV, medial, age, Female, tiv
----------------------------------------


| Variable   |     VIF |
|:-----------|--------:|
| WTV        | 9.64538 |
| medial     | 9.22195 |
| CP         | 1.60764 |



CP, WTV, posterior, age, Female, tiv
----------------------------------------


| Variable   |      VIF |
|:-----------|---------:|
| WTV        | 11.396   |
| posterior  | 10.2198  |
| CP         |  1.56631 |



CP, WTV, ventral, age, Female, tiv
----------------------------------------


| Variable   |     VIF |
|:-----------|--------:|
| WTV        | 8.16626 |
| ventral    | 7.89023 |
| CP         | 1.67143 |



CP, WTV, anterior, age, Female, tiv
----------------------------------------


| Variable   |     VIF |
|:-----------|--------:|
| WTV        | 2.9413  |
| anterior   | 2.30892 |
| CP         | 1.55536 |





In [9]:
for nucleus in ["medial", "posterior", "ventral", "anterior"]:
    variables = ["CP", "WBV", nucleus, "age", "Female", "tiv",]
    quick_print_vif(raw_ms_data, variables, print_vars=["WBV", nucleus, "CP"])

CP, WBV, medial, age, Female, tiv
----------------------------------------


| Variable   |      VIF |
|:-----------|---------:|
| WBV        | 12.9061  |
| medial     |  2.60879 |
| CP         |  1.83232 |



CP, WBV, posterior, age, Female, tiv
----------------------------------------


| Variable   |      VIF |
|:-----------|---------:|
| WBV        | 12.3559  |
| posterior  |  2.34263 |
| CP         |  1.81788 |



CP, WBV, ventral, age, Female, tiv
----------------------------------------


| Variable   |      VIF |
|:-----------|---------:|
| WBV        | 12.5056  |
| ventral    |  2.55451 |
| CP         |  1.79737 |



CP, WBV, anterior, age, Female, tiv
----------------------------------------


| Variable   |      VIF |
|:-----------|---------:|
| WBV        | 10.0753  |
| anterior   |  1.6721  |
| CP         |  1.78628 |



