## Set Up

### Imports

In [51]:
from warnings import simplefilter

import pandas as pd

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action="ignore", category=DeprecationWarning)
simplefilter(action="ignore", category=FutureWarning)

import re
import textwrap
from pathlib import Path
from pyprocessmacro import Process

import numpy as np
import pyperclip
import statsmodels.api as sm
from matplotlib import colormaps
from scipy import stats
from IPython.display import Markdown, display
from io import StringIO
import matplotlib.pyplot as plt
from collections import defaultdict

from reload_recursive import reload_recursive
import sys

sys.path.insert(0, "/home/srs-9/Projects/ms_mri/analysis/thalamus/helpers")

import helpers
import utils
import regression_utils as regutils
import my_namespace


In [106]:
# Re-load the project-local helper modules so that edits made to them on disk
# are picked up without restarting the kernel. `reload_recursive` comes from
# the local `reload_recursive` module imported above (presumably it also
# reloads each module's own submodules — confirm in that module).
reload_recursive(regutils)
reload_recursive(helpers)
reload_recursive(utils)
reload_recursive(my_namespace)

# Re-import after the reload so the names bound in this notebook point at the
# freshly loaded objects rather than the stale pre-reload ones.
import utils
from utils import load_df, zscore, get_colors, run_R_script
# NOTE(review): star import hides where names come from. Later cells use
# `presentation_cols`, which is not defined anywhere in the visible notebook —
# presumably it is provided here; verify against my_namespace.
from my_namespace import *

import regression_utils as regutils
from regression_utils import (
    quick_regression,
    quick_regression2,
    residualize_vars,
    run_regressions,
)

### Load Data

#### Clinical and Volumes

In [None]:
# Destination directory for the figures and tables produced by this notebook.
fig_path = Path(
    "/home/srs-9/Projects/ms_mri/analysis/thalamus/results/figures_tables/3-CP_CSF_analysis"
)

# Load the raw table (`data`) and the transformed table (`dataT`); both are
# indexed by the "subid" column.
data, dataT = (
    pd.read_csv(csv_file, index_col="subid")
    for csv_file in (
        "/home/srs-9/Projects/ms_mri/analysis/thalamus/results/data.csv",
        "/home/srs-9/Projects/ms_mri/analysis/thalamus/results/data_transformed.csv",
    )
)

# Boolean cohort masks, all indexed like `data`.
disease_binary = data["dz_type2"]
disease_subtype = data["dz_type5"]

MS_patients = disease_binary.eq("MS")
NONMS_patients = disease_binary.eq("!MS")
NIND_patients = disease_subtype.eq("NIND")
OIND_patients = disease_subtype.eq("OIND")
RMS_patients = disease_subtype.eq("RMS")
PMS_patients = disease_subtype.eq("PMS")

### Functions

In [None]:
def thalamus_influence_analysis(model_data, influenced_var, covariates):
    """Compare how strongly the whole thalamus and its nuclei groups correlate with a variable.

    Every thalamic measure ("THALAMUS_1", "posterior", "medial", "anterior",
    "ventral") and `influenced_var` is first residualized on `covariates` with
    `residualize_vars`; rows with a missing residual in any of them are
    dropped. Pearson correlations are then computed between each residualized
    thalamic measure and the residualized `influenced_var`, and pairs of those
    correlations are compared with `run_R_script`.

    Parameters
    ----------
    model_data
        Table passed unchanged to `residualize_vars`; it must provide the five
        thalamic columns, `influenced_var` and the covariates.
    influenced_var
        Name of the variable whose association with the thalamic measures is
        being compared.
    covariates
        Passed to `residualize_vars` as `independent_vars`.

    Returns
    -------
    pearson_results : dict
        Measure name -> result of `scipy.stats.pearsonr` against
        `influenced_var`, keyed in the order THALAMUS_1, posterior, medial,
        anterior, ventral.
    thalamus_comps : dict
        "THALAMUS_1 <sign> <struct>" -> value returned by `run_R_script`.
    nuclei_comps : dict
        "<struct1> <sign> <struct2>" -> value returned by `run_R_script`, for
        every unordered pair of nuclei groups.

    In both comparison dicts <sign> is "=" when the returned value is >= 0.05,
    otherwise ">" or "<" according to which correlation has the larger
    absolute value.
    """
    whole_thalamus = "THALAMUS_1"
    structs = ["posterior", "medial", "anterior", "ventral"]
    alpha = 0.05

    residualized_vars = pd.DataFrame(
        {
            var: residualize_vars(
                model_data,
                dependent_var=var,
                independent_vars=covariates,
            )
            for var in [
                "medial",
                "posterior",
                "anterior",
                "ventral",
                whole_thalamus,
                influenced_var,
            ]
        }
    ).dropna()

    nobs = residualized_vars[influenced_var].shape[0]

    target = residualized_vars[influenced_var]
    pearson_results = {
        name: stats.pearsonr(residualized_vars[name], target)
        for name in [whole_thalamus] + structs
    }

    def compare(first, second):
        """Return the labelled key and `run_R_script` value for one pair of measures."""
        r_first = pearson_results[first][0]
        r_second = pearson_results[second][0]
        # The two correlations share `influenced_var`, so the correlation
        # between the two measures themselves is passed along as well.
        r_between = stats.pearsonr(
            residualized_vars[first], residualized_vars[second]
        )[0]
        # NOTE(review): the returned value is treated as a p-value here;
        # confirm against utils.run_R_script.
        p_value = run_R_script(r_first, r_second, r_between, nobs)

        if p_value >= alpha:
            sign = "="
        elif abs(r_first) > abs(r_second):
            sign = ">"
        else:
            sign = "<"
        # Build the final key directly instead of regex-rewriting every key
        # already stored in the dict on each iteration.
        return f"{first} {sign} {second}", p_value

    thalamus_comps = dict(compare(whole_thalamus, struct) for struct in structs)

    nuclei_comps = dict(
        compare(struct1, struct2)
        for position, struct1 in enumerate(structs)
        for struct2 in structs[position + 1 :]
    )

    return pearson_results, thalamus_comps, nuclei_comps

## Analysis

### Does CP drive central atrophy?


#### Relationship between CP and CSF compartments

CP predicts expansion of the LV after controlling for whole-thalamus volume. It also predicts LV expansion more strongly than it predicts expansion of the other CSF compartments.

In [None]:
# Z-score the MS-only rows. `skip_vars` is forwarded to utils.zscore
# (presumably leaves "PRL" unscaled — confirm in utils).
model_data_z = zscore(data[MS_patients], skip_vars=["PRL"])

test_outcomes = ["LV_log", "asegCSF_log", "thirdV_log", "fourthV_log", "periCSF"]
predictors = ["CP", "t2lv_log", "THALAMUS_1", "brain"]
# NOTE(review): THALAMUS_1 appears both as a predictor and as a covariate;
# confirm run_regressions handles that combination as intended.
covariates = ["THALAMUS_1", "age", "Female", "tiv"]

_, results = regutils.run_regressions(
    model_data_z, test_outcomes, predictors, covariates=covariates
)

# NOTE(review): the surrounding text talks about CP, but the table displayed
# here is the one for "brain" — confirm which predictor is meant.
predictor_to_show = "brain"
formula = regutils.formula_string("outcome", predictor_to_show, covariates)
results_present = results[predictor_to_show].rename(
    index={
        "asegCSF_log": "aseg24_CSF_log",
        "thirdV_log": "aseg14_thirdV",
        "fourthV_log": "aseg15_fourthV",
    }
)

# Render the table once and reuse it for both the display and the text copy.
results_table_md = regutils.present_model(
    results_present, presentation_cols
).to_markdown()

display(Markdown(f"**Effect of {predictor_to_show} on outcome**"))
display(Markdown(f"`{formula}`"))
display(Markdown(results_table_md))


# Same content as a single Markdown string. The space before the first "\n"
# after the title is intentional and matches the original text.
test = (
    f"\n**Effect of {predictor_to_show} on outcome** \n"
    f"\n`{formula}`\n"
    f"\n{results_table_md}\n"
)

**Effect of brain on outcome**

`outcome ~ brain + THALAMUS_1 + age + Female + tiv`

| outcome        |       coef |      p_fdr |       pval |        se | ci               |       R2 |
|:---------------|-----------:|-----------:|-----------:|----------:|:-----------------|---------:|
| CCR_log        |  0.0252561 | 0.570263   | 0.570263   | 0.0444594 | [-0.0621, 0.113] | 0.454595 |
| periCSF_ratio  | -0.0389233 | 0.422959   | 0.422959   | 0.048532  | [-0.134, 0.0564] | 0.358228 |
| LV_log         | -0.0170503 | 0.679321   | 0.679321   | 0.041219  | [-0.0981, 0.064] | 0.533507 |
| aseg24_CSF_log | -0.0792637 | 0.15       | 0.1125     | 0.0498486 | [-0.177, 0.0187] | 0.317142 |
| aseg14_thirdV  | -0.0689426 | 0.13883    | 0.135878   | 0.0461484 | [-0.16, 0.0217]  | 0.452132 |
| aseg15_fourthV | -0.128682  | 0.0354882  | 0.0266162  | 0.0578542 | [-0.242, -0.015] | 0.155056 |
| periCSF        | -0.232674  | 7.0384e-10 | 1.7596e-10 | 0.0356414 | [-0.303, -0.163] | 0.620935 |

In [110]:
# Z-score the MS-only rows. `skip_vars` is forwarded to utils.zscore
# (presumably leaves "PRL" unscaled — confirm in utils).
model_data_z = zscore(data[MS_patients], skip_vars=["PRL"])

# Row labels to render in bold in the presented tables.
rename_index = {
    "brain": "**brain**",
    "THALAMUS_1": "**THALAMUS_1**"
}

# For every outcome, fit three nested models: brain only, thalamus only, both.
test_outcomes = ["LV_log", "thirdV_log", "fourthV_log", "periCSF"]
all_models = defaultdict(dict)
all_formulas = defaultdict(dict)
for outcome in test_outcomes:
    formula_list = {
        "brain": f"{outcome} ~ brain + age + Female + tiv",
        "thalamus": f"{outcome} ~ THALAMUS_1 + age + Female + tiv",
        "brain+thalamus": f"{outcome} ~ brain + THALAMUS_1 + age + Female + tiv"
    }

    all_models[outcome], all_formulas[outcome] = regutils.run_regressions3(
        model_data_z,
        formula_list.values(),
        model_names=formula_list.keys()
    )

# Present the fitted models for the two outcomes of interest. One loop replaces
# the two copy-pasted display blocks; output order is unchanged.
outcomes_to_show = ["periCSF", "LV_log"]
for outcome in outcomes_to_show:
    display(Markdown(f"### {outcome}"))
    models = all_models[outcome]
    formulas = all_formulas[outcome]

    # Models and formulas are paired by position, as in the original cell.
    for model_name, model, formula in zip(
        models.keys(), models.values(), formulas.values()
    ):
        display(Markdown(f"#### {model_name}"))
        display(Markdown(f"`{formula}`"))
        display(Markdown(regutils.present_model(model, presentation_cols,
                                                rename_index=rename_index).to_markdown()))

### periCSF

#### brain

`periCSF ~ brain + age + Female + tiv`

|           |        coef |        pval |        se | ci               |
|:----------|------------:|------------:|----------:|:-----------------|
| Intercept |  0.00306216 | 0.915839    | 0.0289609 | [-0.0538, 0.06]  |
| **brain** | -0.273502   | 1.82052e-15 | 0.0332007 | [-0.339, -0.208] |
| age       |  0.0802698  | 0.0166544   | 0.0334039 | [0.0146, 0.146]  |
| Female    |  0.0710142  | 0.0436303   | 0.0351003 | [0.00204, 0.14]  |
| tiv       |  0.788783   | 6.89986e-80 | 0.0338826 | [0.722, 0.855]   |

#### thalamus

`periCSF ~ THALAMUS_1 + age + Female + tiv`

|                |        coef |        pval |        se | ci                |
|:---------------|------------:|------------:|----------:|:------------------|
| Intercept      |  0.00247846 | 0.934244    | 0.0300233 | [-0.0565, 0.0615] |
| **THALAMUS_1** | -0.202418   | 2.85889e-06 | 0.04271   | [-0.286, -0.118]  |
| age            |  0.131049   | 9.32293e-05 | 0.0332423 | [0.0657, 0.196]   |
| Female         |  0.0533553  | 0.151616    | 0.0371497 | [-0.0196, 0.126]  |
| tiv            |  0.832862   | 5.90869e-67 | 0.0406026 | [0.753, 0.913]    |

#### brain+thalamus

`periCSF ~ brain + THALAMUS_1 + age + Female + tiv`

|                |       coef |        pval |        se | ci                |
|:---------------|-----------:|------------:|----------:|:------------------|
| Intercept      |  0.0030116 | 0.916647    | 0.0287598 | [-0.0535, 0.0595] |
| **brain**      | -0.232674  | 1.7596e-10  | 0.0356414 | [-0.303, -0.163]  |
| **THALAMUS_1** | -0.110973  | 0.0121683   | 0.0440863 | [-0.198, -0.0243] |
| age            |  0.0613856 | 0.0721309   | 0.0340572 | [-0.00554, 0.128] |
| Female         |  0.081406  | 0.0202014   | 0.0349282 | [0.0128, 0.15]    |
| tiv            |  0.83345   | 3.67808e-75 | 0.0374126 | [0.76, 0.907]     |

### LV_log

#### brain

`LV_log ~ brain + age + Female + tiv`

|           |       coef |        pval |        se | ci                |
|:----------|-----------:|------------:|----------:|:------------------|
| Intercept |  0.0026758 | 0.945472    | 0.0391022 | [-0.0742, 0.0795] |
| **brain** | -0.2473    | 8.1804e-08  | 0.0453703 | [-0.336, -0.158]  |
| age       |  0.323868  | 2.90867e-12 | 0.0451421 | [0.235, 0.413]    |
| Female    | -0.0268361 | 0.53116     | 0.0428208 | [-0.111, 0.0573]  |
| tiv       |  0.257196  | 8.92385e-08 | 0.0473291 | [0.164, 0.35]     |

#### thalamus

`LV_log ~ THALAMUS_1 + age + Female + tiv`

|                |        coef |        pval |        se | ci                |
|:---------------|------------:|------------:|----------:|:------------------|
| Intercept      |  0.00235159 | 0.941026    | 0.0317697 | [-0.0601, 0.0648] |
| **THALAMUS_1** | -0.632521   | 7.20929e-60 | 0.0333041 | [-0.698, -0.567]  |
| age            |  0.222478   | 1.8665e-10  | 0.0341319 | [0.155, 0.29]     |
| Female         |  0.0297116  | 0.433491    | 0.0379015 | [-0.0448, 0.104]  |
| tiv            |  0.509043   | 2.82044e-28 | 0.0431338 | [0.424, 0.594]    |

#### brain+thalamus

`LV_log ~ brain + THALAMUS_1 + age + Female + tiv`

|                |        coef |        pval |        se | ci               |
|:---------------|------------:|------------:|----------:|:-----------------|
| Intercept      |  0.00239066 | 0.940186    | 0.0318426 | [-0.0602, 0.065] |
| **brain**      | -0.0170503  | 0.679321    | 0.041219  | [-0.0981, 0.064] |
| **THALAMUS_1** | -0.625819   | 4.07181e-48 | 0.0380734 | [-0.701, -0.551] |
| age            |  0.217373   | 2.07076e-09 | 0.0355524 | [0.148, 0.287]   |
| Female         |  0.0317672  | 0.409727    | 0.0384998 | [-0.0439, 0.107] |
| tiv            |  0.509086   | 3.38256e-28 | 0.0432039 | [0.424, 0.594]   |

#### CP drives preferential central expansion

With the degree of preferential central expansion defined as the central CSF ratio (CCR):

$$CCR = LV / periCSF$$

**CP is associated with increasing CCR, even more than what is predicted by brain or thalamus volumes**

In [11]:
# Z-score the MS-only rows. `skip_vars` is forwarded to utils.zscore
# (presumably leaves "PRL" unscaled — confirm in utils).
model_data_z = zscore(data[MS_patients], skip_vars=["PRL"])

test_outcomes = ["CCR_log"]
predictors = ["CP"]
predictor_to_show = "CP"

# Run the same CP -> CCR_log regression twice, once controlling for thalamus
# volume and once for brain volume. One loop replaces the two copy-pasted
# blocks; `covariates`, `results` and `formula` end up holding the values
# from the second (brain) run, as before.
for covariates in (
    ["THALAMUS_1", "age", "Female", "tiv"],
    ["brain", "age", "Female", "tiv"],
):
    _, results = regutils.run_regressions(
        model_data_z, test_outcomes, predictors, covariates=covariates
    )

    formula = regutils.formula_string("outcome", predictor_to_show, covariates)
    display(Markdown(f"`{formula}`"))
    display(
        Markdown(
            regutils.present_model(
                results[predictor_to_show], presentation_cols
            ).to_markdown()
        )
    )


`outcome ~ CP + THALAMUS_1 + age + Female + tiv`

| outcome   |     coef |       p_fdr |        pval |        se | ci             |       R2 |
|:----------|---------:|------------:|------------:|----------:|:---------------|---------:|
| CCR_log   | 0.430836 | 9.30737e-25 | 9.30737e-25 | 0.0395383 | [0.353, 0.509] | 0.572901 |

`outcome ~ CP + brain + age + Female + tiv`

| outcome   |     coef |       p_fdr |        pval |        se | ci             |       R2 |
|:----------|---------:|------------:|------------:|----------:|:---------------|---------:|
| CCR_log   | 0.598909 | 1.11121e-42 | 1.11121e-42 | 0.0393473 | [0.522, 0.676] | 0.476232 |

Alternative presentation for the above

In [40]:
# Z-score the MS-only rows. `skip_vars` is forwarded to utils.zscore
# (presumably leaves "PRL" unscaled — confirm in utils).
model_data_z = zscore(data[MS_patients], skip_vars=["PRL"])

outcome = "CCR_log"

# Model label -> predictors of interest; `covariates` is passed alongside for
# every model (the displayed formulas show them appended to each one).
exog_list = {
    "CP alone": ["CP"],
    "Control for brain": ["CP", "brain"],
    "Control for thalamus": ["CP", "THALAMUS_1"],
    "Just brain": ["brain"],
    "Just thalamus": ["THALAMUS_1"],
}
covariates = ["age", "Female", "tiv"]

models, formulas = regutils.run_regressions2(
    model_data_z,
    outcome,
    exog_list.values(),
    model_names=exog_list.keys(),
    covariates=covariates,
)

# Models and formulas are paired by position.
for (model_name, model), formula in zip(models.items(), formulas.values()):
    display(Markdown(f"#### {model_name}"))
    display(Markdown(f"`{formula}`"))
    display(Markdown(regutils.present_model(model, presentation_cols).to_markdown()))

#### CP alone

`CCR_log ~ CP + age + Female + tiv`

|           |       coef |        pval |        se | ci               |
|:----------|-----------:|------------:|----------:|:-----------------|
| Intercept | 0.00130128 | 0.969231    | 0.0337174 | [-0.065, 0.0676] |
| CP        | 0.604836   | 5.48713e-51 | 0.0354435 | [0.535, 0.674]   |
| age       | 0.184526   | 5.11559e-06 | 0.0399898 | [0.106, 0.263]   |
| Female    | 0.0478492  | 0.210942    | 0.0381964 | [-0.0272, 0.123] |
| tiv       | 0.0550989  | 0.19337     | 0.0423001 | [-0.028, 0.138]  |

#### Control for brain

`CCR_log ~ CP + brain + age + Female + tiv`

|           |        coef |        pval |        se | ci                |
|:----------|------------:|------------:|----------:|:------------------|
| Intercept |  0.00135482 | 0.968039    | 0.0337943 | [-0.0651, 0.0678] |
| CP        |  0.598909   | 1.11121e-42 | 0.0393473 | [0.522, 0.676]    |
| brain     | -0.0201276  | 0.628742    | 0.0416012 | [-0.102, 0.0616]  |
| age       |  0.178081   | 1.2746e-05  | 0.0403598 | [0.0988, 0.257]   |
| Female    |  0.0502957  | 0.18169     | 0.0376018 | [-0.0236, 0.124]  |
| tiv       |  0.0591259  | 0.16024     | 0.0420367 | [-0.0235, 0.142]  |

#### Control for thalamus

`CCR_log ~ CP + THALAMUS_1 + age + Female + tiv`

|            |        coef |        pval |        se | ci                |
|:-----------|------------:|------------:|----------:|:------------------|
| Intercept  |  0.00160049 | 0.958184    | 0.0305078 | [-0.0584, 0.0616] |
| CP         |  0.430836   | 9.30737e-25 | 0.0395383 | [0.353, 0.509]    |
| THALAMUS_1 | -0.411817   | 2.23122e-19 | 0.0437306 | [-0.498, -0.326]  |
| age        |  0.113381   | 0.00123478  | 0.0348771 | [0.0448, 0.182]   |
| Female     |  0.0774055  | 0.0324117   | 0.0360733 | [0.00652, 0.148]  |
| tiv        |  0.256524   | 5.68249e-09 | 0.0432001 | [0.172, 0.341]    |

#### Just brain

`CCR_log ~ brain + age + Female + tiv`

|           |       coef |        pval |        se | ci                |
|:----------|-----------:|------------:|----------:|:------------------|
| Intercept |  0.0021989 | 0.957475    | 0.0412152 | [-0.0788, 0.0832] |
| brain     | -0.204844  | 1.91572e-05 | 0.0474232 | [-0.298, -0.112]  |
| age       |  0.318629  | 6.22008e-11 | 0.0475827 | [0.225, 0.412]    |
| Female    | -0.0426659 | 0.341084    | 0.0447697 | [-0.131, 0.0453]  |
| tiv       |  0.118626  | 0.0159512   | 0.0490405 | [0.0223, 0.215]   |

#### Just thalamus

`CCR_log ~ THALAMUS_1 + age + Female + tiv`

|            |        coef |        pval |        se | ci                |
|:-----------|------------:|------------:|----------:|:------------------|
| Intercept  |  0.00197182 | 0.954257    | 0.0343561 | [-0.0655, 0.0695] |
| THALAMUS_1 | -0.615487   | 1.68747e-49 | 0.0367721 | [-0.688, -0.543]  |
| age        |  0.204641   | 3.37956e-08 | 0.0364391 | [0.133, 0.276]    |
| Female     |  0.0189442  | 0.638123    | 0.0402523 | [-0.0602, 0.098]  |
| tiv        |  0.370416   | 3.86925e-15 | 0.0455399 | [0.281, 0.46]     |

##### CP fully mediates the relationship between brain and CCR.

In [14]:
# Mediation analysis with pyprocessmacro: x -> m -> y with CP as the single
# mediator, and age, sex and TIV entered as controls in all equations.
# NOTE(review): model=4 is presumably the simple-mediation template of Hayes'
# PROCESS numbering — confirm in the pyprocessmacro docs.
# NOTE(review): the heading above says "brain" and "CCR", but this call uses
# x="grey" and y="CCF" — confirm which variables are intended.
# NOTE(review): `model_data_z` is not defined in this cell; it is whatever an
# earlier cell last assigned, so the result depends on execution order.
p2 = Process(
    data=model_data_z,
    model=4,
    x="grey",
    y="CCF",
    m=["CP"],
    controls=["age", "Female", "tiv"],
    controls_in="all",
    suppr_init=True,
)

# Parse the fitted Process object into outcome-model and mediation results.
outcome, mediation = utils.read_pyprocess_output(p2)

The degree of preferential central expansion was modeled as the ratio between LV and peripheral CSF volume:

- $CCR = LV / periCSF$

**MS is associated with an increase in CCR, after controlling for age, sex and TIV**

In [15]:
# Keep MS and non-MS subjects from the transformed table. The masks were built
# from `data`, so this relies on `dataT` sharing the same subject index.
ms_or_nonms = MS_patients | NONMS_patients
model_data = dataT.copy()[ms_or_nonms]

# Z-score the table, then append one 0/1 indicator column per dz_type2 level;
# the "MS" indicator is the outcome modelled below.
model_data_z = zscore(model_data)
disease_dummies = pd.get_dummies(model_data["dz_type2"], dtype=int)
model_data_z = model_data_z.join([disease_dummies])

outcome = "MS"  # 0/1 indicator column added by get_dummies above
covariates = ["age", "Female", "tiv"]

predictors = ["CCR"]

# Logistic regression of MS status on CCR plus covariates.
results, _ = run_regressions(
    model_data_z, outcome, predictors, covariates, regression_model=sm.Logit
)
results = results[outcome]

display(Markdown(f"`{outcome} ~ {' + '.join(['CCR'] + covariates)}`"))

# Predictors ordered by absolute coefficient size (not used in the display below).
display_order = results["coef"].apply(np.abs).sort_values(ascending=False).index

# Show every result column except the combined "ci" column.
is_ci_column = results.columns.isin(["ci"])
display_cols = results.columns[~is_ci_column]
display(results.loc["CCR", display_cols])

Optimization terminated successfully.
         Current function value: 0.434251
         Iterations 6


`MS ~ CCR + age + Female + tiv`

coef                             0.306418
pval                             0.020511
p_fdr                            0.020511
se                               0.132255
llci                             0.047202
ulci                             0.565634
R2                                   None
formula     MS ~ CCR + age + Female + tiv
coef_sig                         0.306418
Name: CCR, dtype: object

---

ThirdV width is more strongly related to CCF

In [None]:
# Regress CCF on third-ventricle width and volume together with CP and the
# standard covariates, then print the full statsmodels summary.
# `model_data_z` is whatever the preceding cell left in the namespace.
width_model_terms = ["thirdV_width", "thirdV", "CP", "age", "Female", "tiv"]
res, formula = quick_regression2(model_data_z, "CCF", width_model_terms)
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:                    CCF   R-squared:                       0.380
Model:                            OLS   Adj. R-squared:                  0.372
Method:                 Least Squares   F-statistic:                     47.06
Date:                Sun, 02 Nov 2025   Prob (F-statistic):           6.13e-45
Time:                        14:25:27   Log-Likelihood:                -550.89
No. Observations:                 467   AIC:                             1116.
Df Residuals:                     460   BIC:                             1145.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -0.0015      0.037     -0.040   