In [2]:
import pandas as pd
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
from pathlib import Path
import json
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from datetime import datetime
import re
from scipy import stats
import numpy as np
import statsmodels.api as sm
import statsmodels
from matplotlib import colormaps
from tqdm.notebook import tqdm
import helpers
from collections import defaultdict
from tqdm.notebook import tqdm


from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.regression.linear_model import OLS

from mri_data import file_manager as fm

## Load Data

In [3]:
drive_root = fm.get_drive_root()
dataroot = drive_root / "3Tpioneer_bids"
data_dir = Path("/home/srs-9/Projects/ms_mri/data")
fig_path = Path("/home/srs-9/Projects/ms_mri/analysis/thalamus/figures_tables/edss_regressions")

choroid_volumes = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/data/choroid_aschoplex_volumes.csv", index_col="subid"
).dropna()
tiv = pd.read_csv("/home/srs-9/Projects/ms_mri/data/tiv_data.csv", index_col="subid")

df = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/data/clinical_data_processed.csv", index_col="subid"
)
df = df.join([choroid_volumes, tiv])
df['thalamus_sqrt'] = np.sqrt(df['thalamus'])
df['thalamus_curt'] = np.sqrt(df['thalamus']**3)
df_z = df.copy()
numeric_cols = df.select_dtypes(include='number').columns
df_z[numeric_cols] = df_z[numeric_cols].apply(stats.zscore, nan_policy="omit")

df_ms = df[df['dz_type2'] == "MS"]
df_ms_z = df_ms.copy()
df_ms_z[numeric_cols] = df_ms_z[numeric_cols].apply(stats.zscore, nan_policy="omit")


viridis = colormaps['viridis'].resampled(20)

colors = helpers.get_colors()

In [4]:
df_thomas = pd.read_csv(data_dir / "hipsthomas_vols.csv", index_col="subid")
cols_orig = df_thomas.columns
new_colnames = {}
for col in df_thomas.columns:
    new_col = re.sub(r"(\d+)-([\w-]+)", r"\2_\1", col)
    new_col = re.sub("-", "_", new_col)
    new_colnames[col] = new_col
df_thomas = df_thomas.rename(columns=new_colnames)
df_thomas_norm = df_thomas.apply(lambda col: col / df_thomas['THALAMUS_1'])
df_thomas_z = df_thomas.apply(stats.zscore, nan_policy="omit")
df_thomas_norm_z = df_thomas_norm.apply(stats.zscore, nan_policy="omit")

df_thomas_left = pd.read_csv(data_dir / "hipsthomas_left_vols.csv", index_col="subid")
df_thomas_left = df_thomas_left.rename(columns=new_colnames)
df_thomas_left_z = df_thomas_left.apply(stats.zscore, nan_policy="omit")

df_thomas_right = pd.read_csv(data_dir / "hipsthomas_right_vols.csv", index_col="subid")
df_thomas_right = df_thomas_right.rename(columns=new_colnames)
df_thomas_right_z = df_thomas_right.apply(stats.zscore, nan_policy="omit")


thalamic_nuclei = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12]
thalamic_nuclei_str = [str(i) for i in thalamic_nuclei]

hips_thomas_ref = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/data/hipsthomas_struct_index.csv", index_col="index"
)['struct']
# hips_thomas_ref.rename(columns={"struct": "struct_name"}, inplace=True)

choroid_dists = pd.read_csv(data_dir / "centroid-choroid_SDT.csv", index_col="subid")
ventricle_dists = pd.read_csv(
    data_dir / "centroid-ventricle_SDT.csv", index_col="subid"
)

mni_choroid_dists = pd.read_csv("/home/srs-9/Projects/ms_mri/data/mni-centroid-choroid_SDT2.csv")

def combine_nuclei(df):
    df2 = pd.DataFrame()
    df2['anterior'] = df['AV_2']
    df2['ventral'] = df['VA_4'] + df['VLa_5'] + df['VLP_6'] + df['VPL_7']
    df2['intralaminar'] = df['CM_11'] 
    df2['medial'] = df['MD_Pf_12']
    df2['posterior'] = df['Pul_8'] + df['LGN_9'] + df['MGN_10']
    df2['THALAMUS_1'] = df['THALAMUS_1']
    return df2

df_thomas2 = combine_nuclei(df_thomas)
df_thomas2_z = df_thomas2.apply(stats.zscore, nan_policy="omit")

## Functions

In [5]:
def compute_se_diff(se1, n1, se2, n2):
    return np.sqrt((se1**2/n1) + (se2**2/n2))

## Regressions

### Main MRI Features

#### MS Patients

In [16]:
model_data = df_ms_z.copy()
covariates = "age + Female + tiv"

outcome = "EDSS_sqrt"
predictors = ["brain", "white", "grey", "cortical_thickness", "thalamus", "t2lv", "PRL", "choroid_volume"]

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for x in predictors:
    formula = f"{outcome} ~ {x} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[x] = res.pvalues[x]
    coefs[x] = res.params[x]
    stderrs[x] = res.HC0_se[x]
    ci = res.conf_int()
    llci[x] = ci.loc[x, 0]
    ulci[x] = ci.loc[x, 1]
    ci_str[x] = f"[{llci[x]:.6f}, {ulci[x]:.6f}]"


regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results.index.name = "structure"

regression_results.to_excel(fig_path / "EDSS_and_main_mri_features_MS.xlsx")

In [21]:
formula = "EDSS_sqrt ~ age + Female + tiv"
res = sm.OLS.from_formula(formula, data=model_data).fit()

#### NIND Group

In [22]:
model_data = df_z.copy()
model_data = model_data[model_data['dz_type3'].isin(["NIND"])]
covariates = "age + Female + tiv"

outcome = "EDSS_sqrt"
predictors = ["brain", "white", "grey", "cortical_thickness", "thalamus", "t2lv", "PRL", "choroid_volume"]

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for x in predictors:
    formula = f"{outcome} ~ {x} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[x] = res.pvalues[x]
    coefs[x] = res.params[x]
    stderrs[x] = res.HC0_se[x]
    ci = res.conf_int()
    llci[x] = ci.loc[x, 0]
    ulci[x] = ci.loc[x, 1]
    ci_str[x] = f"[{llci[x]:.6f}, {ulci[x]:.6f}]"


regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results.index.name = "structure"

regression_results.to_excel(fig_path / "EDSS_and_main_mri_features_NIND.xlsx")
regression_results

Unnamed: 0_level_0,coef,stderr,llci,ulci,pvals,ci,p_fdr
structure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
brain,-0.179508,0.257239,-0.781819,0.422803,0.539081,"[-0.781819, 0.422803]",0.887287
white,0.088013,0.235531,-0.452799,0.628825,0.736379,"[-0.452799, 0.628825]",0.949229
grey,-0.381446,0.260129,-0.989191,0.226299,0.203827,"[-0.989191, 0.226299]",0.543538
cortical_thickness,-0.622947,0.16734,-1.216941,-0.028953,0.040837,"[-1.216941, -0.028953]",0.326694
thalamus,0.033765,0.462934,-1.064874,1.132404,0.949229,"[-1.064874, 1.132404]",0.949229
t2lv,0.034695,0.158398,-0.406233,0.475623,0.870541,"[-0.406233, 0.475623]",0.949229
PRL,2.598156,0.566916,-0.892707,6.089019,0.135308,"[-0.892707, 6.089019]",0.541231
choroid_volume,0.146956,0.212451,-0.367336,0.661249,0.554554,"[-0.367336, 0.661249]",0.887287


In [23]:
formula = "EDSS_sqrt ~ age + Female + tiv"
res = sm.OLS.from_formula(formula, data=model_data).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              EDSS_sqrt   R-squared:                       0.137
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.004
Date:                Mon, 09 Jun 2025   Prob (F-statistic):              0.413
Time:                        12:52:42   Log-Likelihood:                -28.169
No. Observations:                  23   AIC:                             64.34
Df Residuals:                      19   BIC:                             68.88
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1670      0.237     -0.706      0.4

### Compare Thalamus Associations for PMS vs RMS

Thalamus does not predict EDSS any better in PMS patients compared to RMS patients.

There are other methods to try in the GPT convo (not because I think this result will change, but to practice them here so I can apply them elsewhere if desired)

- Chow / SUest
- Compare partial correlations with Fisher's z

In [26]:
formula = "EDSS_sqrt ~ THALAMUS_1*PMS + age*PMS + Female + tiv"

model_data = df_z.join([df_thomas_z])
model_data = model_data[model_data.dz_type5.isin(["PMS", "RMS"])]
model_data = pd.concat([model_data, pd.get_dummies(model_data.dz_type5, dtype="int")], axis=1)
model_data.loc[model_data['EDSS_sqrt'].isna(), 'EDSS_sqrt'] = model_data['EDSS_sqrt'].median()

res = sm.OLS.from_formula(formula, data=model_data).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              EDSS_sqrt   R-squared:                       0.394
Model:                            OLS   Adj. R-squared:                  0.385
Method:                 Least Squares   F-statistic:                     42.80
Date:                Mon, 09 Jun 2025   Prob (F-statistic):           1.95e-46
Time:                        13:40:01   Log-Likelihood:                -544.05
No. Observations:                 468   AIC:                             1104.
Df Residuals:                     460   BIC:                             1137.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -0.2137      0.042     -5.

Something is wrong here. Not significant if I compute standard deviation from the stderrs and plug into a t test calculator.

Using a t-test here isn't the best course anyways (according to GPT, check convo "Comparing regression betas"), but still curious what went wrong

In [50]:
formula = "EDSS_sqrt ~ THALAMUS_1 + age + Female + tiv"

model_data = df_z.join([df_thomas_z])

model_data_RMS = model_data.copy()
model_data_RMS = model_data_RMS[model_data_RMS['dz_type5'].isin(["RMS"])]
res_RMS = sm.OLS.from_formula(formula, data=model_data_RMS).fit()


model_data_PMS = model_data.copy()
model_data_PMS = model_data_PMS[model_data_PMS['dz_type5'].isin(["PMS"])]
res_PMS = sm.OLS.from_formula(formula, data=model_data_PMS).fit()

param1 = res_RMS.params['THALAMUS_1']
param2 = res_PMS.params['THALAMUS_1']

ci1 = res_RMS.conf_int()
ci2 = res_PMS.conf_int()

se1 = res_RMS.HC0_se['THALAMUS_1']
dof1 = res_RMS.df_resid
se2 = res_PMS.HC0_se['THALAMUS_1']
dof2 = res_PMS.df_resid

se_diff = compute_se_diff(se1, dof1, se2, dof2)
mean_diff = param1 - param2
t_stat = mean_diff / se_diff

dof = dof1 + dof2 - 2
p_val = stats.t.sf(t_stat, dof)

print("RMS:")
print(f"{param1:.2} ± {se1:.2} [{ci1.loc['THALAMUS_1', 0]:.2}, {ci1.loc['THALAMUS_1', 1]:.2}]")
print("\n")
print("PMS:")
print(f"{param2:.2} ± {se2:.2} [{ci2.loc['THALAMUS_1', 0]:.2}, {ci2.loc['THALAMUS_1', 1]:.2}]")
print("\n")

print(f"p = {p_val:.2}")

RMS:
-0.19 ± 0.058 [-0.3, -0.089]


PMS:
-0.22 ± 0.072 [-0.38, -0.052]


p = 0.0027


### HIPS-THOMAS

In [14]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
covariates = "age + Female + tiv"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

outcome = "EDSS_sqrt"

structs = thalamic_nuclei
all_results = {}
for x in structs:
    col = hips_thomas_ref[x]
    formula = f"{outcome} ~ {col} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    all_results[col] = res
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"



regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results.sort_values(by="coef", inplace=True)
regression_results.to_excel(fig_path / "thalamic_nuclei.xlsx")
regression_results

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr
VLP_6,-0.275143,0.049881,-0.371536,-0.17875,3.554299e-08,"[-0.371536, -0.178750]",8.885747e-08
Pul_8,-0.264485,0.043973,-0.352835,-0.176135,7.884106e-09,"[-0.352835, -0.176135]",6.904379e-08
VPL_7,-0.263361,0.045893,-0.35549,-0.171231,3.399101e-08,"[-0.355490, -0.171231]",8.885747e-08
AV_2,-0.252448,0.050028,-0.345964,-0.158932,1.772023e-07,"[-0.345964, -0.158932]",3.439696e-07
LGN_9,-0.248675,0.040212,-0.333189,-0.164161,1.380876e-08,"[-0.333189, -0.164161]",6.904379e-08
MD_Pf_12,-0.238181,0.043914,-0.326908,-0.149455,2.063818e-07,"[-0.326908, -0.149455]",3.439696e-07
MGN_10,-0.232713,0.045302,-0.328204,-0.137223,2.277099e-06,"[-0.328204, -0.137223]",3.252998e-06
VLa_5,-0.20148,0.052686,-0.305925,-0.097036,0.0001704892,"[-0.305925, -0.097036]",0.0002131114
CM_11,-0.164141,0.050985,-0.260973,-0.067309,0.0009357536,"[-0.260973, -0.067309]",0.001039726
VA_4,-0.14933,0.053562,-0.255283,-0.043377,0.005841277,"[-0.255283, -0.043377]",0.005841277


Correlate each of the thalamic nuclei to EDSS, including whole thalalmic volume as a covariate. None of the p values are significant, but we can still compare coefficients to see each nucleus's relative contribution to EDSS. Will bootstrap these regressions in the following cell to see if the rank is significant.  

In [33]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
covariates = "age + Female + tiv + THALAMUS_1"

pvals = {}
coefs = {}
thal_coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

outcome = "EDSS_sqrt"

structs = thalamic_nuclei
all_results = {}
for x in structs:
    col = hips_thomas_ref[x]
    formula = f"{outcome} ~ {col} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    thal_coefs[col] = res.params['THALAMUS_1']
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"



regression_results = pd.DataFrame({"coef": coefs, "thal_coef": thal_coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
main_coefs = np.array([coefs[hips_thomas_ref[x]] for x in structs])
regression_results.sort_values(by="coef", inplace=True)
regression_results.to_excel(fig_path / "thalamic_nuclei-thalamus_control.xlsx")
regression_results

Unnamed: 0,coef,thal_coef,stderr,llci,ulci,pvals,ci,p_fdr
LGN_9,-0.121349,-0.199314,0.054269,-0.240209,-0.00249,0.04541,"[-0.240209, -0.002490]",0.454099
AV_2,-0.104113,-0.225002,0.067506,-0.227771,0.019545,0.098696,"[-0.227771, 0.019545]",0.49348
VPL_7,-0.08307,-0.225802,0.077184,-0.240838,0.074698,0.301327,"[-0.240838, 0.074698]",0.602653
VLP_6,-0.065827,-0.240322,0.094286,-0.247363,0.115708,0.47644,"[-0.247363, 0.115708]",0.680629
Pul_8,-0.041364,-0.253989,0.127254,-0.284158,0.20143,0.737921,"[-0.284158, 0.201430]",0.819912
VLa_5,-0.026937,-0.280806,0.062542,-0.150953,0.097079,0.669675,"[-0.150953, 0.097079]",0.819912
MGN_10,-0.014345,-0.283989,0.072031,-0.160103,0.131412,0.846715,"[-0.160103, 0.131412]",0.846715
CM_11,0.072634,-0.344097,0.069361,-0.056085,0.201354,0.268034,"[-0.056085, 0.201354]",0.602653
MD_Pf_12,0.092401,-0.384728,0.105255,-0.124039,0.308841,0.401916,"[-0.124039, 0.308841]",0.66986
VA_4,0.09536,-0.350109,0.064383,-0.037399,0.228119,0.158748,"[-0.037399, 0.228119]",0.529159


Bootstrap the regressions

In [34]:
data_to_sample = df_z.join(df_thomas_z)
data_to_sample = data_to_sample[data_to_sample['dz_type5'].isin(["RMS", "PMS"])].reset_index()
data_to_sample.loc[data_to_sample['EDSS_sqrt'].isna(), 'EDSS_sqrt'] = model_data['EDSS_sqrt'].median()


def get_zeros():
    return np.zeros((1000,))

coefs_boot = defaultdict(get_zeros)

outcome = "EDSS_sqrt"

for i in tqdm(range(1000)):
    inds = np.random.randint(0, data_to_sample.index.max(), len(data_to_sample))
    model_data = data_to_sample.loc[inds, :]
    for x in structs:
        col = hips_thomas_ref[x]
        formula = f"{outcome} ~ {col} + {covariates}"
        res = sm.OLS.from_formula(formula, data=model_data).fit()
        coefs_boot[col][i] = res.params[col]

  0%|          | 0/1000 [00:00<?, ?it/s]

Get the mean and 95% CI of the bootstrapped coefficients for each nucleus

In [35]:
coefs_copy = coefs_boot.copy()
llci = {}
ulci = {}
means = {}
for x in structs:
    col = hips_thomas_ref[x]
    coefs_copy[col] = np.sort(coefs_copy[col])
    llci[col] = coefs_copy[col][25]
    ulci[col] = coefs_copy[col][975]
    means[col] = np.mean(coefs_copy[col])
    print(f"{means[col]:.4f} [{llci[col]:2.2f}, {ulci[col]:2.2f}]")

coefs_bootmean = np.array([means[hips_thomas_ref[x]] for x in structs])
np.corrcoef(main_coefs, coefs_bootmean)[0,1]

-0.1093 [-0.24, 0.03]
0.1013 [-0.02, 0.23]
-0.0279 [-0.15, 0.09]
-0.0701 [-0.25, 0.11]
-0.0736 [-0.22, 0.09]
-0.0417 [-0.31, 0.21]
-0.1245 [-0.23, -0.03]
-0.0198 [-0.16, 0.12]
0.0702 [-0.07, 0.21]
0.0894 [-0.12, 0.29]


0.9981998229200125

Correlate the coefficients of the nuclei for each boostrapped sample to the main regression to see if the relative strengths of the nuclei's coefficients are consistent across resamples

In [44]:
coefs_boot_arr = np.zeros((len(structs), 1000))
for i, x in enumerate(structs):
    col = hips_thomas_ref[x]
    coefs_boot_arr[i, :] = coefs_boot[col]

corr_coefs = np.zeros((1000,))
kendal_taus = np.zeros((1000,))
for i in range(1000):
    corr_coefs[i] = np.corrcoef(coefs_boot_arr[:,i], main_coefs)[0,1]
    kendal_taus[i] = stats.kendalltau(coefs_boot_arr[:,i], main_coefs)[0]
mean_corrcoef = np.mean(corr_coefs)
mean_tau = np.mean(kendal_taus)
corr_coefs = np.sort(corr_coefs)
kendal_taus = np.sort(kendal_taus)
print(f"{mean_corrcoef:.2f} [{corr_coefs[25]:.2f}, {corr_coefs[975]:.2f}]")
print(f"{mean_tau:.2f} [{kendal_taus[25]:.2f}, {kendal_taus[975]:.2f}]")

0.71 [0.29, 0.94]
0.53 [0.20, 0.82]


In [42]:
win_counts = np.zeros((coefs_boot_arr.shape[0],))
for i in range(1000):
    win_counts[coefs_boot_arr[:,i].argmin()] += 1

win_count_df = pd.Series(win_counts/1000, index=hips_thomas_ref[thalamic_nuclei])
win_count_df.sort_values(ascending=False)

struct
LGN_9       0.234
AV_2        0.201
VLP_6       0.183
Pul_8       0.176
VPL_7       0.136
MGN_10      0.029
VLa_5       0.020
MD_Pf_12    0.017
CM_11       0.004
VA_4        0.000
dtype: float64

In [16]:
model_data = df_z.join([df_thomas2_z])
model_data = model_data[model_data.dz_type2 == "MS"]

covariates = "age + Female + tiv + THALAMUS_1"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for col in df_thomas2:
    formula = f"EDSS_sqrt ~ {col} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"

regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)

regression_results.sort_values(by="coef")

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr
THALAMUS_1,-0.294878,0.045745,-0.38822,-0.201536,1.220738e-09,"[-0.388220, -0.201536]",7.324429e-09
anterior,-0.104113,0.067506,-0.227771,0.019545,0.09869602,"[-0.227771, 0.019545]",0.2960881
posterior,-0.088322,0.132458,-0.34054,0.163896,0.4916786,"[-0.340540, 0.163896]",0.5900143
ventral,-0.050588,0.10621,-0.263906,0.16273,0.641394,"[-0.263906, 0.162730]",0.641394
intralaminar,0.072634,0.069361,-0.056085,0.201354,0.2680345,"[-0.056085, 0.201354]",0.536069
medial,0.092401,0.105255,-0.124039,0.308841,0.4019157,"[-0.124039, 0.308841]",0.5900143


In [13]:
model_data = df_z.join([df_thomas2_z])
model_data = model_data[model_data.dz_type2 == "MS"]

covariates = "age + Female + tiv + THALAMUS_1"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for col in df_thomas2:
    formula = f"t2lv_logtrans ~ {col} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"

regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)

regression_results.sort_values(by="coef")

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr
THALAMUS_1,-0.576682,0.045397,-0.66209,-0.491274,2.7971999999999997e-34,"[-0.662090, -0.491274]",1.67832e-33
posterior,-0.467866,0.121892,-0.695641,-0.24009,6.350638e-05,"[-0.695641, -0.240090]",0.0001270128
medial,-0.150188,0.093699,-0.346483,0.046106,0.1333832,"[-0.346483, 0.046106]",0.1333832
anterior,-0.125549,0.057022,-0.238593,-0.012505,0.0295753,"[-0.238593, -0.012505]",0.04436295
intralaminar,0.103037,0.06585,-0.014826,0.2209,0.08648215,"[-0.014826, 0.220900]",0.1037786
ventral,0.41863,0.111829,0.227235,0.610026,2.101338e-05,"[0.227235, 0.610026]",6.304015e-05


In [15]:
model_data = df_z.join([df_thomas2_z])
model_data = model_data[model_data.dz_type2 == "MS"]

covariates = "age + Female + tiv + THALAMUS_1"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for col in df_thomas2:
    formula = f"choroid_volume ~ {col} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"

regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)

regression_results.sort_values(by="coef")

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr
THALAMUS_1,-0.471241,0.047756,-0.557449,-0.385033,3.496056e-24,"[-0.557449, -0.385033]",2.0976330000000003e-23
medial,-0.326169,0.103968,-0.522535,-0.129804,0.0011796,"[-0.522535, -0.129804]",0.002359199
posterior,-0.295484,0.118422,-0.527846,-0.063121,0.01280286,"[-0.527846, -0.063121]",0.01536344
intralaminar,-0.174645,0.065383,-0.292919,-0.056371,0.00388849,"[-0.292919, -0.056371]",0.005832734
anterior,0.055964,0.064762,-0.058611,0.170539,0.3376318,"[-0.058611, 0.170539]",0.3376318
ventral,0.597348,0.098615,0.408056,0.786639,1.243761e-09,"[0.408056, 0.786639]",3.731284e-09


### Deep Grey Structures

In [136]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
covariates = "age + Female + tiv"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}

outcome = "EDSS_sqrt"
structs = hips_thomas_ref.index[~hips_thomas_ref.index.isin(thalamic_nuclei)]

all_results = {}
for x in structs:
    col = hips_thomas_ref[x]
    formula = f"{outcome} ~ {col} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]


regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results.sort_values(by="coef", inplace=True)
regression_results.to_csv(fig_path / "deep_grey_regressions.csv")
regression_results

Unnamed: 0,coef,stderr,llci,ulci,pvals,p_fdr
THALAMUS_1,-0.294878,0.045745,-0.38822,-0.201536,1.220738e-09,1.464886e-08
Cla_28,-0.215097,0.049181,-0.313742,-0.116452,2.235316e-05,0.0001341189
Acc_26,-0.177858,0.048999,-0.280341,-0.075376,0.0007065838,0.002826335
RN_32,-0.148705,0.05409,-0.258899,-0.038512,0.00828267,0.01987841
Hb_13,-0.134733,0.043543,-0.219743,-0.049723,0.001959034,0.005877101
Cau_27,-0.133372,0.054594,-0.234685,-0.032058,0.009991938,0.01998388
Amy_34,-0.111904,0.045568,-0.200122,-0.023686,0.01302889,0.02233524
Put_31,-0.084238,0.059147,-0.192744,0.024269,0.1277856,0.1916784
GP_33,-0.073634,0.050006,-0.177461,0.030194,0.1640826,0.1968991
GPe_29,-0.073625,0.048188,-0.176024,0.028774,0.1583391,0.1968991


In [210]:
vif_data = df_thomas_z[hips_thomas_ref[thalamic_nuclei + [1]]]
vif = pd.DataFrame()
vif['struct'] = vif_data.columns
vif.set_index("struct", inplace=True)
vif['VIF'] = [variance_inflation_factor(vif_data.values, i, )
                          for i in range(len(vif_data.columns))]

In [225]:
model_data = df_z.join(df_thomas2_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
formula = f"EDSS_sqrt ~ age + Female + tiv + {" + ".join(df_thomas2_z.columns[~df_thomas2_z.columns.isin(["THALAMUS_1"])])}"
res = sm.OLS.from_formula(formula, data=model_data).fit()
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:              EDSS_sqrt   R-squared:                       0.289
Model:                            OLS   Adj. R-squared:                  0.277
Method:                 Least Squares   F-statistic:                     22.61
Date:                Thu, 05 Jun 2025   Prob (F-statistic):           5.27e-29
Time:                        20:05:35   Log-Likelihood:                -569.77
No. Observations:                 453   AIC:                             1158.
Df Residuals:                     444   BIC:                             1195.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -0.0257      0.041     -0.631   

---

### Elastic Net Trials

In [46]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

In [50]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
model_data = model_data[~model_data['EDSS_sqrt'].isna()]

structs = hips_thomas_ref[thalamic_nuclei]


X = model_data[structs]
y = model_data['EDSS_sqrt']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Lasso Regression model with a specific alpha (regularization strength)
alpha = 0.1  # Adjust alpha as needed
lasso = Lasso(alpha=alpha)

# Train the model
lasso.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lasso.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Print the coefficients (note that some may be zero)
print("Coefficients:", lasso.coef_)

Mean Squared Error: 1.0688643148091397
Coefficients: [-0.16628181 -0.         -0.         -0.         -0.         -0.02281968
 -0.02927025 -0.15345964 -0.         -0.        ]


In [51]:
check = pd.DataFrame({"coef": lasso.coef_}, index=X.columns)
check['abs'] = check['coef'].abs()
check = check.sort_values(by="abs", ascending=False)
check

Unnamed: 0,coef,abs
AV_2,-0.166282,0.166282
MGN_10,-0.15346,0.15346
LGN_9,-0.02927,0.02927
Pul_8,-0.02282,0.02282
VA_4,-0.0,0.0
VLa_5,-0.0,0.0
VLP_6,-0.0,0.0
VPL_7,-0.0,0.0
CM_11,-0.0,0.0
MD_Pf_12,-0.0,0.0


In [49]:
from sklearn.linear_model import ElasticNetCV


regr = ElasticNetCV(cv=10, random_state=0)
regr.fit(X_train, y_train)
print(regr.alpha_)
print(regr.intercept_)
y_pred = regr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(mse)

check = pd.DataFrame({"coef": regr.coef_}, index=X.columns)
check['abs'] = check['coef'].abs()
check = check.sort_values(by="abs", ascending=False)
check

0.04268818040991856
-0.005121767360380086
1.042679507226447


Unnamed: 0,coef,abs
AV_2,-0.203274,0.203274
MGN_10,-0.167571,0.167571
LGN_9,-0.058592,0.058592
Pul_8,-0.035116,0.035116
VLa_5,-0.013014,0.013014
VA_4,0.0,0.0
VLP_6,-0.0,0.0
VPL_7,0.0,0.0
CM_11,-0.0,0.0
MD_Pf_12,-0.0,0.0


In [245]:
formula = "EDSS_sqrt ~ age + Female + tiv + AV_2 + MGN_10"
res = sm.OLS.from_formula(formula, data=model_data).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              EDSS_sqrt   R-squared:                       0.275
Model:                            OLS   Adj. R-squared:                  0.267
Method:                 Least Squares   F-statistic:                     33.94
Date:                Thu, 05 Jun 2025   Prob (F-statistic):           2.15e-29
Time:                        20:52:20   Log-Likelihood:                -574.28
No. Observations:                 453   AIC:                             1161.
Df Residuals:                     447   BIC:                             1185.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0215      0.041     -0.524      0.6