In [1]:
import pandas as pd
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
from pathlib import Path
import json
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from datetime import datetime
import re
from scipy import stats
import numpy as np
import statsmodels.api as sm
import statsmodels
from matplotlib import colormaps
from tqdm.notebook import tqdm
import helpers
from collections import defaultdict
from tqdm.notebook import tqdm
from statsmodels.miscmodels.ordinal_model import OrderedModel


from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.regression.linear_model import OLS

from mri_data import file_manager as fm

## Load Data

In [2]:
drive_root = fm.get_drive_root()
dataroot = drive_root / "3Tpioneer_bids"
data_dir = Path("/home/srs-9/Projects/ms_mri/data")
fig_path = Path("/home/srs-9/Projects/ms_mri/analysis/thalamus/figures_tables/edss_regressions")

choroid_volumes = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/data/choroid_aschoplex_volumes.csv", index_col="subid"
).dropna()
tiv = pd.read_csv("/home/srs-9/Projects/ms_mri/data/tiv_data.csv", index_col="subid")

df = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/data/clinical_data_processed.csv", index_col="subid"
)
sdmt = pd.read_csv("/home/srs-9/Projects/ms_mri/analysis/thalamus/SDMT_sheet.csv", index_col="subid")
df = df.join([choroid_volumes, tiv, sdmt['SDMT']])
df['SDMT'] = pd.to_numeric(df['SDMT'], errors='coerce')
df['thalamus_sqrt'] = np.sqrt(df['thalamus'])
df['thalamus_curt'] = np.sqrt(df['thalamus']**3)
df_z = df.copy()
numeric_cols = df.select_dtypes(include='number').columns
df_z[numeric_cols] = df_z[numeric_cols].apply(stats.zscore, nan_policy="omit")

df_ms = df[df['dz_type2'] == "MS"]
df_ms_z = df_ms.copy()
df_ms_z[numeric_cols] = df_ms_z[numeric_cols].apply(stats.zscore, nan_policy="omit")

# assert df.loc[1340, 'EDSS'] == 2.5


viridis = colormaps['viridis'].resampled(20)

colors = helpers.get_colors()

In [4]:
df_thomas = pd.read_csv(data_dir / "hipsthomas_vols.csv", index_col="subid")
cols_orig = df_thomas.columns
new_colnames = {}
for col in df_thomas.columns:
    new_col = re.sub(r"(\d+)-([\w-]+)", r"\2_\1", col)
    new_col = re.sub("-", "_", new_col)
    new_colnames[col] = new_col
df_thomas = df_thomas.rename(columns=new_colnames)
df_thomas_norm = df_thomas.apply(lambda col: col / df_thomas['THALAMUS_1'])
df_thomas_z = df_thomas.apply(stats.zscore, nan_policy="omit")
df_thomas_norm_z = df_thomas_norm.apply(stats.zscore, nan_policy="omit")

df_thomas_left = pd.read_csv(data_dir / "hipsthomas_left_vols.csv", index_col="subid")
df_thomas_left = df_thomas_left.rename(columns=new_colnames)
df_thomas_left_norm = df_thomas_left.apply(lambda col: col / df_thomas['THALAMUS_1'])
df_thomas_left_z = df_thomas_left.apply(stats.zscore, nan_policy="omit")
df_thomas_left_norm_z = df_thomas_left_norm.apply(stats.zscore, nan_policy="omit")


df_thomas_right = pd.read_csv(data_dir / "hipsthomas_right_vols.csv", index_col="subid")
df_thomas_right = df_thomas_right.rename(columns=new_colnames)
df_thomas_right_norm = df_thomas_right.apply(lambda col: col / df_thomas['THALAMUS_1'])
df_thomas_right_z = df_thomas_right.apply(stats.zscore, nan_policy="omit")
df_thomas_right_norm_z = df_thomas_right_norm.apply(stats.zscore, nan_policy="omit")


thalamic_nuclei = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12]
thalamic_nuclei_str = [str(i) for i in thalamic_nuclei]

hips_thomas_ref = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/data/hipsthomas_struct_index.csv", index_col="index"
)['struct']
# hips_thomas_ref.rename(columns={"struct": "struct_name"}, inplace=True)

choroid_dists = pd.read_csv(data_dir / "centroid-choroid_centroid-left.csv", index_col="subid")
ventricle_dists = pd.read_csv(
    data_dir / "centroid-ventricle_SDT.csv", index_col="subid"
)

mni_choroid_dists = pd.read_csv("/home/srs-9/Projects/ms_mri/data/mni-centroid-choroid_SDT2.csv")

def combine_nuclei(df):
    df2 = pd.DataFrame()
    df2['anterior'] = df['AV_2']
    df2['ventral'] = df['VA_4'] + df['VLa_5'] + df['VLP_6'] + df['VPL_7']
    df2['intralaminar'] = df['CM_11'] 
    df2['medial'] = df['MD_Pf_12']
    df2['posterior'] = df['Pul_8'] + df['LGN_9'] + df['MGN_10']
    df2['THALAMUS_1'] = df['THALAMUS_1']
    return df2

df_thomas2 = combine_nuclei(df_thomas)
grouped_nuclei = df_thomas2.columns[~df_thomas2.columns.isin(["THALAMUS_1"])]
df_thomas2_z = df_thomas2.apply(stats.zscore, nan_policy="omit")
df_thomas2_norm = df_thomas2.apply(lambda col: col / df_thomas['THALAMUS_1'])
df_thomas2_norm_z = df_thomas2_norm.apply(stats.zscore, nan_policy="omit")

df_thomas2_left = combine_nuclei(df_thomas_left)
df_thomas2_left_z = df_thomas2_left.apply(stats.zscore, nan_policy="omit")
df_thomas2_norm_left = df_thomas2.apply(lambda col: col / df_thomas['THALAMUS_1'])
df_thomas2_norm_left_z = df_thomas2_norm_left.apply(stats.zscore, nan_policy="omit")

df_thomas2_right = combine_nuclei(df_thomas_right)
df_thomas2_right_z = df_thomas2_right.apply(stats.zscore, nan_policy="omit")
df_thomas2_norm_right = df_thomas2.apply(lambda col: col / df_thomas['THALAMUS_1'])
df_thomas2_norm_right_z = df_thomas2_norm_right.apply(stats.zscore, nan_policy="omit")

## Functions

In [5]:
def compute_se_diff(se1, n1, se2, n2):
    return np.sqrt((se1**2/n1) + (se2**2/n2))

def plot_regression(
    data, predictor, outcome, covariates, xlabel=None, ylabel=None, title=None,
    color="blue1"
):
    plus_covariates = ""
    if len(covariates) > 0:
        plus_covariates = f"+ {' + '.join(covariates)}"
    if xlabel is None:
        xlabel = predictor
    if ylabel is None:
        ylabel = outcome
    if title is None:
        title = f"{outcome} vs {predictor}"
    
    light_color = colors[f"light {color}"]
    dark_color = colors[f"dark {color}"]

    formula = f"{outcome} ~ {predictor} {plus_covariates}"
    res = sm.OLS.from_formula(formula, data=data).fit()
    x, y_pred, y_lims = helpers.get_regression_y(data, res, predictor, outcome)

    fig, axs = plt.subplot_mosaic(
        [['histx', '.'], ['scatter', 'histy']],
        figsize=(8, 6),
        width_ratios=(4, 1),
        height_ratios=(1, 4),
        layout='constrained',
    )

    helpers.scatter_hist(
        data[predictor],
        data[outcome],
        axs['scatter'],
        axs['histx'],
        axs['histy'],
        light_color=light_color,
        dark_color=dark_color,
    )

    axs['scatter'].plot(x, y_pred, color="black")
    axs['scatter'].fill_between(
        x, y_lims[0], y_lims[1], alpha=0.4, color=light_color
    )
    axs['scatter'].set_ylabel(ylabel)
    axs['scatter'].set_xlabel(xlabel)
    fig.suptitle(title)
    return fig, axs

## Main MRI Features

### Regressions

#### MS Patients

In [15]:
model_data = df_z.join([df_thomas_z])
model_data = model_data[model_data['dz_type2'] == "MS"]
model_data['EDSS'] = df['EDSS']

mod_prob = OrderedModel.from_formula("EDSS ~ THALAMUS_1 + tiv + age + Female",
                                     data=model_data, distr='logit')
res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

Optimization terminated successfully.
         Current function value: 2.165988
         Iterations: 61
         Function evaluations: 62
         Gradient evaluations: 62


0,1,2,3
Dep. Variable:,EDSS,Log-Likelihood:,-981.19
Model:,OrderedModel,AIC:,2006.0
Method:,Maximum Likelihood,BIC:,2097.0
Date:,"Wed, 25 Jun 2025",,
Time:,14:38:54,,
No. Observations:,453,,
Df Residuals:,431,,
Df Model:,4,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
THALAMUS_1,-0.5772,0.100,-5.756,0.000,-0.774,-0.381
tiv,0.0525,0.112,0.468,0.640,-0.168,0.273
age,0.7414,0.095,7.778,0.000,0.555,0.928
Female,-0.1020,0.100,-1.016,0.310,-0.299,0.095
0.0/0.5,-4.1098,0.313,-13.137,0.000,-4.723,-3.497
0.5/1.0,-1.3513,0.574,-2.353,0.019,-2.477,-0.226
1.0/1.5,0.7190,0.127,5.680,0.000,0.471,0.967
1.5/2.0,0.4174,0.084,4.964,0.000,0.253,0.582
2.0/2.5,-0.2318,0.112,-2.073,0.038,-0.451,-0.013


In [14]:
model_data = df_ms_z.copy()
model_data = model_data.join([df_thomas_z])
model_data['EDSS'] = df_ms['EDSS']
covariates = "age + Female + tiv"

outcome = "EDSS"
predictors = ["brain", "white", "grey", "cortical_thickness", "THALAMUS_1", "t2lv", "PRL", "choroid_volume"]

pvals = {}
coefs = {}
llci = {}
ulci = {}
ci_str = {}

for x in predictors:
    formula = f"{outcome} ~ {x} + {covariates}"
    res = OrderedModel.from_formula(formula, data=model_data, distr='logit').fit(method='bfgs', disp=0)
    pvals[x] = res.pvalues[x]
    coefs[x] = res.params[x]
    ci = res.conf_int()
    llci[x] = ci.loc[x, 0]
    ulci[x] = ci.loc[x, 1]
    ci_str[x] = f"[{llci[x]:.6f}, {ulci[x]:.6f}]"


regression_results = pd.DataFrame({"coef": coefs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results.index.name = "structure"
regression_results.to_excel(fig_path / "EDSS_and_main_mri_features_MS_ordinal.xlsx")
regression_results

Unnamed: 0_level_0,coef,llci,ulci,pvals,ci,p_fdr
structure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
brain,-0.167463,-0.356923,0.021996,0.08320019,"[-0.356923, 0.021996]",0.1109336
white,-0.059875,-0.236815,0.117065,0.5071798,"[-0.236815, 0.117065]",0.5071798
grey,-0.22586,-0.425807,-0.025913,0.02683046,"[-0.425807, -0.025913]",0.04392405
cortical_thickness,-0.403521,-0.592447,-0.214595,2.83635e-05,"[-0.592447, -0.214595]",0.000113454
THALAMUS_1,-0.57725,-0.773803,-0.380697,8.605393e-09,"[-0.773803, -0.380697]",6.884315e-08
t2lv,0.196538,0.021842,0.371233,0.02745253,"[0.021842, 0.371233]",0.04392405
PRL,0.193983,0.028391,0.359574,0.02167562,"[0.028391, 0.359574]",0.04392405
choroid_volume,0.132437,-0.048714,0.313587,0.1518849,"[-0.048714, 0.313587]",0.1735827


In [30]:
resThal = sm.OLS.from_formula("THALAMUS_1 ~ age + Female + tiv", data=model_data).fit()
resCT = sm.OLS.from_formula("cortical_thickness ~ age + Female + tiv", data=model_data).fit()
resEDSS = sm.OLS.from_formula("EDSS_sqrt ~ age + Female + tiv", data=model_data).fit()

resids = pd.concat([resThal.resid.rename("THALAMUS_1"),
                    resCT.resid.rename("cortical_thickness"),
                    resEDSS.resid.rename("EDSS_sqrt")], axis=1).dropna()


rx1y = stats.pearsonr(resids['THALAMUS_1'], resids['EDSS_sqrt']).statistic
rx2y = stats.pearsonr(resids['cortical_thickness'], resids['EDSS_sqrt']).statistic
rx1x2 = stats.pearsonr(resids['cortical_thickness'], resids['THALAMUS_1']).statistic
print(rx1y, rx2y, rx1x2)

-0.2814095453014782 -0.21767321433834244 0.6290560063840223


In [75]:
formula = "EDSS_sqrt ~ age + Female + tiv + THALAMUS_1"
res = sm.OLS.from_formula(formula, data=model_data).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              EDSS_sqrt   R-squared:                       0.276
Model:                            OLS   Adj. R-squared:                  0.269
Method:                 Least Squares   F-statistic:                     41.56
Date:                Wed, 11 Jun 2025   Prob (F-statistic):           1.59e-29
Time:                        14:50:51   Log-Likelihood:                -554.25
No. Observations:                 441   AIC:                             1118.
Df Residuals:                     436   BIC:                             1139.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0441      0.041     -1.073      0.2

#### NIND Group

In [None]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type3'].isin(["NIND"])]
model_data['EDSS'] = df_ms['EDSS']

covariates = "age + Female + tiv"

outcome = "EDSS_sqrt"
predictors = ["brain", "white", "grey", "cortical_thickness", "THALAMUS_1", "t2lv", "PRL", "choroid_volume"]

pvals = {}
coefs = {}
llci = {}
ulci = {}
ci_str = {}

for x in predictors:
    formula = f"{outcome} ~ {x} + {covariates}"
    res = OrderedModel.from_formula(formula, data=model_data, distr='probit').fit(method='bfgs', disp=0)
    pvals[x] = res.pvalues[x]
    coefs[x] = res.params[x]
    ci = res.conf_int()
    llci[x] = ci.loc[x, 0]
    ulci[x] = ci.loc[x, 1]
    ci_str[x] = f"[{llci[x]:.6f}, {ulci[x]:.6f}]"


regression_results = pd.DataFrame({"coef": coefs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results.index.name = "structure"
# regression_results.to_excel(fig_path / "EDSS_and_main_mri_features_MS.xlsx")
regression_results

Unnamed: 0_level_0,coef,llci,ulci,pvals,ci,p_fdr
structure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
brain,-0.083625,-0.190153,0.022903,0.1239086,"[-0.190153, 0.022903]",0.1652115
white,-0.024814,-0.124856,0.075229,0.6268748,"[-0.124856, 0.075229]",0.6268748
grey,-0.115944,-0.227597,-0.004291,0.04182113,"[-0.227597, -0.004291]",0.06691381
cortical_thickness,-0.236372,-0.343847,-0.128896,1.628432e-05,"[-0.343847, -0.128896]",6.513727e-05
THALAMUS_1,-0.324317,-0.436698,-0.211935,1.548015e-08,"[-0.436698, -0.211935]",1.238412e-07
t2lv,0.12213,0.028102,0.216158,0.01090513,"[0.028102, 0.216158]",0.02181025
PRL,0.118455,0.028221,0.208689,0.01008345,"[0.028221, 0.208689]",0.02181025
choroid_volume,0.069328,-0.035773,0.174429,0.1960593,"[-0.035773, 0.174429]",0.2240678


In [10]:
model_data['EDSS']

subid
1027   NaN
1029   NaN
1064   NaN
1101   NaN
1105   NaN
1203   NaN
1224   NaN
1280   NaN
1283   NaN
1285   NaN
1317   NaN
1321   NaN
1344   NaN
1355   NaN
1383   NaN
1406   NaN
1423   NaN
1437   NaN
1463   NaN
1476   NaN
1486   NaN
1489   NaN
1492   NaN
1547   NaN
1548   NaN
1557   NaN
2005   NaN
2020   NaN
2057   NaN
2059   NaN
2070   NaN
2083   NaN
2085   NaN
2090   NaN
2097   NaN
2108   NaN
2110   NaN
2114   NaN
2123   NaN
2126   NaN
2129   NaN
2134   NaN
2142   NaN
2144   NaN
2146   NaN
2200   NaN
2206   NaN
2231   NaN
2243   NaN
Name: EDSS, dtype: float64

### Compare Thalamus Associations for PMS vs RMS

Thalamus does not predict EDSS any better in PMS patients compared to RMS patients.

There are other methods to try in the GPT convo (not because I think this result will change, but to practice them here so I can apply them elsewhere if desired)

- Chow / SUest
- Compare partial correlations with Fisher's z

In [24]:
formula = "EDSS ~ THALAMUS_1*PMS + age + Female + tiv"

model_data = df_z.join([df_thomas_z])
model_data = model_data[model_data.dz_type5.isin(["PMS", "RMS"])]
model_data = pd.concat([model_data, pd.get_dummies(model_data.dz_type5, dtype="int")], axis=1)
model_data['EDSS'] = df_ms['EDSS']


res = OrderedModel.from_formula(formula, data=model_data, distr='logit').fit(method='bfgs', disp=0)
print(res.summary())

                             OrderedModel Results                             
Dep. Variable:                   EDSS   Log-Likelihood:                -945.62
Model:                   OrderedModel   AIC:                             1939.
Method:            Maximum Likelihood   BIC:                             2038.
Date:                Fri, 20 Jun 2025                                         
Time:                        15:22:19                                         
No. Observations:                 453                                         
Df Residuals:                     429                                         
Df Model:                           6                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
THALAMUS_1        -0.4224      0.124     -3.419      0.001      -0.665      -0.180
PMS                2.1129      0.278    

Something is wrong here. Not significant if I compute standard deviation from the stderrs and plug into a t test calculator.

Using a t-test here isn't the best course anyways (according to GPT, check convo "Comparing regression betas"), but still curious what went wrong

In [50]:
formula = "EDSS_sqrt ~ THALAMUS_1 + age + Female + tiv"

model_data = df_z.join([df_thomas_z])

model_data_RMS = model_data.copy()
model_data_RMS = model_data_RMS[model_data_RMS['dz_type5'].isin(["RMS"])]
res_RMS = sm.OLS.from_formula(formula, data=model_data_RMS).fit()


model_data_PMS = model_data.copy()
model_data_PMS = model_data_PMS[model_data_PMS['dz_type5'].isin(["PMS"])]
res_PMS = sm.OLS.from_formula(formula, data=model_data_PMS).fit()

param1 = res_RMS.params['THALAMUS_1']
param2 = res_PMS.params['THALAMUS_1']

ci1 = res_RMS.conf_int()
ci2 = res_PMS.conf_int()

se1 = res_RMS.HC0_se['THALAMUS_1']
dof1 = res_RMS.df_resid
se2 = res_PMS.HC0_se['THALAMUS_1']
dof2 = res_PMS.df_resid

se_diff = compute_se_diff(se1, dof1, se2, dof2)
mean_diff = param1 - param2
t_stat = mean_diff / se_diff

dof = dof1 + dof2 - 2
p_val = stats.t.sf(t_stat, dof)

print("RMS:")
print(f"{param1:.2} ± {se1:.2} [{ci1.loc['THALAMUS_1', 0]:.2}, {ci1.loc['THALAMUS_1', 1]:.2}]")
print("\n")
print("PMS:")
print(f"{param2:.2} ± {se2:.2} [{ci2.loc['THALAMUS_1', 0]:.2}, {ci2.loc['THALAMUS_1', 1]:.2}]")
print("\n")

print(f"p = {p_val:.2}")

RMS:
-0.19 ± 0.058 [-0.3, -0.089]


PMS:
-0.22 ± 0.072 [-0.38, -0.052]


p = 0.0027


## HIPS-THOMAS

Check these out:

- https://pmc.ncbi.nlm.nih.gov/articles/PMC11081814/
- https://pmc.ncbi.nlm.nih.gov/articles/PMC11087027/

### L/R for thalamus diagram

In [26]:
model_data = df_z.join(df_thomas_left_z)
model_data['whole_thalamus'] = df_thomas_z['THALAMUS_1']
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
covariates = "age + Female + tiv + whole_thalamus"

pvals = {}
coefs = {}
llci = {}
ulci = {}
ci_str = {}


outcome = "EDSS"

structs = thalamic_nuclei
all_results = {}
for x in structs:
    col = hips_thomas_ref[x]
    formula = f"{outcome} ~ {col}*dzdur + {covariates}"
    res = OrderedModel.from_formula(formula, data=model_data, distr='logit').fit(method='bfgs', disp=0)
    all_results[col] = res
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
bins = np.linspace(regression_results['coef'].min(), regression_results.coef.max(), 10)
regression_results['coef_bin'] = np.digitize(regression_results['coef'], bins)
regression_results.sort_values(by="coef_bin")

Unnamed: 0,coef,pvals,p_fdr,coef_bin
AV_2,-0.257149,0.03908,0.169375,1
LGN_9,-0.244686,0.028423,0.169375,1
Pul_8,-0.144097,0.467744,0.790598,3
MGN_10,-0.101065,0.452909,0.790598,3
VLP_6,-0.056918,0.725426,0.790598,4
VPL_7,-0.075398,0.58976,0.790598,4
MD_Pf_12,-0.063165,0.72562,0.790598,4
VLa_5,-0.031176,0.790598,0.790598,5
CM_11,0.060809,0.618363,0.790598,6
VA_4,0.231525,0.050812,0.169375,10


In [135]:
model_data = df_z.join(df_thomas_right_z)
model_data['whole_thalamus'] = df_thomas_z['THALAMUS_1']

model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
covariates = "age + Female + tiv + whole_thalamus"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}
r2 = {}

outcome = "SDMT"

structs = thalamic_nuclei
all_results = {}
for x in structs:
    col = hips_thomas_ref[x]
    formula = f"{outcome} ~ {col}*dzdur + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    all_results[col] = res
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"
    r2[col] = res.rsquared_adj

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
bins = np.linspace(regression_results['coef'].min(), regression_results.coef.max(), 10)
regression_results['coef_bin'] = np.digitize(regression_results['coef'], bins)
regression_results.sort_values(by="coef_bin")

Unnamed: 0,coef,pvals,p_fdr,coef_bin
MD_Pf_12,-0.136758,0.115708,0.578541,1
MGN_10,-0.073673,0.255085,0.850284,3
Pul_8,-0.055095,0.596611,0.986718,4
AV_2,-0.003957,0.946986,0.986718,6
VA_4,0.017358,0.788785,0.986718,6
VLa_5,0.02058,0.72347,0.986718,6
VPL_7,-0.003041,0.963638,0.986718,6
CM_11,0.000989,0.986718,0.986718,6
VLP_6,0.048392,0.539701,0.986718,8
LGN_9,0.100644,0.088978,0.578541,10


### Bilateral Thalamus

In [21]:
# data_to_sample = df_z.join(df_thomas_z).reset_index()
# inds = np.random.randint(0, len(data_to_sample), size=(300,))

# rng = np.random.default_rng()
# inds = rng.choice(len(data_to_sample), 200)
# model_data = data_to_sample.loc[inds, :].set_index("subid")

model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
covariates = "age + Female + tiv"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}
r2 = {}

outcome = "EDSS_sqrt"

structs = thalamic_nuclei
all_results = {}
for x in structs:
    col = hips_thomas_ref[x]
    formula = f"{outcome} ~ {col}*dzdur + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    all_results[col] = res
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"
    r2[col] = res.rsquared_adj



regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results['R2'] = r2
regression_results.sort_values(by="coef", inplace=True)
# regression_results.to_excel(fig_path / "thalamic_nuclei.xlsx")
main_coefs = np.array([coefs[hips_thomas_ref[x]] for x in structs])

regression_results

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr,R2
AV_2,-0.24615,0.050051,-0.340305,-0.151995,4.16212e-07,"[-0.340305, -0.151995]",4e-06,0.287471
Pul_8,-0.224721,0.042781,-0.314127,-0.135316,1.107948e-06,"[-0.314127, -0.135316]",4e-06,0.298971
VLP_6,-0.223242,0.049036,-0.322426,-0.124059,1.221864e-05,"[-0.322426, -0.124059]",2.4e-05,0.284468
VPL_7,-0.219521,0.043713,-0.312219,-0.126823,4.297471e-06,"[-0.312219, -0.126823]",1.1e-05,0.293372
LGN_9,-0.212283,0.039497,-0.296814,-0.127752,1.131578e-06,"[-0.296814, -0.127752]",4e-06,0.296883
MGN_10,-0.193351,0.043738,-0.28882,-0.097881,8.035349e-05,"[-0.288820, -0.097881]",0.000118,0.27768
MD_Pf_12,-0.18484,0.043352,-0.276241,-0.09344,8.225939e-05,"[-0.276241, -0.093440]",0.000118,0.280289
VLa_5,-0.16746,0.049402,-0.271315,-0.063604,0.001635378,"[-0.271315, -0.063604]",0.002044,0.264186
CM_11,-0.132125,0.048073,-0.227993,-0.036256,0.007017786,"[-0.227993, -0.036256]",0.007798,0.262361
VA_4,-0.107387,0.05136,-0.213212,-0.001563,0.04672529,"[-0.213212, -0.001563]",0.046725,0.253289


In [24]:
# data_to_sample = df_z.join(df_thomas_z).reset_index()
# inds = np.random.randint(0, len(data_to_sample), size=(300,))

# rng = np.random.default_rng()
# inds = rng.choice(len(data_to_sample), 200)
# model_data = data_to_sample.loc[inds, :].set_index("subid")

model_data = df_z.join(df_thomas_norm_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
covariates = "age + Female + tiv"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}
r2 = {}

outcome = "EDSS_sqrt"

structs = thalamic_nuclei
all_results = {}
for x in structs:
    col = hips_thomas_ref[x]
    formula = f"{outcome} ~ {col}*dzdur + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    all_results[col] = res
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"
    r2[col] = res.rsquared_adj



regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results['R2'] = r2
regression_results.sort_values(by="coef", inplace=True)
# regression_results.to_excel(fig_path / "thalamic_nuclei_normed.xlsx")
main_coefs = np.array([coefs[hips_thomas_ref[x]] for x in structs])

regression_results

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr,R2
AV_2,-0.156576,0.047347,-0.24418,-0.068971,0.000489,"[-0.244180, -0.068971]",0.002445,0.26234
LGN_9,-0.128749,0.036063,-0.208895,-0.048602,0.001702,"[-0.208895, -0.048602]",0.004254,0.262389
Pul_8,-0.125798,0.039408,-0.20719,-0.044406,0.002525,"[-0.207190, -0.044406]",0.00505,0.264034
VPL_7,-0.026467,0.03966,-0.109075,0.056141,0.529233,"[-0.109075, 0.056141]",0.588037,0.242247
MD_Pf_12,0.005569,0.038726,-0.075662,0.086799,0.892888,"[-0.075662, 0.086799]",0.892888,0.241415
VLP_6,0.043402,0.042303,-0.043249,0.130054,0.325458,"[-0.043249, 0.130054]",0.406823,0.24614
MGN_10,0.075255,0.045413,-0.017328,0.167838,0.11087,"[-0.017328, 0.167838]",0.158386,0.252931
CM_11,0.089691,0.045008,0.003152,0.176231,0.042253,"[0.003152, 0.176231]",0.070422,0.250174
VLa_5,0.140635,0.042505,0.054622,0.226647,0.001407,"[0.054622, 0.226647]",0.004254,0.274305
VA_4,0.189895,0.040425,0.105276,0.274514,1.3e-05,"[0.105276, 0.274514]",0.000129,0.294136


#### Control for whole thalamus volume

Correlate each of the thalamic nuclei to EDSS, including whole thalalmic volume as a covariate. None of the p values are significant, but we can still compare coefficients to see each nucleus's relative contribution to EDSS. Will bootstrap these regressions in the following cell to see if the rank is significant.  

In [105]:
model_data = df_z.join(df_thomas_z)
model_data['whole_thalamus'] = df_thomas_z['THALAMUS_1']
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
covariates = "age + Female + tiv + whole_thalamus"

pvals = {}
coefs = {}
thal_coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}
r2 = {}


outcome = "EDSS_sqrt"

structs = thalamic_nuclei
all_results = {}
for x in structs:
    col = hips_thomas_ref[x]
    formula = f"{outcome} ~ {col}*dzdur + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    thal_coefs[col] = res.params['whole_thalamus']
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"
    r2[col] = res.rsquared_adj



regression_results = pd.DataFrame({"coef": coefs, "thal_coef": thal_coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results['R2'] = r2

main_coefs = np.array([coefs[hips_thomas_ref[x]] for x in structs])
regression_results.sort_values(by="coef", inplace=True)
regression_results.to_excel(fig_path / "thalamic_nuclei-thalamus_control.xlsx")
regression_results

Unnamed: 0,coef,thal_coef,stderr,llci,ulci,pvals,ci,p_fdr,R2
AV_2,-0.141551,-0.160739,0.069829,-0.26686,-0.016243,0.026918,"[-0.266860, -0.016243]",0.171837,0.295513
LGN_9,-0.126408,-0.140696,0.053128,-0.243468,-0.009349,0.034367,"[-0.243468, -0.009349]",0.171837,0.302062
VPL_7,-0.094983,-0.161084,0.076241,-0.250398,0.060432,0.230348,"[-0.250398, 0.060432]",0.570279,0.297838
Pul_8,-0.090559,-0.154662,0.124959,-0.330646,0.149529,0.458904,"[-0.330646, 0.149529]",0.655577,0.299599
VLP_6,-0.058728,-0.192767,0.093892,-0.240543,0.123087,0.525879,"[-0.240543, 0.123087]",0.657349,0.29002
VLa_5,-0.029695,-0.234277,0.061649,-0.152544,0.093154,0.634982,"[-0.152544, 0.093154]",0.705536,0.287758
MGN_10,-0.026488,-0.225322,0.073376,-0.170783,0.117806,0.718439,"[-0.170783, 0.117806]",0.718439,0.290504
CM_11,0.061787,-0.292746,0.069573,-0.065913,0.189487,0.342167,"[-0.065913, 0.189487]",0.570279,0.291609
VA_4,0.096615,-0.306888,0.063836,-0.034879,0.228109,0.14944,"[-0.034879, 0.228109]",0.498132,0.290195
MD_Pf_12,0.116563,-0.354561,0.105327,-0.097585,0.330712,0.285317,"[-0.097585, 0.330712]",0.570279,0.293465


Bootstrap the regressions

In [81]:
data_to_sample = df_z.join(df_thomas_z)
data_to_sample = data_to_sample[data_to_sample['dz_type5'].isin(["RMS", "PMS"])].reset_index()
data_to_sample.loc[data_to_sample['EDSS_sqrt'].isna(), 'EDSS_sqrt'] = model_data['EDSS_sqrt'].median()

covariates = "age + Female + tiv"

def get_zeros():
    return np.zeros((1000,))

coefs_boot1 = defaultdict(get_zeros)

outcome = "EDSS_sqrt"

for i in tqdm(range(1000)):
    inds = np.random.randint(0, data_to_sample.index.max(), len(data_to_sample))
    model_data = data_to_sample.loc[inds, :]
    for x in structs:
        col = hips_thomas_ref[x]
        formula = f"{outcome} ~ {col}*dzdur + {covariates}"
        res = sm.OLS.from_formula(formula, data=model_data).fit()
        coefs_boot1[col][i] = res.params[col]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [84]:
data_to_sample = df_z.join(df_thomas_z)
data_to_sample = data_to_sample[data_to_sample['dz_type5'].isin(["RMS", "PMS"])].reset_index()
data_to_sample.loc[data_to_sample['EDSS_sqrt'].isna(), 'EDSS_sqrt'] = model_data['EDSS_sqrt'].median()

covariates = "age + Female + tiv + THALAMUS_1"

def get_zeros():
    return np.zeros((1000,))

coefs_boot2 = defaultdict(get_zeros)

outcome = "EDSS_sqrt"

for i in tqdm(range(1000)):
    inds = np.random.randint(0, data_to_sample.index.max(), len(data_to_sample))
    model_data = data_to_sample.loc[inds, :]
    for x in structs:
        col = hips_thomas_ref[x]
        formula = f"{outcome} ~ {col}*dzdur + {covariates}"
        res = sm.OLS.from_formula(formula, data=model_data).fit()
        coefs_boot2[col][i] = res.params[col]

  0%|          | 0/1000 [00:00<?, ?it/s]

Get the mean and 95% CI of the bootstrapped coefficients for each nucleus

In [39]:
coefs_copy = coefs_boot2.copy()
llci = {}
ulci = {}
means = {}
for x in structs:
    col = hips_thomas_ref[x]
    coefs_copy[col] = np.sort(coefs_copy[col])
    llci[col] = coefs_copy[col][25]
    ulci[col] = coefs_copy[col][975]
    means[col] = np.mean(coefs_copy[col])
    print(f"{means[col]:.4f} [{llci[col]:2.2f}, {ulci[col]:2.2f}]")

coefs_bootmean = np.array([means[hips_thomas_ref[x]] for x in structs])
np.corrcoef(main_coefs, coefs_bootmean)[0,1]

-0.1090 [-0.25, 0.02]
0.1029 [-0.02, 0.23]
-0.0241 [-0.15, 0.10]
-0.0599 [-0.23, 0.12]
-0.0667 [-0.22, 0.09]
-0.0583 [-0.33, 0.19]
-0.1281 [-0.23, -0.02]
-0.0202 [-0.17, 0.12]
0.0745 [-0.05, 0.21]
0.0838 [-0.12, 0.29]


0.9819684379090603

Correlate the coefficients of the nuclei for each boostrapped sample to the main regression to see if the relative strengths of the nuclei's coefficients are consistent across resamples

In [101]:
coefs_boot_arr = np.zeros((len(structs), 1000))
for i, x in enumerate(structs):
    col = hips_thomas_ref[x]
    coefs_boot_arr[i, :] = coefs_boot2[col]

corr_coefs = np.zeros((1000,))
kendal_taus = np.zeros((1000,))
for i in range(1000):
    corr_coefs[i] = np.corrcoef(coefs_boot_arr[:,i], main_coefs)[0,1]
    kendal_taus[i] = stats.kendalltau(coefs_boot_arr[:,i], main_coefs)[0]
mean_corrcoef = np.mean(corr_coefs)
mean_tau = np.mean(kendal_taus)
corr_coefs = np.sort(corr_coefs)
kendal_taus = np.sort(kendal_taus)
print(f"{mean_corrcoef:.2f} [{corr_coefs[25]:.2f}, {corr_coefs[975]:.2f}]")
print(f"{mean_tau:.2f} [{kendal_taus[25]:.2f}, {kendal_taus[975]:.2f}]")

0.56 [-0.02, 0.90]
0.44 [0.02, 0.73]


In [102]:
win_counts = np.zeros((coefs_boot_arr.shape[0],))
for i in range(1000):
    win_counts[coefs_boot_arr[:,i].argmin()] += 1

win_count_df = pd.Series(win_counts/1000, index=hips_thomas_ref[thalamic_nuclei])
win_count_df.sort_values(ascending=False)

struct
AV_2        0.301
Pul_8       0.265
LGN_9       0.180
VLP_6       0.113
VPL_7       0.108
MGN_10      0.017
VLa_5       0.012
MD_Pf_12    0.004
VA_4        0.000
CM_11       0.000
dtype: float64

In [44]:
coefs_boot_arr = np.zeros((len(structs), 1000))
for i, x in enumerate(structs):
    col = hips_thomas_ref[x]
    coefs_boot_arr[i, :] = coefs_boot1[col]
win_counts = np.zeros((coefs_boot_arr.shape[0],))
for i in range(1000):
    win_counts[coefs_boot_arr[:,i].argmin()] += 1

win_count_df = pd.Series(win_counts/1000, index=hips_thomas_ref[thalamic_nuclei])
win_count_df.sort_values(ascending=False)

struct
VLP_6       0.338
AV_2        0.183
Pul_8       0.165
VPL_7       0.127
LGN_9       0.113
MGN_10      0.042
VLa_5       0.022
MD_Pf_12    0.009
CM_11       0.001
VA_4        0.000
dtype: float64

In [26]:
model_data = df_z.join([df_thomas2_z])
model_data = model_data[model_data.dz_type2 == "MS"]

covariates = "age + Female + tiv + THALAMUS_1"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for col in grouped_nuclei:
    formula = f"EDSS_sqrt ~ {col}*dzdur + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"

regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)

regression_results.sort_values(by="coef")

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr
anterior,-0.141551,0.069829,-0.26686,-0.016243,0.026918,"[-0.266860, -0.016243]",0.134589
posterior,-0.13224,0.12909,-0.38101,0.116531,0.296727,"[-0.381010, 0.116531]",0.427709
ventral,-0.044266,0.105444,-0.256062,0.16753,0.681451,"[-0.256062, 0.167530]",0.681451
intralaminar,0.061787,0.069573,-0.065913,0.189487,0.342167,"[-0.065913, 0.189487]",0.427709
medial,0.116563,0.105327,-0.097585,0.330712,0.285317,"[-0.097585, 0.330712]",0.427709


In [31]:
model_data = df_z.copy()
struct_names = []
for col in df_thomas2.columns[~df_thomas2.columns.isin(["THALAMUS_1"])]:
    model_data[f'{col}_left'] = df_thomas2_norm_left_z[col]
    struct_names.append(f'{col}_left')
    model_data[f'{col}_right'] = df_thomas2_norm_right_z[col]
    struct_names.append(f'{col}_right')
model_data['whole_thalamus'] = df_thomas_z['THALAMUS_1']

covariates = "age + Female + tiv"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for col in struct_names:
    formula = f"EDSS_sqrt ~ {col}*dzdur + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"

regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)

regression_results.sort_values(by="coef")

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr
anterior_left,-0.162114,0.046174,-0.247317,-0.076911,0.000208,"[-0.247317, -0.076911]",0.001038
anterior_right,-0.162114,0.046174,-0.247317,-0.076911,0.000208,"[-0.247317, -0.076911]",0.001038
posterior_right,-0.128666,0.038341,-0.207566,-0.049766,0.001445,"[-0.207566, -0.049766]",0.003613
posterior_left,-0.128666,0.038341,-0.207566,-0.049766,0.001445,"[-0.207566, -0.049766]",0.003613
medial_left,0.000976,0.03777,-0.078336,0.080288,0.980714,"[-0.078336, 0.080288]",0.980714
medial_right,0.000976,0.03777,-0.078336,0.080288,0.980714,"[-0.078336, 0.080288]",0.980714
intralaminar_right,0.083412,0.043875,-0.00054,0.167364,0.051487,"[-0.000540, 0.167364]",0.064358
intralaminar_left,0.083412,0.043875,-0.00054,0.167364,0.051487,"[-0.000540, 0.167364]",0.064358
ventral_left,0.120768,0.039946,0.036056,0.205479,0.005298,"[0.036056, 0.205479]",0.008829
ventral_right,0.120768,0.039946,0.036056,0.205479,0.005298,"[0.036056, 0.205479]",0.008829


In [16]:
model_data = df_z[['EDSS_sqrt', 'age']].copy()
struct_names = []
for col in df_thomas2.columns:
    model_data[f'{col}_left'] = df_thomas2_left_z[col]
    struct_names.append(f'{col}_left')
    model_data[f'{col}_right'] = df_thomas2_right_z[col]
    struct_names.append(f'{col}_right')
model_data['whole_thalamus'] = df_thomas_z['THALAMUS_1']
struct_names

['anterior_left',
 'anterior_right',
 'ventral_left',
 'ventral_right',
 'intralaminar_left',
 'intralaminar_right',
 'medial_left',
 'medial_right',
 'posterior_left',
 'posterior_right',
 'THALAMUS_1_left',
 'THALAMUS_1_right']

In [89]:
data_to_sample = df_z.join(df_thomas2_z)
data_to_sample = data_to_sample[data_to_sample['dz_type5'].isin(["RMS", "PMS"])].reset_index()
data_to_sample.loc[data_to_sample['EDSS_sqrt'].isna(), 'EDSS_sqrt'] = model_data['EDSS_sqrt'].median()

covariates = "age + Female + tiv + THALAMUS_1"
structs = df_thomas2.columns[~df_thomas2.columns.isin(["THALAMUS_1"])]

def get_zeros():
    return np.zeros((1000,))

coefs_boot3 = defaultdict(get_zeros)

outcome = "EDSS_sqrt"

for i in tqdm(range(1000)):
    inds = np.random.randint(0, data_to_sample.index.max(), len(data_to_sample))
    model_data = data_to_sample.loc[inds, :]
    for col in structs:
        formula = f"{outcome} ~ {col}*dzdur + {covariates}"
        res = sm.OLS.from_formula(formula, data=model_data).fit()
        coefs_boot3[col][i] = res.params[col]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [90]:
coefs_boot_arr = np.zeros((len(structs), 1000))
for i, col in enumerate(structs):
    coefs_boot_arr[i, :] = coefs_boot3[col]
win_counts = np.zeros((coefs_boot_arr.shape[0],))
for i in range(1000):
    win_counts[coefs_boot_arr[:,i].argmin()] += 1

win_count_df = pd.Series(win_counts/1000, index=structs)
win_count_df.sort_values(ascending=False)

posterior       0.454
anterior        0.395
ventral         0.143
intralaminar    0.004
medial          0.004
dtype: float64

In [117]:
model_data = df_z.join(df_thomas_z)

covariates = "age + Female + tiv"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for x in thalamic_nuclei:
    col = hips_thomas_ref[x]
    res = sm.OLS.from_formula(f"{col} ~ THALAMUS_1 + tiv", data=model_data).fit()
    model_data[f"{col}_resid"] = res.resid

    col2 = f"{col}_resid"
    formula = f"EDSS_sqrt ~ {col2}*dzdur + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col2]
    coefs[col] = res.params[col2]
    stderrs[col] = res.HC0_se[col2]
    ci = res.conf_int()
    llci[col] = ci.loc[col2, 0]
    ulci[col] = ci.loc[col2, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"

regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)

regression_results.sort_values(by="coef")

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr
LGN_9,-0.149686,0.051829,-0.264497,-0.034874,0.01072,"[-0.264497, -0.034874]",0.107202
VPL_7,-0.109161,0.077652,-0.265438,0.047115,0.170535,"[-0.265438, 0.047115]",0.426338
VLP_6,-0.106625,0.092477,-0.286748,0.073498,0.245336,"[-0.286748, 0.073498]",0.490673
AV_2,-0.088672,0.070059,-0.210556,0.033211,0.153502,"[-0.210556, 0.033211]",0.426338
Pul_8,-0.029199,0.124842,-0.269485,0.211087,0.811376,"[-0.269485, 0.211087]",0.99724
MGN_10,-0.007467,0.073632,-0.15023,0.135296,0.918183,"[-0.150230, 0.135296]",0.99724
VLa_5,0.000216,0.064464,-0.122552,0.122984,0.99724,"[-0.122552, 0.122984]",0.99724
CM_11,0.060011,0.06794,-0.06781,0.187832,0.356714,"[-0.067810, 0.187832]",0.514482
MD_Pf_12,0.100432,0.104229,-0.115016,0.315881,0.360137,"[-0.115016, 0.315881]",0.514482
VA_4,0.124176,0.062814,-0.004679,0.253031,0.058882,"[-0.004679, 0.253031]",0.294409


In [124]:
model_data = df_z.join([df_thomas2_z])
model_data = model_data[model_data.dz_type2 == "MS"]

covariates = "age + Female + tiv + THALAMUS_1"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for col in df_thomas2:
    formula = f"EDSS_sqrt ~ {col}*dzdur + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"

regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)

regression_results.sort_values(by="coef")

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr
THALAMUS_1,-0.238654,0.045351,-0.334926,-0.142382,2e-06,"[-0.334926, -0.142382]",9e-06
anterior,-0.132947,0.069969,-0.257915,-0.007978,0.037114,"[-0.257915, -0.007978]",0.111343
posterior,-0.130666,0.128086,-0.381456,0.120125,0.30641,"[-0.381456, 0.120125]",0.459615
ventral,-0.030548,0.104296,-0.242056,0.18096,0.776657,"[-0.242056, 0.180960]",0.776657
intralaminar,0.052855,0.069251,-0.074872,0.180583,0.416498,"[-0.074872, 0.180583]",0.499797
medial,0.126086,0.102518,-0.087665,0.339838,0.246961,"[-0.087665, 0.339838]",0.459615


In [15]:
model_data = df_z.join([df_thomas2_z])
model_data = model_data[model_data.dz_type2 == "MS"]

covariates = "age + Female + tiv + THALAMUS_1"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}
ci_str = {}

for col in df_thomas2:
    formula = f"choroid_volume ~ {col} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]
    ci_str[col] = f"[{llci[col]:.6f}, {ulci[col]:.6f}]"

regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals, "ci": ci_str})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)

regression_results.sort_values(by="coef")

Unnamed: 0,coef,stderr,llci,ulci,pvals,ci,p_fdr
THALAMUS_1,-0.471241,0.047756,-0.557449,-0.385033,3.496056e-24,"[-0.557449, -0.385033]",2.0976330000000003e-23
medial,-0.326169,0.103968,-0.522535,-0.129804,0.0011796,"[-0.522535, -0.129804]",0.002359199
posterior,-0.295484,0.118422,-0.527846,-0.063121,0.01280286,"[-0.527846, -0.063121]",0.01536344
intralaminar,-0.174645,0.065383,-0.292919,-0.056371,0.00388849,"[-0.292919, -0.056371]",0.005832734
anterior,0.055964,0.064762,-0.058611,0.170539,0.3376318,"[-0.058611, 0.170539]",0.3376318
ventral,0.597348,0.098615,0.408056,0.786639,1.243761e-09,"[0.408056, 0.786639]",3.731284e-09


### Deep Grey Structures

In [136]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
covariates = "age + Female + tiv"

pvals = {}
coefs = {}
stderrs = {}
llci = {}
ulci = {}

outcome = "EDSS_sqrt"
structs = hips_thomas_ref.index[~hips_thomas_ref.index.isin(thalamic_nuclei)]

all_results = {}
for x in structs:
    col = hips_thomas_ref[x]
    formula = f"{outcome} ~ {col} + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[col] = res.pvalues[col]
    coefs[col] = res.params[col]
    stderrs[col] = res.HC0_se[col]
    ci = res.conf_int()
    llci[col] = ci.loc[col, 0]
    ulci[col] = ci.loc[col, 1]


regression_results = pd.DataFrame({"coef": coefs, "stderr": stderrs, "llci": llci, "ulci": ulci, "pvals": pvals})
regression_results['p_fdr'] = stats.false_discovery_control(
    regression_results['pvals'], method='bh'
)
regression_results.sort_values(by="coef", inplace=True)
regression_results.to_csv(fig_path / "deep_grey_regressions.csv")
regression_results

Unnamed: 0,coef,stderr,llci,ulci,pvals,p_fdr
THALAMUS_1,-0.294878,0.045745,-0.38822,-0.201536,1.220738e-09,1.464886e-08
Cla_28,-0.215097,0.049181,-0.313742,-0.116452,2.235316e-05,0.0001341189
Acc_26,-0.177858,0.048999,-0.280341,-0.075376,0.0007065838,0.002826335
RN_32,-0.148705,0.05409,-0.258899,-0.038512,0.00828267,0.01987841
Hb_13,-0.134733,0.043543,-0.219743,-0.049723,0.001959034,0.005877101
Cau_27,-0.133372,0.054594,-0.234685,-0.032058,0.009991938,0.01998388
Amy_34,-0.111904,0.045568,-0.200122,-0.023686,0.01302889,0.02233524
Put_31,-0.084238,0.059147,-0.192744,0.024269,0.1277856,0.1916784
GP_33,-0.073634,0.050006,-0.177461,0.030194,0.1640826,0.1968991
GPe_29,-0.073625,0.048188,-0.176024,0.028774,0.1583391,0.1968991


In [210]:
vif_data = df_thomas_z[hips_thomas_ref[thalamic_nuclei + [1]]]
vif = pd.DataFrame()
vif['struct'] = vif_data.columns
vif.set_index("struct", inplace=True)
vif['VIF'] = [variance_inflation_factor(vif_data.values, i, )
                          for i in range(len(vif_data.columns))]

In [225]:
model_data = df_z.join(df_thomas2_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
formula = f"EDSS_sqrt ~ age + Female + tiv + {" + ".join(df_thomas2_z.columns[~df_thomas2_z.columns.isin(["THALAMUS_1"])])}"
res = sm.OLS.from_formula(formula, data=model_data).fit()
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:              EDSS_sqrt   R-squared:                       0.289
Model:                            OLS   Adj. R-squared:                  0.277
Method:                 Least Squares   F-statistic:                     22.61
Date:                Thu, 05 Jun 2025   Prob (F-statistic):           5.27e-29
Time:                        20:05:35   Log-Likelihood:                -569.77
No. Observations:                 453   AIC:                             1158.
Df Residuals:                     444   BIC:                             1195.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -0.0257      0.041     -0.631   

---

### Elastic Net Trials

In [46]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

In [50]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
model_data = model_data[~model_data['EDSS_sqrt'].isna()]

structs = hips_thomas_ref[thalamic_nuclei]


X = model_data[structs]
y = model_data['EDSS_sqrt']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Lasso Regression model with a specific alpha (regularization strength)
alpha = 0.1  # Adjust alpha as needed
lasso = Lasso(alpha=alpha)

# Train the model
lasso.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lasso.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Print the coefficients (note that some may be zero)
print("Coefficients:", lasso.coef_)

Mean Squared Error: 1.0688643148091397
Coefficients: [-0.16628181 -0.         -0.         -0.         -0.         -0.02281968
 -0.02927025 -0.15345964 -0.         -0.        ]


In [51]:
check = pd.DataFrame({"coef": lasso.coef_}, index=X.columns)
check['abs'] = check['coef'].abs()
check = check.sort_values(by="abs", ascending=False)
check

Unnamed: 0,coef,abs
AV_2,-0.166282,0.166282
MGN_10,-0.15346,0.15346
LGN_9,-0.02927,0.02927
Pul_8,-0.02282,0.02282
VA_4,-0.0,0.0
VLa_5,-0.0,0.0
VLP_6,-0.0,0.0
VPL_7,-0.0,0.0
CM_11,-0.0,0.0
MD_Pf_12,-0.0,0.0


In [49]:
from sklearn.linear_model import ElasticNetCV


regr = ElasticNetCV(cv=10, random_state=0)
regr.fit(X_train, y_train)
print(regr.alpha_)
print(regr.intercept_)
y_pred = regr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(mse)

check = pd.DataFrame({"coef": regr.coef_}, index=X.columns)
check['abs'] = check['coef'].abs()
check = check.sort_values(by="abs", ascending=False)
check

0.04268818040991856
-0.005121767360380086
1.042679507226447


Unnamed: 0,coef,abs
AV_2,-0.203274,0.203274
MGN_10,-0.167571,0.167571
LGN_9,-0.058592,0.058592
Pul_8,-0.035116,0.035116
VLa_5,-0.013014,0.013014
VA_4,0.0,0.0
VLP_6,-0.0,0.0
VPL_7,0.0,0.0
CM_11,-0.0,0.0
MD_Pf_12,-0.0,0.0


In [245]:
formula = "EDSS_sqrt ~ age + Female + tiv + AV_2 + MGN_10"
res = sm.OLS.from_formula(formula, data=model_data).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              EDSS_sqrt   R-squared:                       0.275
Model:                            OLS   Adj. R-squared:                  0.267
Method:                 Least Squares   F-statistic:                     33.94
Date:                Thu, 05 Jun 2025   Prob (F-statistic):           2.15e-29
Time:                        20:52:20   Log-Likelihood:                -574.28
No. Observations:                 453   AIC:                             1161.
Df Residuals:                     447   BIC:                             1185.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0215      0.041     -0.524      0.6