In [38]:
import os
import re
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import nibabel as nib
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from matplotlib import colormaps
from scipy import stats

from mri_data import file_manager as fm

sys.path.append("/home/srs-9/Projects/ms_mri/analysis/paper1")

import helpers

In [45]:
drive_root = fm.get_drive_root()
msmri_home = Path("/home/srs-9/Projects/ms_mri")
msmri_datadir = msmri_home / "data"
curr_dir = Path(os.getcwd())
data_dir = Path("/home/srs-9/Projects/ms_mri/data")
dataroot = Path("/media/smbshare/srs-9/thalamus_project/data")

showfigs = False
pd.options.display.precision = 3
subject_sessions = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/analysis/thalamus/data0/subject-sessions.csv",
    index_col="sub",
)

In [46]:
cp_vols = pd.read_csv(data_dir / "choroid_aschoplex_volumes.csv", index_col="subid")
tiv_data = pd.read_csv(data_dir / "tiv_data.csv", index_col="subid")
clinical_data = pd.read_csv(data_dir / "clinical_data_processed.csv", index_col="subid")
df = cp_vols.join([tiv_data, clinical_data])

df_z = df.copy()
numeric_vars = [col for col in df_z.columns if pd.api.types.is_numeric_dtype(df_z[col])]
df_z[numeric_vars] = df_z[numeric_vars].apply(stats.zscore, nan_policy="omit")

In [48]:
df_thomas = pd.read_csv(data_dir / "hipsthomas_vols.csv", index_col="subid")
cols_orig = df_thomas.columns
new_colnames = {}
for col in df_thomas.columns:
    new_col = re.sub(r"(\d+)-([\w-]+)", r"\2_\1", col)
    new_col = re.sub("-", "_", new_col)
    new_colnames[col] = new_col
df_thomas = df_thomas.rename(columns=new_colnames)
df_thomas_norm = df_thomas.apply(lambda col: col / df_thomas['THALAMUS_1'])
df_thomas_z = df_thomas.apply(stats.zscore, nan_policy="omit")
df_thomas_norm_z = df_thomas_norm.apply(stats.zscore, nan_policy="omit")

df_thomas_left = pd.read_csv(data_dir / "hipsthomas_left_vols.csv", index_col="subid")
df_thomas_left = df_thomas_left.rename(columns=new_colnames)
df_thomas_left_z = df_thomas_left.apply(stats.zscore, nan_policy="omit")

df_thomas_right = pd.read_csv(data_dir / "hipsthomas_right_vols.csv", index_col="subid")
df_thomas_right = df_thomas_right.rename(columns=new_colnames)
df_thomas_right_z = df_thomas_right.apply(stats.zscore, nan_policy="omit")


thalamic_nuclei = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12]
thalamic_nuclei_str = [str(i) for i in thalamic_nuclei]

hips_thomas_ref = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/data/hipsthomas_struct_index.csv", index_col="index"
)['struct']

### MS vs NIND

#### Left Side

In [50]:
model_data = df_z.join(df_thomas_left_z)
model_data = model_data[model_data['dz_type3'].isin(["MS", "NIND"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['dz_type3'], dtype="int")), axis=1)

covars = "age + Female + tiv"
outcome = "MS"

pvals = {}
coefs = {}
all_results = {}
for struct in hips_thomas_ref:
    formula = f"{outcome} ~ {covars} + {struct}"
    res = sm.Logit.from_formula(formula, data=model_data).fit(disp=0)
    pvals[struct] = res.pvalues[struct]
    coefs[struct] = res.params[struct]
    all_results[struct] = res

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results = regression_results.sort_values(by="coef", ascending=True)
regression_results

Unnamed: 0,coef,pvals,p_fdr
Pul_8,-1.194,2.372e-06,5.219e-05
THALAMUS_1,-1.146,3.889e-05,0.0004278
MD_Pf_12,-0.878,6.954e-05,0.00051
CM_11,-0.67,0.0002267,0.001247
AV_2,-0.632,0.0004719,0.00173
GPi_30,-0.598,0.0003044,0.001339
Put_31,-0.585,0.002013,0.005536
VPL_7,-0.528,0.008204,0.02005
LGN_9,-0.52,0.001508,0.00474
GP_33,-0.461,0.01025,0.02256


### Right Side

In [51]:
model_data = df_z.join(df_thomas_right_z)
model_data = model_data[model_data['dz_type3'].isin(["MS", "NIND"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['dz_type3'], dtype="int")), axis=1)

covars = "age + Female + tiv"
outcome = "MS"

pvals = {}
coefs = {}
all_results = {}
for struct in hips_thomas_ref:
    formula = f"{outcome} ~ {covars} + {struct}"
    res = sm.Logit.from_formula(formula, data=model_data).fit(disp=0)
    pvals[struct] = res.pvalues[struct]
    coefs[struct] = res.params[struct]
    all_results[struct] = res

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results = regression_results.sort_values(by="coef", ascending=True)
regression_results

Unnamed: 0,coef,pvals,p_fdr
Pul_8,-1.222,1.201e-06,2.641e-05
THALAMUS_1,-1.206,1.489e-05,0.0001182
MD_Pf_12,-0.916,4.81e-05,0.0002646
AV_2,-0.847,1.612e-05,0.0001182
Put_31,-0.702,0.0004944,0.001813
GP_33,-0.625,0.0007037,0.002212
GPi_30,-0.596,0.0002175,0.0009571
RN_32,-0.572,0.005891,0.01303
VLP_6,-0.546,0.0102,0.02039
GPe_29,-0.511,0.005182,0.01303


### MS vs !MS

In [18]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type3'].isin(["MS", "NIND"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['dz_type3'], dtype="int")), axis=1)

formula = "MS ~ THALAMUS_1 + age + Female + tiv"
res = sm.Logit.from_formula(formula, data=model_data).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.271283
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                     MS   No. Observations:                  457
Model:                          Logit   Df Residuals:                      452
Method:                           MLE   Df Model:                            4
Date:                Fri, 06 Jun 2025   Pseudo R-squ.:                  0.1302
Time:                        17:37:25   Log-Likelihood:                -123.98
converged:                       True   LL-Null:                       -142.54
Covariance Type:            nonrobust   LLR p-value:                 1.695e-07
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.7687      0.238     11.617      0.000       2.302       3.236
THALAMUS_1    -1.4808      0.

In [20]:
covars = "age + Female + tiv"
outcome = "MS"

pvals = {}
coefs = {}
se = {}
all_results = {}
for struct in df_structs.loc[thalamic_nuclei_inds, 'struct']:
    formula = f"{outcome} ~ {covars} + {struct}"
    res = sm.Logit.from_formula(formula, data=model_data).fit()
    pvals[struct] = res.pvalues[struct]
    coefs[struct] = res.params[struct]
    # se[struct] = res.HC0_se[struct]
    all_results[struct] = res

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals, "se": se})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results = regression_results.sort_values(by="coef", ascending=True)
regression_results

Optimization terminated successfully.
         Current function value: 0.272634
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.294757
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.297743
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.291650
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.288927
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.266719
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.292322
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.292813
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.286846
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.276151
  

Unnamed: 0,coef,pvals,se,p_fdr
Pul_8,-1.416,8.13e-07,,8.13e-06
MD_Pf_12,-1.128,1.096e-05,,3.653e-05
AV_2,-1.091,1.732e-06,,8.662e-06
VPL_7,-0.769,0.0009399,,0.00188
CM_11,-0.765,0.0002478,,0.0006194
VLP_6,-0.71,0.003131,,0.004864
MGN_10,-0.651,0.004388,,0.005484
VA_4,-0.582,0.009616,,0.01068
LGN_9,-0.547,0.003405,,0.004864
VLa_5,-0.421,0.03871,,0.03871


### RMS vs PMS

In [21]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['dz_type5'], dtype="int")), axis=1)

formula = "PMS ~ THALAMUS_1 + age + Female + tiv"
res = sm.Logit.from_formula(formula, data=model_data).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.358903
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                    PMS   No. Observations:                  414
Model:                          Logit   Df Residuals:                      409
Method:                           MLE   Df Model:                            4
Date:                Fri, 06 Jun 2025   Pseudo R-squ.:                  0.2931
Time:                        17:38:23   Log-Likelihood:                -148.59
converged:                       True   LL-Null:                       -210.18
Covariance Type:            nonrobust   LLR p-value:                 1.113e-25
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -2.0499      0.198    -10.371      0.000      -2.437      -1.663
THALAMUS_1    -0.7068      0.

In [23]:
covars = "age + Female + tiv"
outcome = "PMS"

pvals = {}
coefs = {}
se = {}
all_results = {}
for struct in df_structs.loc[thalamic_nuclei_inds, 'struct']:
    formula = f"{outcome} ~ {covars} + {struct}"
    res = sm.Logit.from_formula(formula, data=model_data).fit()
    pvals[struct] = res.pvalues[struct]
    coefs[struct] = res.params[struct]
    # se[struct] = res.HC0_se[struct]
    all_results[struct] = res

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals, "se": se})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results = regression_results.sort_values(by="coef", ascending=True)
regression_results

Optimization terminated successfully.
         Current function value: 0.365664
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.372634
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.372063
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.359630
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.368762
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.363550
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.365824
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.375768
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.372501
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.366186
  

Unnamed: 0,coef,pvals,se,p_fdr
VLP_6,-0.698,5.007e-05,,0.0005007
AV_2,-0.649,0.0007042,,0.001409
Pul_8,-0.611,0.0002304,,0.001152
LGN_9,-0.549,0.0005841,,0.001409
MD_Pf_12,-0.541,0.0007044,,0.001409
VPL_7,-0.517,0.001906,,0.003176
VA_4,-0.481,0.01105,,0.01267
VLa_5,-0.477,0.009731,,0.01267
CM_11,-0.432,0.0114,,0.01267
MGN_10,-0.344,0.0478,,0.0478


### PPMS vs SPMS

Thalamic volume predicts PPMS vs SPMS. PPMS associated with greater thalamic volume, after controlling for age, sex, tiv, and disease duration

In [36]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['ms_type'].isin(["PPMS", "SPMS"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['ms_type'], dtype="int")), axis=1)

formula = "PPMS ~ THALAMUS_1 + age + Female + tiv + dzdur"
res = sm.Logit.from_formula(formula, data=model_data).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.548368
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   PPMS   No. Observations:                   78
Model:                          Logit   Df Residuals:                       72
Method:                           MLE   Df Model:                            5
Date:                Fri, 06 Jun 2025   Pseudo R-squ.:                  0.1839
Time:                        17:43:10   Log-Likelihood:                -42.773
converged:                       True   LL-Null:                       -52.413
Covariance Type:            nonrobust   LLR p-value:                  0.001704
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1583      0.420     -0.377      0.706      -0.981       0.665
THALAMUS_1     0.6552      0.

In [37]:
covars = "age + Female + tiv"
outcome = "PPMS"

pvals = {}
coefs = {}
se = {}
all_results = {}
for struct in df_structs.loc[thalamic_nuclei_inds, 'struct']:
    formula = f"{outcome} ~ {covars} + {struct}"
    res = sm.Logit.from_formula(formula, data=model_data).fit()
    pvals[struct] = res.pvalues[struct]
    coefs[struct] = res.params[struct]
    # se[struct] = res.HC0_se[struct]
    all_results[struct] = res

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals, "se": se})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results = regression_results.sort_values(by="coef", ascending=True)
regression_results

Optimization terminated successfully.
         Current function value: 0.635106
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.652023
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.655371
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.631714
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.589296
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.588183
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.642673
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.636726
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.655130
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612379
  

Unnamed: 0,coef,pvals,se,p_fdr
VLa_5,0.017,0.952,,0.952
CM_11,0.052,0.839,,0.932
VA_4,0.229,0.473,,0.591
LGN_9,0.361,0.166,,0.238
MGN_10,0.442,0.101,,0.168
VLP_6,0.48,0.067,,0.167
AV_2,0.576,0.083,,0.167
MD_Pf_12,0.628,0.016,,0.052
Pul_8,0.868,0.003,,0.023
VPL_7,0.877,0.005,,0.023


In [31]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['ms_type'].isin(["SPMS", "RRMS"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['ms_type'], dtype="int")), axis=1)

formula = "RRMS ~ age + THALAMUS_1 + Female + tiv + dzdur"
res = sm.Logit.from_formula(formula, data=model_data).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.244788
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                   RRMS   No. Observations:                  368
Model:                          Logit   Df Residuals:                      362
Method:                           MLE   Df Model:                            5
Date:                Fri, 06 Jun 2025   Pseudo R-squ.:                  0.3592
Time:                        17:41:15   Log-Likelihood:                -90.082
converged:                       True   LL-Null:                       -140.58
Covariance Type:            nonrobust   LLR p-value:                 3.244e-20
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.0153      0.310      9.712      0.000       2.407       3.624
age           -0.9106      0.

In [33]:
covars = "age + Female + tiv"
outcome = "RRMS"

pvals = {}
coefs = {}
se = {}
all_results = {}
for struct in df_structs.loc[thalamic_nuclei_inds, 'struct']:
    formula = f"{outcome} ~ {covars} + {struct}"
    res = sm.Logit.from_formula(formula, data=model_data).fit()
    pvals[struct] = res.pvalues[struct]
    coefs[struct] = res.params[struct]
    # se[struct] = res.HC0_se[struct]
    all_results[struct] = res

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals, "se": se})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results = regression_results.sort_values(by="coef", ascending=True)
regression_results

Optimization terminated successfully.
         Current function value: 0.268260
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.276843
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.276132
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.258476
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.259288
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.257989
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.270770
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.275042
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.278471
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.262450
  

Unnamed: 0,coef,pvals,se,p_fdr
CM_11,0.473,0.02905,,0.02905
VA_4,0.576,0.01344,,0.01494
VLa_5,0.583,0.01165,,0.01456
MGN_10,0.584,0.005444,,0.007778
LGN_9,0.667,0.001391,,0.002318
MD_Pf_12,0.785,7.174e-05,,0.0001793
AV_2,0.832,0.0007468,,0.001494
VLP_6,0.894,1.743e-05,,6.433e-05
VPL_7,0.897,1.93e-05,,6.433e-05
Pul_8,0.927,1.847e-05,,6.433e-05
