In [1]:
import os
import re
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import nibabel as nib
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from matplotlib import colormaps
from scipy import stats

from mri_data import file_manager as fm

sys.path.append("/home/srs-9/Projects/ms_mri/analysis/paper1")

import helpers

In [2]:
drive_root = fm.get_drive_root()
msmri_home = Path("/home/srs-9/Projects/ms_mri")
msmri_datadir = msmri_home / "data"
curr_dir = Path(os.getcwd())
data_dir = curr_dir / "data0"
cp_data_dir = Path("/home/srs-9/Projects/ms_mri/analysis/paper1/data0")
dataroot = Path("/media/smbshare/srs-9/thalamus_project/data")

showfigs = False
pd.options.display.precision = 3
subject_sessions = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/analysis/thalamus/data0/subject-sessions.csv",
    index_col="sub",
)

In [3]:
cp_vols = pd.read_csv(data_dir / "choroid_aschoplex_volumes.csv", index_col="subid")
tiv_data = pd.read_csv(data_dir / "tiv_data.csv", index_col="subid")
clinical_data = pd.read_csv(data_dir / "clinical_data_processed.csv", index_col="subid")
df = cp_vols.join([tiv_data, clinical_data])

df_z = df.copy()
numeric_vars = [col for col in df_z.columns if pd.api.types.is_numeric_dtype(df_z[col])]
df_z[numeric_vars] = df_z[numeric_vars].apply(stats.zscore, nan_policy="omit")

In [4]:
df_thomas = pd.read_csv(data_dir / "hipsthomas_vols.csv", index_col="subid")
cols_orig = df_thomas.columns
new_colnames = {}
for col in df_thomas.columns:
    new_col = re.sub(r"(\d+)-([\w-]+)", r"\2_\1", col)
    new_col = re.sub("-", "_", new_col)
    new_colnames[col] = new_col
df_thomas = df_thomas.rename(columns=new_colnames)
df_thomas_norm = df_thomas.apply(lambda col: col / df_thomas['THALAMUS_1'])
df_thomas_z = df_thomas.apply(stats.zscore, nan_policy="omit")
df_thomas_norm_z = df_thomas_norm.apply(stats.zscore, nan_policy="omit")


new_index = []
for col in df_thomas.columns:
    new_index.append(int(re.match(r".+_(\d+)$", col)[1]))

df_structs = pd.DataFrame({'struct': df_thomas.columns}, index=new_index)
ind_struct_lookup = {}
for i, row in df_structs.iterrows():
    ind_struct_lookup[i] = row['struct']
struct_ind_lookup = {}
for i, row in df_structs.iterrows():
    struct_ind_lookup[row['struct']] = i

thalamic_nuclei = df_thomas.columns[1:11]
thalamic_nuclei_inds = [struct_ind_lookup[struct] for struct in thalamic_nuclei]

### MS vs !MS

In [None]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type2'].isin(["MS", "!MS"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['dz_type2'], dtype="int")), axis=1)

formula = "MS ~ THALAMUS_1 + age + Female + tiv"
res = sm.Logit.from_formula(formula, data=model_data).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.383136
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                     MS   No. Observations:                  489
Model:                          Logit   Df Residuals:                      483
Method:                           MLE   Df Model:                            5
Date:                Tue, 20 May 2025   Pseudo R-squ.:                  0.1059
Time:                        10:42:21   Log-Likelihood:                -187.35
converged:                       True   LL-Null:                       -209.55
Covariance Type:            nonrobust   LLR p-value:                 1.936e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.0545      0.168     12.251      0.000       1.726       2.383
THALAMUS_1    -1.2582      0.

In [11]:
covars = "THALAMUS_1 + age + Female + tiv"
outcome = "MS"

pvals = {}
coefs = {}
se = {}
all_results = {}
for struct in df_structs.loc[thalamic_nuclei_inds, 'struct']:
    formula = f"{outcome} ~ {covars} + {struct}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[struct] = res.pvalues[struct]
    coefs[struct] = res.params[struct]
    se[struct] = res.HC0_se[struct]
    all_results[struct] = res

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals, "se": se})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results = regression_results.sort_values(by="coef", ascending=True)
regression_results

Unnamed: 0,coef,pvals,se,p_fdr
Pul_8,-0.144,0.003,0.046,0.019
AV_2,-0.069,0.004,0.026,0.019
MD_Pf_12,-0.004,0.919,0.038,0.919
CM_11,-0.003,0.91,0.027,0.919
LGN_9,-0.003,0.903,0.026,0.919
VA_4,0.01,0.722,0.027,0.919
VLa_5,0.015,0.537,0.024,0.895
MGN_10,0.026,0.359,0.03,0.718
VPL_7,0.035,0.279,0.03,0.697
VLP_6,0.075,0.038,0.035,0.126


### RMS vs PMS

In [19]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['dz_type5'].isin(["RMS", "PMS"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['dz_type5'], dtype="int")), axis=1)

formula = "PMS ~ THALAMUS_1 + age + Female + tiv"
res = sm.Logit.from_formula(formula, data=model_data).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.358903
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                    PMS   No. Observations:                  414
Model:                          Logit   Df Residuals:                      409
Method:                           MLE   Df Model:                            4
Date:                Tue, 20 May 2025   Pseudo R-squ.:                  0.2931
Time:                        10:51:39   Log-Likelihood:                -148.59
converged:                       True   LL-Null:                       -210.18
Covariance Type:            nonrobust   LLR p-value:                 1.113e-25
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -2.0499      0.198    -10.371      0.000      -2.437      -1.663
THALAMUS_1    -0.7068      0.

In [21]:
covars = "THALAMUS_1 + age + Female + tiv"
outcome = "PMS"

pvals = {}
coefs = {}
se = {}
all_results = {}
for struct in df_structs.loc[thalamic_nuclei_inds, 'struct']:
    formula = f"{outcome} ~ {covars} + {struct}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[struct] = res.pvalues[struct]
    coefs[struct] = res.params[struct]
    se[struct] = res.HC0_se[struct]
    all_results[struct] = res

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals, "se": se})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results = regression_results.sort_values(by="coef", ascending=True)
regression_results

Unnamed: 0,coef,pvals,se,p_fdr
VLP_6,-0.053,0.175,0.043,0.826
LGN_9,-0.018,0.474,0.024,0.83
AV_2,-0.011,0.674,0.026,0.83
VLa_5,-0.011,0.672,0.026,0.83
VA_4,0.006,0.839,0.029,0.839
CM_11,0.009,0.747,0.028,0.83
VPL_7,0.02,0.553,0.039,0.83
MD_Pf_12,0.036,0.441,0.054,0.83
MGN_10,0.06,0.054,0.03,0.536
Pul_8,0.061,0.248,0.056,0.826


### PPMS vs SPMS

Thalamic volume predicts PPMS vs SPMS. PPMS associated with greater thalamic volume, after controlling for age, sex, tiv, and disease duration

In [18]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['ms_type'].isin(["PPMS", "SPMS"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['ms_type'], dtype="int")), axis=1)

formula = "PPMS ~ thalamus + age + Female + tiv"
res = sm.Logit.from_formula(formula, data=model_data).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.617482
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   PPMS   No. Observations:                   78
Model:                          Logit   Df Residuals:                       73
Method:                           MLE   Df Model:                            4
Date:                Tue, 20 May 2025   Pseudo R-squ.:                 0.08107
Time:                        10:51:27   Log-Likelihood:                -48.164
converged:                       True   LL-Null:                       -52.413
Covariance Type:            nonrobust   LLR p-value:                   0.07494
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1520      0.397     -0.383      0.702      -0.930       0.626
thalamus       0.5435      0.

In [16]:
covars = "THALAMUS_1 + age + Female + tiv"
outcome = "PPMS"

pvals = {}
coefs = {}
se = {}
all_results = {}
for struct in df_structs.loc[thalamic_nuclei_inds, 'struct']:
    formula = f"{outcome} ~ {covars} + {struct}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[struct] = res.pvalues[struct]
    coefs[struct] = res.params[struct]
    se[struct] = res.HC0_se[struct]
    all_results[struct] = res

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals, "se": se})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results = regression_results.sort_values(by="coef", ascending=True)
regression_results

Unnamed: 0,coef,pvals,se,p_fdr
CM_11,-0.273,0.002,0.075,0.025
VA_4,-0.2,0.059,0.1,0.174
VLa_5,-0.198,0.017,0.072,0.085
MGN_10,-0.111,0.288,0.106,0.411
LGN_9,-0.099,0.288,0.091,0.411
VLP_6,-0.084,0.431,0.106,0.539
AV_2,-0.002,0.986,0.093,0.986
MD_Pf_12,0.023,0.859,0.122,0.955
VPL_7,0.148,0.123,0.088,0.246
Pul_8,0.288,0.07,0.15,0.174


In [46]:
model_data = df_z.join(df_thomas_z)
model_data = model_data[model_data['ms_type'].isin(["SPMS", "RRMS"])]
model_data = pd.concat((model_data, pd.get_dummies(model_data['ms_type'], dtype="int")), axis=1)

formula = "RRMS ~ age + choroid_volume + Female + tiv + dzdur"
res = sm.Logit.from_formula(formula, data=model_data).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.260491
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                   RRMS   No. Observations:                  366
Model:                          Logit   Df Residuals:                      360
Method:                           MLE   Df Model:                            5
Date:                Thu, 15 May 2025   Pseudo R-squ.:                  0.3110
Time:                        18:24:49   Log-Likelihood:                -95.340
converged:                       True   LL-Null:                       -138.38
Covariance Type:            nonrobust   LLR p-value:                 4.453e-17
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          2.8356      0.288      9.861      0.000       2.272       3.399
age              