In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.insert(0, "/home/srs-9/Projects/ms_mri/analysis/thalamus/helpers")

import helpers
import utils
import json

In [30]:
# Optional exclusive upper bound on the baseline `dzdur` column; None keeps
# every MS subject. (Units are whatever `dzdur` is stored in — presumably
# years; confirm against the baseline table.)
max_dzdur = None

# HIPS-THOMAS structure names, keyed by the CSV's "index" column.
hips_thomas_ref = pd.read_csv(
    "/home/srs-9/Projects/ms_mri/data/hipsthomas_struct_index.csv", index_col="index"
)["struct"]
# Index labels of hips_thomas_ref that later cells score as thalamic structures.
thalamic_nuclei = [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


def _load_longitudinal(csv_path):
    """Read a two-timepoint volume table and add the scan interval.

    `time1` / `time2` are parsed as YYYYMMDD dates and `interval_years` is
    set to the day difference divided by 365.25.
    """
    volumes = pd.read_csv(csv_path)
    for tp in ("time1", "time2"):
        volumes[tp] = pd.to_datetime(volumes[tp], format="%Y%m%d")
    volumes["interval_years"] = (volumes["time2"] - volumes["time1"]).dt.days / 365.25
    return volumes


# Load longitudinal volumes (per-structure table and whole-thalamus table)
df_long = _load_longitudinal(
    "/home/srs-9/Projects/ms_mri/longitudinal_pipeline/data0/full_volumes.csv"
)
df_long2 = _load_longitudinal(
    "/home/srs-9/Projects/ms_mri/longitudinal_pipeline/data0/full_volumes_thalamus.csv"
)

# Nuclei that are summed into composite group volumes.
groups = {
    "medial": ["MD_Pf_12", "CM_11"],
    "ventral": ["VA_4", "VLa_5", "VLP_6", "VPL_7"],
    "posterior": ["Pul_8", "MGN_10", "LGN_9"],
    "anterior": ["AV_2"]
}
for group, nucs in groups.items():
    for tp in ("time1", "time2"):
        cols = [f"{nuc}_{tp}" for nuc in nucs]
        # min_count: a group volume is only defined when every member nucleus
        # is present; the default sum would silently count a missing nucleus
        # as 0 and understate the group volume.
        df_long[f"{group}_{tp}"] = df_long[cols].sum(axis=1, min_count=len(cols))


# Keep identifiers first, then time1/time2 pairs for every structure and group.
cols_ordered = ["subid", "interval_years", "time1", "time2"]
for struct in hips_thomas_ref.to_list() + list(groups.keys()):
    cols_ordered.extend([f"{struct}_time1", f"{struct}_time2"])
df_long = df_long[cols_ordered]

# Load baseline covariates (adjust path as needed)
# Expected columns: subid, T2LV, dzdur, age, Female, tiv, CP
df_base = utils.load_data("/home/srs-9/Projects/ms_mri/analysis/thalamus/results/data_wchaco.csv")
df_base = df_base.reset_index()

# Merge on subid
df = df_long.merge(df_base, on='subid', how='inner')

# Attach whole-thalamus volumes by key, not by position. After the inner merge
# `df` carries a fresh 0..n-1 index, so assigning df_long2 columns directly
# would align on index labels and could pair a subject with another subject's
# thalamus volume whenever the merge dropped or reordered rows.
# NOTE(review): assumes full_volumes_thalamus.csv has a `subid` column with one
# row per subject; `validate` raises if subids are duplicated there.
thal_cols = ["thalamus_time1", "thalamus_time2"]
df = df.drop(columns=thal_cols, errors="ignore").merge(
    df_long2[["subid"] + thal_cols], on="subid", how="left", validate="many_to_one"
)

# Candidate manual exclusions, currently disabled:
# skip_subs = [1027, 1264, 1163]
# df = df[~df['subid'].isin(skip_subs)]

# Restrict to MS subjects; copy so later cells add columns to an independent
# frame rather than to a slice of the merged table.
df = df[df['dz_type3'] == "MS"].copy()

if max_dzdur is not None:
    df = df[df['dzdur'] < max_dzdur].copy()
    print(f"N after dzdur<{max_dzdur}: {len(df)}")

print(f"N: {len(df)}")
print(f"Interval range: {df['interval_years'].min():.2f} – {df['interval_years'].max():.2f} years")



N: 185
Interval range: 3.02 – 5.42 years


In [43]:
# Spot check: raw (non-annualized) fractional change of THALAMUS_1 for the row labelled 0.
baseline_vol = df.loc[0, "THALAMUS_1_time1"]
followup_vol = df.loc[0, "THALAMUS_1_time2"]
(followup_vol - baseline_vol) / baseline_vol

np.float64(-0.004176268590629131)

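### Compute annualized change scores
Annualized percent change is used to account for variable follow-up intervals:

$$\%\Delta V_{\text{ann}} = \frac{V_2 - V_1}{V_1} \times \frac{100}{\Delta t}$$

where $V_1$ and $V_2$ are the volumes at the first and second scan and $\Delta t$ is the interval between them in years.

This is our primary longitudinal outcome.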

In [31]:
# Structures to score. Keys are column prefixes (`<prefix>_time1` / `_time2`);
# values are the labels used when printing results (currently identical).
structures = {
    hips_thomas_ref[i]: hips_thomas_ref[i] for i in thalamic_nuclei
}
structures.update({group: group for group in groups})
structures.update({"thalamus": "thalamus"})


def _add_change_scores(frame, prefix):
    """Add raw and annualized change columns for one structure, in place.

    Reads `<prefix>_time1`, `<prefix>_time2` and `interval_years` from `frame`
    and writes `<prefix>_pct_change`, `<prefix>_abs_change`,
    `<prefix>_ann_pct_change` and `<prefix>_ann_abs_change` back to it.
    """
    v1 = frame[f'{prefix}_time1']
    v2 = frame[f'{prefix}_time2']
    abs_change = v2 - v1
    pct_change = abs_change / v1 * 100
    frame[f'{prefix}_pct_change'] = pct_change
    frame[f'{prefix}_abs_change'] = abs_change
    frame[f'{prefix}_ann_pct_change'] = pct_change / frame['interval_years']
    frame[f'{prefix}_ann_abs_change'] = abs_change / frame['interval_years']


# Each frame is scored from its OWN volumes and interval. `df` (merged,
# MS-only) and `df_long` (all subjects) have unrelated row indexes, so copying
# a Series computed on `df` into `df_long` would align on index labels and
# attach one subject's change to another subject's row and interval.
skipped_in_long = []
for col_prefix, label in structures.items():
    _add_change_scores(df, col_prefix)
    if {f'{col_prefix}_time1', f'{col_prefix}_time2'}.issubset(df_long.columns):
        _add_change_scores(df_long, col_prefix)
    else:
        # No source volumes for this structure in df_long, so nothing valid to compute.
        skipped_in_long.append(col_prefix)

if skipped_in_long:
    print(f"No time1/time2 volumes in df_long for: {skipped_in_long} (change scores not added there)")

print(df[[f'{col_prefix}_ann_pct_change' for col_prefix in structures]].describe().round(3))


       THALAMUS_1_ann_pct_change  AV_2_ann_pct_change  VA_4_ann_pct_change  \
count                    185.000              185.000              185.000   
mean                      -0.489               -1.256               -0.794   
std                        0.789                5.524                1.132   
min                       -3.924              -17.796               -4.075   
25%                       -0.848               -4.478               -1.478   
50%                       -0.452               -2.022               -0.795   
75%                       -0.055                1.274                0.001   
max                        2.095               20.432                2.759   

       VLa_5_ann_pct_change  VLP_6_ann_pct_change  VPL_7_ann_pct_change  \
count               185.000               185.000               185.000   
mean                 -0.879                -0.684                -0.921   
std                   1.814                 1.266                 2.050 

In [32]:

# Plausible range for THALAMUS_1 volume at either timepoint (same units as the
# volume columns — presumably mm^3; confirm). An earlier, stricter lower bound
# of 5500 was immediately overridden in the original cell; only the effective
# values are kept here.
THAL_MIN, THAL_MAX = 4000, 13000
ANN_CHANGE_THRESH = 5  # % per year — biologically implausible

# Each flag is True when the subject passes that check. Missing values compare
# as False in both `between` and `<`, so subjects with missing volumes fail QC.
df['qc_thal_t1_range'] = df['THALAMUS_1_time1'].between(THAL_MIN, THAL_MAX)
df['qc_thal_t2_range'] = df['THALAMUS_1_time2'].between(THAL_MIN, THAL_MAX)
df['qc_ann_change']    = df['THALAMUS_1_ann_pct_change'].abs() < ANN_CHANGE_THRESH
df['qc_pass']          = df['qc_thal_t1_range'] & df['qc_thal_t2_range'] & df['qc_ann_change']

print("\nQC summary:")
print(f"  Fail thal range at T1:    {(~df['qc_thal_t1_range']).sum()}")
print(f"  Fail thal range at T2:    {(~df['qc_thal_t2_range']).sum()}")
print(f"  Fail ann change threshold:{(~df['qc_ann_change']).sum()}")
print(f"  Total QC pass:            {df['qc_pass'].sum()} / {len(df)}")

# Inspect failures
df_fail = df[~df['qc_pass']][['subid', 'interval_years',
                               'THALAMUS_1_time1', 'THALAMUS_1_time2',
                               'THALAMUS_1_pct_change', 'THALAMUS_1_ann_pct_change']]
print("\nFailed subjects:")
print(df_fail.to_string())

# Independent copy of the passing subjects for all downstream analyses.
df_qc = df[df['qc_pass']].copy()
print(f"\nProceeding with N={len(df_qc)} after QC")



QC summary:
  Fail thal range at T1:    0
  Fail thal range at T2:    0
  Fail ann change threshold:0
  Total QC pass:            185 / 185

Failed subjects:
Empty DataFrame
Columns: [subid, interval_years, THALAMUS_1_time1, THALAMUS_1_time2, THALAMUS_1_pct_change, THALAMUS_1_ann_pct_change]
Index: []

Proceeding with N=185 after QC


In [33]:
# For every structure, test whether the mean annualized % change among
# QC-passed subjects (missing values dropped) differs from zero.
table_header = f"{'Structure':<25} {'Mean %/yr':>10} {'t':>8} {'p':>8}"
print("One-sample t-tests (H0: no change):")
print(table_header)
print("-" * 55)
for prefix, display_name in structures.items():
    ann_pct = df_qc[f'{prefix}_ann_pct_change'].dropna()
    t_stat, p_value = stats.ttest_1samp(ann_pct, 0)
    print(f"{display_name:<25} {ann_pct.mean():>10.3f} {t_stat:>8.3f} {p_value:>8.4f}")


One-sample t-tests (H0: no change):
Structure                  Mean %/yr        t        p
-------------------------------------------------------
THALAMUS_1                    -0.489   -8.437   0.0000
AV_2                          -1.256   -3.092   0.0023
VA_4                          -0.794   -9.543   0.0000
VLa_5                         -0.879   -6.589   0.0000
VLP_6                         -0.684   -7.343   0.0000
VPL_7                         -0.921   -6.111   0.0000
Pul_8                         -0.457   -4.652   0.0000
LGN_9                         -0.862   -3.686   0.0003
MGN_10                        -0.524   -3.486   0.0006
CM_11                         -0.305   -1.378   0.1699
MD_Pf_12                       0.068    0.625   0.5330
Hb_13                          0.457    0.536   0.5927
medial                         0.001    0.015   0.9880
ventral                       -0.795  -11.502   0.0000
posterior                     -0.509   -5.581   0.0000
anterior                    

In [37]:
# Standardise the QC-passed data with the project helper (presumably a
# column-wise z-score — see utils.zscore), then regress one structure's
# annualized absolute volume change on baseline covariates.
df_qc_z = utils.zscore(df_qc)

# Structure under test. Both the outcome and the structure-specific `_chaco`
# covariate are derived from it, so changing `struct` can no longer leave the
# pulvinar covariate paired with a different outcome.
# NOTE(review): assumes a `<struct>_chaco` column exists for any structure chosen here.
struct = "Pul_8"
covariates = ["WBV", f"{struct}_chaco", "age", "Female", "tiv", "dzdur"]
formula = f"{struct}_ann_abs_change ~ " + " + ".join(covariates)

res = smf.ols(formula, data=df_qc_z).fit()
print(res.summary())

                             OLS Regression Results                             
Dep. Variable:     Pul_8_ann_abs_change   R-squared:                       0.053
Model:                              OLS   Adj. R-squared:                  0.020
Method:                   Least Squares   F-statistic:                     1.625
Date:                  Thu, 26 Feb 2026   Prob (F-statistic):              0.143
Time:                          18:04:05   Log-Likelihood:                -252.96
No. Observations:                   181   AIC:                             519.9
Df Residuals:                       174   BIC:                             542.3
Df Model:                             6                                         
Covariance Type:              nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      -0.0467      0.