In [95]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import numpy as np
import re
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import mannwhitneyu
from scipy import stats
import matplotlib.ticker as ticker
import os
from reload_recursive import reload_recursive
import patsy
from pyprocessmacro import Process
from statsmodels.stats.mediation import Mediation
from statsmodels.miscmodels.ordinal_model import OrderedModel
from pingouin import mediation_analysis
from statsmodels.stats.outliers_influence import variance_inflation_factor


from mri_data import file_manager as fm
import helpers

In [96]:
reload_recursive(helpers)

## Setup

In [97]:
drive_root = fm.get_drive_root()
msmri_home = Path("/home/srs-9/Projects/ms_mri")
msmri_datadir = msmri_home / "data"
curr_dir = Path(os.getcwd())
data_dir = curr_dir / "data0"
showfigs = False

In [98]:
df_full = pd.read_csv(data_dir / "t1_data_full.csv")
df_full = df_full.set_index("subid")

df_full = helpers.set_dz_type5(df_full)
df_full = helpers.set_dz_type3(df_full)
df_full = helpers.set_dz_type2(df_full)
df = helpers.fix_edss(df_full)
df = helpers.clean_df(df)

keep_cols = [
    "subject",
    "age",
    "sex",
    "ms_type",
    "dz_type2",
    "dz_type3",
    "dz_type5",
    "dzdur",
    "EDSS",
    "MSSS",
    "gMSSS",
    "ARMSS",
    "DMT_score",
    "DMT_hx_all",
    "flair_contrast",
    "lesion_count",
    "lesion_vol_cubic",
    "PRL",
    "tiv",
    "choroid_volume",
    "pineal_volume",
    "pituitary_volume",
]

df = df.loc[:, keep_cols]
df = pd.concat((df, pd.get_dummies(df["sex"])), axis=1)

In [99]:
df.loc[:, "lesion_vol_logtrans"] = np.log(df["lesion_vol_cubic"])
df.loc[:, "edss_sqrt"] = np.sqrt(df["EDSS"].astype("float"))
df.loc[:, "msss_sqrt"] = np.sqrt(df["MSSS"])
df.loc[:, "armss_sqrt"] = np.sqrt(df["ARMSS"])
df.loc[:, "gmsss_sqrt"] = np.sqrt(df["gMSSS"])

In [121]:
vars = [
    "age",
    "Female",
    "dzdur",
    "EDSS",
    "MSSS",
    "gMSSS",
    "ARMSS",
    "edss_sqrt",
    "msss_sqrt",
    "armss_sqrt",
    "gmsss_sqrt",
    "DMT_score",
    "DMT_hx_all",
    "lesion_count",
    "lesion_vol_cubic",
    "lesion_vol_logtrans",
    "PRL",
    "tiv",
    "choroid_volume",
    "pineal_volume",
    "pituitary_volume"
]

for var in vars:
    df[var] = pd.to_numeric(df[var])

In [122]:
vars_to_center = [
    "edss_sqrt",
    "lesion_vol_logtrans",
    "lesion_vol_cubic",
    "dzdur",
    "choroid_volume",
    "pituitary_volume",
]

for var in vars_to_center:
    df[f"{var}_cent"] = df[var] - df[var].mean()

centered_vars = [f"{var}_cent" for var in vars_to_center]
vars.extend(centered_vars)

In [123]:
df_z = df[vars].astype("float")
df_z[df.columns[~df.columns.isin(vars)]] = df[df.columns[~df.columns.isin(vars)]]
df_z = df_z[df.columns]
df_z[vars] = df_z[vars].apply(stats.zscore, nan_policy="omit")

data = df[vars].astype("float")
data_z = data[vars].apply(stats.zscore, nan_policy="omit")

data_ms = df.loc[df["dz_type5"].isin(["RMS", "PMS"]), :]
data_ms = data_ms[vars].astype("float")
data_ms_z = data_ms[vars].apply(stats.zscore, nan_policy="omit")

In [124]:
rad_df = pd.read_csv("/home/srs-9/Projects/ms_mri/radiomics/pituitary-t1-radiomics.csv")
rad_df = rad_df.set_index("subid")
rad_df_z = rad_df.apply(stats.zscore)

In [113]:
all_rad_features = [
    "original_shape_Elongation",
    "original_shape_Flatness",
    "original_shape_LeastAxisLength",
    "original_shape_MajorAxisLength",
    "original_shape_Maximum2DDiameterColumn",
    "original_shape_Maximum2DDiameterRow",
    "original_shape_Maximum2DDiameterSlice",
    "original_shape_Maximum3DDiameter",
    "original_shape_MeshVolume",
    "original_shape_MinorAxisLength",
    "original_shape_Sphericity",
    "original_shape_SurfaceArea",
    "original_shape_SurfaceVolumeRatio",
    "original_shape_VoxelVolume"
]

longest_feat_name_len = max([len(feat) for feat in all_rad_features])
n_feat_spaces = {}
for feat in all_rad_features:
    n_feat_spaces[feat] = longest_feat_name_len - len(feat)

In [31]:
def check_vif(data, vif_vars):
    vif_data = rad_df[vif_vars].dropna()
    vif = pd.DataFrame()
    vif["feature"] = vif_data.columns
    vif = vif.set_index("feature")
    vif['VIF'] = [variance_inflation_factor(vif_data.values, i, )
                            for i in range(len(vif_data.columns))]
    return vif

### Regression on Lesion Volume

In [38]:
rad_features = ["original_shape_Elongation",
               "original_shape_VoxelVolume"]

vif = check_vif(rad_df, rad_features)
print(vif)

                                 VIF
feature                             
original_shape_Elongation   13.12332
original_shape_VoxelVolume  13.12332


In [115]:
def regress_iter(model, data, outcome, features, covars=None, **kwargs):
    if covars is None:
        covars = []
    p_vals = []
    for feature in features:    
       predictors = covars + [feature]
       res = model(data[outcome], data[predictors], **kwargs).fit(disp=False)
       p_vals.append(res.pvalues[feature])
    
    return p_vals, features

In [129]:
model_data = pd.concat([data, rad_df], axis=1)

p_vals, _ = regress_iter(sm.OLS, model_data, "lesion_vol_logtrans", all_rad_features, covars=["age", "Female", "tiv"],
                         hasconst=True, missing="drop")

for feature, p in zip(all_rad_features, p_vals):
    print("{}: {:0.2}".format(feature, p))

original_shape_Elongation: 0.23
original_shape_Flatness: 0.91
original_shape_LeastAxisLength: 0.91
original_shape_MajorAxisLength: 0.59
original_shape_Maximum2DDiameterColumn: 0.21
original_shape_Maximum2DDiameterRow: 0.094
original_shape_Maximum2DDiameterSlice: 0.51
original_shape_Maximum3DDiameter: 0.82
original_shape_MeshVolume: 0.99
original_shape_MinorAxisLength: 0.035
original_shape_Sphericity: 0.18
original_shape_SurfaceArea: 0.16
original_shape_SurfaceVolumeRatio: 0.055
original_shape_VoxelVolume: 0.98


In [51]:
corrected_p_values = stats.false_discovery_control(p_vals)
for feature, p in zip(all_rad_features, corrected_p_values):
    print("{}: {:0.2}".format(feature, p))

original_shape_Elongation: 0.19
original_shape_Flatness: 0.49
original_shape_LeastAxisLength: 0.4
original_shape_MajorAxisLength: 0.43
original_shape_Maximum2DDiameterColumn: 0.23
original_shape_Maximum2DDiameterRow: 0.16
original_shape_Maximum2DDiameterSlice: 0.35
original_shape_Maximum3DDiameter: 0.51
original_shape_MeshVolume: 0.4
original_shape_MinorAxisLength: 0.15
original_shape_Sphericity: 0.51
original_shape_SurfaceArea: 0.16
original_shape_SurfaceVolumeRatio: 0.15
original_shape_VoxelVolume: 0.4


In [72]:
model_data = pd.concat([data, rad_df], axis=1)
model_data = model_data.loc[df_full['dz_type5'].isin(['RMS', 'PMS'])]

In [None]:
model_data = pd.concat([df, rad_df], axis=1)
model_data = model_data.loc[df_full['dz_type2'].isin(['MS', '!MS'])]
model_data = pd.concat(
    (model_data, pd.get_dummies(model_data['dz_type2'])), axis=1
)
model_data['MS'] = model_data['MS'].astype("int64")

rad_features = ["original_shape_MajorAxisLength",
                "original_shape_MinorAxisLength"]
predictors = ["age", "Female"] + rad_features
outcome = ["MS"]

logit_mod = sm.Logit(model_data[outcome].astype("float"), model_data[predictors].astype("float"), missing="drop")
res = logit_mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.459983
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                     MS   No. Observations:                  413
Model:                          Logit   Df Residuals:                      409
Method:                           MLE   Df Model:                            3
Date:                Thu, 20 Feb 2025   Pseudo R-squ.:                -0.02836
Time:                        21:52:02   Log-Likelihood:                -189.97
converged:                       True   LL-Null:                       -184.73
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
age                                0.0162      0.009      1.765     

**Why is TIV significant here, but not in data_analysis.ipynb?**

In [191]:
model_data = pd.concat([data, rad_df], axis=1)
# model_data = df_z.loc[df_z['dz_type5'].isin(['RMS', 'PMS'])]
model_data = model_data.loc[df_full['dz_type5'].isin(["PMS", "RMS"])]
model_data = pd.concat(
    (model_data, pd.get_dummies(df_full.loc[model_data.index, 'dz_type5'])), axis=1
)
# model_data_ind = set(model_data.index)
# rad_df_ind = set(rad_df.index)


rad_features = ["original_shape_MinorAxisLength"]
predictors = ["age", "Female", "tiv", "original_shape_SurfaceVolumeRatio", "Intercept"]
outcome = ["PMS"]
model_data['PMS'] = model_data['PMS'].astype("int64")
model_data['Intercept'] = 1

logit_mod = sm.Logit(model_data[outcome], model_data[predictors], missing="drop")
# logit_mod = sm.Logit.from_formula("PMS ~ original_shape_MinorAxisLength + age + tiv + Female", data=model_data)
res = logit_mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.386002
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                    PMS   No. Observations:                  345
Model:                          Logit   Df Residuals:                      340
Method:                           MLE   Df Model:                            4
Date:                Thu, 20 Feb 2025   Pseudo R-squ.:                  0.2407
Time:                        21:57:07   Log-Likelihood:                -133.17
converged:                       True   LL-Null:                       -175.38
Covariance Type:            nonrobust   LLR p-value:                 2.024e-17
                                        coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
age                                   0.1068      0.016      6

In [187]:
model_data = pd.concat([df, rad_df], axis=1)
model_data = model_data.loc[model_data['dz_type5'].isin(["PMS", "RMS", "NIND", "OIND"])]
model_data = pd.concat(
    (model_data, pd.get_dummies(model_data['dz_type5'])), axis=1
)

model_data['phenotype'] = 0
model_data.loc[model_data['dz_type5'] == "NIND", 'phenotype'] = 1
model_data.loc[model_data['dz_type5'] == "OIND", 'phenotype'] = 2
model_data.loc[model_data['dz_type5'] == "RMS", 'phenotype'] = 3
model_data.loc[model_data['dz_type5'] == "PMS", 'phenotype'] = 4

print(sum(model_data['phenotype'] == 0))

rad_features = ["original_shape_MinorAxisLength"]

predictors = ["age", "Female", "original_shape_Elongation", "tiv"]
outcome = ["phenotype"]

mod_prob = OrderedModel(model_data[outcome].astype("float"), model_data[predictors].astype("float"), distr='logit', missing="drop")
res = mod_prob.fit()
print(res.summary())

0
                             OrderedModel Results                             
Dep. Variable:              phenotype   Log-Likelihood:                -384.83
Model:                   OrderedModel   AIC:                             783.7
Method:            Maximum Likelihood   BIC:                             811.8
Date:                Thu, 20 Feb 2025                                         
Time:                        21:56:33                                         
No. Observations:                 411                                         
Df Residuals:                     404                                         
Df Model:                           4                                         
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
age                           0.0362      0.009      4.219      0.000       0.019       0.053
Femal

  retvals = optimize.fmin(f, start_params, args=fargs, xtol=xtol,


### Clinical Disability

In [192]:
model_data = pd.concat([data, rad_df], axis=1)
model_data = model_data.loc[df_full['dz_type5'].isin(["PMS", "RMS"])]
model_data['Intercept'] = 1

rad_features = [
                "original_shape_MinorAxisLength"]
predictors = ["age", "Female", "choroid_volume", "tiv", "Intercept"]
outcome = "edss_sqrt"

model = sm.OLS(model_data[outcome], model_data[predictors], missing="drop", hasconst=True)
res = model.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:              edss_sqrt   R-squared:                       0.225
Model:                            OLS   Adj. R-squared:                  0.217
Method:                 Least Squares   F-statistic:                     28.48
Date:                Thu, 20 Feb 2025   Prob (F-statistic):           8.62e-21
Time:                        21:58:05   Log-Likelihood:                -294.77
No. Observations:                 397   AIC:                             599.5
Df Residuals:                     392   BIC:                             619.5
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
age                0.0188      0.002      8.