In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
import os
from reload_recursive import reload_recursive
from pyprocessmacro import Process
from statsmodels.stats.mediation import Mediation
from statsmodels.miscmodels.ordinal_model import OrderedModel
import sys
import re

from mri_data import file_manager as fm
import helpers

In [2]:
reload_recursive(helpers)

### Paths and Config

In [3]:
drive_root = fm.get_drive_root()
msmri_home = Path("/home/srs-9/Projects/ms_mri")
msmri_datadir = msmri_home / "data"
curr_dir = Path(os.getcwd())
data_dir = curr_dir / "data0"
showfigs = False
pd.options.display.precision = 3
colors = helpers.get_colors()

### Load and Prepare Data

In [4]:
df_thomas = pd.read_csv(data_dir / "hipsthomas_vols.csv", index_col="subid")
cols_orig = df_thomas.columns

In [17]:
df_thomas = pd.read_csv(data_dir / "hipsthomas_vols.csv", index_col="subid")
new_colnames = {}
for col in df_thomas.columns:
    new_col = re.sub(r"(\d+)-([\w-]+)", r"\2_\1", col)
    new_col = re.sub("-", "_", new_col)
    new_colnames[col] = new_col

df_thomas = df_thomas.rename(columns=new_colnames)
df_thomas_norm = df_thomas.apply(lambda x: x / df_thomas['THALAMUS_1'])
df_thomas_z = df_thomas.apply(stats.zscore, nan_policy="omit")
df_thomas_norm_z = df_thomas_norm.apply(stats.zscore, nan_policy="omit")

In [6]:
df = pd.read_csv(data_dir / "t1_aschoplex_data_full.csv", index_col="subid")
df_full = df.copy()
df_tmp = pd.read_csv(data_dir / "t1_data_full.csv", index_col="subid")
df['pineal_volume'] = df_tmp['pineal_volume']
df['pituitary_volume'] = df_tmp['pituitary_volume']

df = helpers.get_mri_edss_delta(df)

keep_cols = [
    "subject",
    "age",
    "sex",
    "ms_type",
    "dzdur",
    "extracted_EDSS",
    "MSSS",
    "gMSSS",
    "ARMSS",
    "edss_mri_delta",
    "edss_date_closest",
    "mri_date_closest",
    "DMT_score",
    "DMT_hx_all",
    "TER",
    "DMF",
    "NAT",
    "INF",
    "flair_contrast",
    "thalamus",
    "brain",
    "white",
    "grey",
    "cortical_thickness",
    "lesion_count",
    "lesion_vol_cubic",
    "PRL",
    "tiv",
    "choroid_volume",
    "pineal_volume", 
    "pituitary_volume"
]
df = df.loc[:, keep_cols]

df = helpers.set_dz_type5(df)
df = helpers.set_dz_type3(df)
df = helpers.set_dz_type2(df)
df = helpers.fix_edss(df)
df = helpers.clean_df(df)
df = helpers.set_has_prl(df)
df = helpers.norm_volumes(df)

df.rename(columns={"lesion_vol_cubic": "t2lv"}, inplace=True)
df = df.rename(columns={"extracted_EDSS": "EDSS"})

df = helpers.do_sqrt_transform(df, ["EDSS", "MSSS", "ARMSS", "gMSSS"])
df = helpers.do_log_transform(df, ["t2lv"])

vars_to_center = ["EDSS_sqrt", "t2lv_logtrans", "t2lv", "dzdur", "choroid_volume", "PRL"]
df = helpers.do_center(df, vars_to_center)

vars_to_scale = [
    "age",
    "dzdur",
    "lesion_count",
    "t2lv",
    "t2lv_logtrans",
    "PRL",
    "tiv",
    "choroid_volume",
    "thalamus"
]
df = helpers.do_scale(df, vars_to_scale)

numeric_vars = [
    "age",
    "dzdur",
    "Female",
    "EDSS", "EDSS_sqrt",
    "MSSS", "MSSS_sqrt",
    "gMSSS", "gMSSS_sqrt",
    "ARMSS", "ARMSS_sqrt",
    "edss_mri_delta",
    "DMT_score",
    "DMT_hx_all",
    "TER",
    "DMF",
    "NAT",
    "INF",
    "thalamus",
    "brain",
    "white",
    "grey",
    "cortical_thickness",
    "lesion_count",
    "t2lv", "t2lv_logtrans",
    "PRL",
    "tiv",
    "choroid_volume",
    "norm_choroid_volume",
    "pineal_volume",
    "pituitary_volume"
]

for var in numeric_vars:
    df[var] = df[var].astype("float")

df_ms = df.loc[df['dz_type2'] == "MS"]
df_scale = df.copy() #temporary till I rename df_scale everywehre
df_scale_ms = df_scale.loc[df['dz_type2'] == "MS"]

df_z = df[numeric_vars].astype("float")
df_z[df.columns[~df.columns.isin(numeric_vars)]] = df[df.columns[~df.columns.isin(numeric_vars)]]
df_z = df_z[df.columns]
df_z[numeric_vars] = df_z[numeric_vars].apply(stats.zscore, nan_policy="omit")

# delete these vars once I fix all future variable references
data = df[numeric_vars].astype("float")
data_z = data[numeric_vars].apply(stats.zscore, nan_policy="omit")

data_ms = df.loc[df["dz_type5"].isin(["RMS", "PMS"]), :]
data_ms = data_ms[numeric_vars].astype("float")
data_ms_z = data_ms[numeric_vars].apply(stats.zscore, nan_policy="omit")

In [21]:
model_data = df_z.join(df_thomas_norm_z)
outcomes = df_thomas.columns[~df_thomas.columns.isin(["THALAMUS_1"])]
covariates = "age + Female + tiv"

pvals = {}
coefs = {}

for outcome in outcomes:
    formula = f"{outcome} ~ choroid_volume + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[outcome] = res.pvalues['choroid_volume']
    coefs[outcome] = res.params['choroid_volume']

regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
# regression_results.insert(0, "structure", cols_orig)
regression_results = regression_results.sort_values(by="coef", ascending=True)
print(regression_results)

           coef      pvals      p_fdr
Pul_8    -0.347  1.721e-13  6.022e-13
LGN_9    -0.169  1.017e-03  1.525e-03
MD_Pf_12 -0.130  9.174e-03  1.284e-02
AV_2     -0.114  1.312e-02  1.621e-02
CM_11    -0.004  9.303e-01  9.303e-01
Cla_28    0.070  1.520e-01  1.596e-01
Hb_13     0.081  9.988e-02  1.104e-01
VPL_7     0.119  1.537e-02  1.793e-02
MGN_10    0.123  1.244e-02  1.621e-02
Acc_26    0.227  1.767e-06  2.854e-06
Amy_34    0.286  9.178e-10  1.606e-09
GPi_30    0.299  3.109e-13  9.326e-13
Put_31    0.308  3.760e-11  7.178e-11
RN_32     0.309  2.867e-11  6.020e-11
GPe_29    0.322  9.869e-13  2.591e-12
GP_33     0.322  1.298e-13  5.451e-13
Cau_27    0.338  3.422e-12  7.985e-12
MTT_14    0.355  9.418e-15  6.328e-14
VLP_6     0.357  1.205e-14  6.328e-14
VLa_5     0.416  2.999e-19  3.149e-18
VA_4      0.448  2.038e-22  4.281e-21


In [19]:
df_thomas_norm_z

Unnamed: 0_level_0,THALAMUS_1,AV_2,VA_4,VLa_5,VLP_6,VPL_7,Pul_8,LGN_9,MGN_10,CM_11,...,MTT_14,Acc_26,Cau_27,Cla_28,GPe_29,GPi_30,Put_31,RN_32,GP_33,Amy_34
subid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002,,0.817,-0.269,-0.218,0.102,-0.677,0.378,0.230,0.050,0.089,...,-0.266,1.293,0.122,0.554,0.499,0.119,0.511,0.086,0.368,-0.236
1001,,-0.508,-1.083,-1.635,-0.699,0.268,1.055,1.067,-0.895,0.983,...,-0.194,-0.859,-0.768,-1.004,-0.653,-0.530,-0.560,-0.993,-0.570,0.756
1003,,-0.857,-0.965,-0.599,-1.109,0.654,-0.179,0.494,-0.928,-0.416,...,-0.309,-0.225,0.142,0.380,-0.265,-0.447,0.791,-0.998,-0.280,-0.310
1004,,-0.698,-2.381,2.104,-1.303,-8.370,-0.815,-4.970,-7.545,-6.270,...,-3.856,-5.520,3.795,2.435,2.939,-2.335,0.569,-5.657,0.786,-3.953
1005,,0.023,2.101,0.946,1.055,-0.598,-0.659,-0.151,2.722,1.255,...,2.152,1.853,2.294,0.928,3.181,2.512,2.433,3.816,3.074,1.934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,,-0.308,0.203,0.291,-0.579,-0.291,-0.360,-0.534,0.074,3.426,...,-0.445,0.421,1.272,0.532,1.388,1.631,0.880,0.964,1.532,0.964
3021,,-0.738,1.708,3.026,0.796,-1.103,-2.056,-0.480,2.934,2.999,...,2.204,2.291,2.385,1.910,2.277,1.510,3.886,2.787,2.110,0.848
3023,,0.201,-0.194,0.593,0.688,0.821,-0.924,0.015,0.285,0.588,...,0.641,0.627,0.569,-0.326,-0.204,-0.388,0.064,0.109,-0.215,0.707
3024,,1.241,-1.203,-1.108,0.490,-0.076,1.391,0.503,0.150,0.488,...,0.214,-0.692,-0.613,0.882,-0.544,-0.525,-0.322,-0.326,-0.526,0.391


In [22]:
# reg_results = regression_results[['structure', 'coef', 'pvals', 'p_fdr']]
regression_results[['structure', 'coef', 'pvals', 'p_fdr']].to_csv("hips-thomas_regression.csv", index=False)

In [23]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

In [38]:
df_thomas = pd.read_csv(data_dir / "hipsthomas_vols.csv", index_col="subid")
df_thomas_z = df_thomas.apply(stats.zscore, nan_policy="omit")
# X = df_thomas_z[df_thomas_z.columns[~df_thomas_z.columns.isin(["THALAMUS_1"])]]
# y = df_z['choroid_volume']
# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# imputer.fit(y)
# y = imputer.transform(y)
# imputer.fit(X)
# X = imputer.transform(X)

data = df_thomas_z.join(df_z['choroid_volume'])
data = data.dropna()
X = data[df_thomas_z.columns[~df_thomas_z.columns.isin(["THALAMUS_1"])]]
y = data['choroid_volume']



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Lasso Regression model with a specific alpha (regularization strength)
alpha = 0.1  # Adjust alpha as needed
lasso = Lasso(alpha=alpha)

# Train the model
lasso.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lasso.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Print the coefficients (note that some may be zero)
print("Coefficients:", lasso.coef_)

Mean Squared Error: 0.8716937596292225
Coefficients: [-0.         -0.          0.          0.          0.          0.
 -0.11246098 -0.0056007  -0.01334235 -0.         -0.26049365 -0.
  0.12708313 -0.          0.         -0.          0.          0.
  0.         -0.          0.          0.0011668 ]


In [28]:
check = pd.DataFrame({"coef": lasso.coef_}, index=X.columns)
check['abs'] = check['coef'].abs()
check = check.sort_values(by="abs", ascending=False)
check

Unnamed: 0,coef,abs
12-MD-Pf,-0.26,0.26
14-MTT,0.127,0.127
8-Pul,-0.112,0.112
10-MGN,-0.013,0.013
9-LGN,-0.006,0.006
34-Amy,0.001,0.001
27-Cau,0.0,0.0
33-GP,0.0,0.0
32-RN,-0.0,0.0
31-Put,0.0,0.0


In [43]:
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split

data = df_thomas_z.join(df_z['choroid_volume'])
data = data.dropna()
X = data[df_thomas_z.columns[~df_thomas_z.columns.isin(["1-THALAMUS"])]]
y = data['choroid_volume']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

regr = ElasticNetCV(cv=10, random_state=0)
regr.fit(X_train, y_train)
print(regr.alpha_)
print(regr.intercept_)
y_pred = regr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(mse)

0.003201317522507642
0.02900795781463406
0.7212198554831487


In [44]:
X.columns

Index(['2-AV', '4-VA', '5-VLa', '6-VLP', '7-VPL', '8-Pul', '9-LGN', '10-MGN',
       '11-CM', '12-MD-Pf', '13-Hb', '14-MTT', '26-Acc', '27-Cau', '28-Cla',
       '29-GPe', '30-GPi', '31-Put', '32-RN', '33-GP', '34-Amy'],
      dtype='object')

In [45]:
check = pd.DataFrame({"coef": regr.coef_}, index=X.columns)
check['abs'] = check['coef'].abs()
check = check.sort_values(by="abs", ascending=False)
check

Unnamed: 0,coef,abs
12-MD-Pf,-0.331,0.331
8-Pul,-0.229,0.229
7-VPL,0.217,0.217
14-MTT,0.163,0.163
30-GPi,0.163,0.163
6-VLP,0.152,0.152
29-GPe,-0.152,0.152
26-Acc,-0.129,0.129
27-Cau,0.116,0.116
11-CM,-0.114,0.114
