In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
import os
from reload_recursive import reload_recursive
from pyprocessmacro import Process
from statsmodels.stats.mediation import Mediation
from statsmodels.miscmodels.ordinal_model import OrderedModel
import sys
import re

from mri_data import file_manager as fm
# import helpers

In [3]:
# reload_recursive(helpers)

### Paths and Config

In [None]:
# Locations and display options used by the rest of the notebook.
drive_root = fm.get_drive_root()
# The project checkout defaults to the original hard-coded location; set the
# MS_MRI_HOME environment variable to run on another machine without editing this cell.
msmri_home = Path(os.environ.get("MS_MRI_HOME", "/home/srs-9/Projects/ms_mri"))
msmri_datadir = msmri_home / "data"
# Input CSVs are read from ./data0 relative to the directory the kernel was started in.
curr_dir = Path(os.getcwd())
data_dir = curr_dir / "data0"
showfigs = False
pd.options.display.precision = 3
# Disabled because the `helpers` import at the top of the notebook is commented out.
# colors = helpers.get_colors()

NameError: name 'helpers' is not defined

### Load and Prepare Data

In [None]:
# Read hipsthomas_vols.csv with the "subid" column as the row index.
df_thomas = pd.read_csv(data_dir.joinpath("hipsthomas_vols.csv"), index_col="subid")
# Keep the column labels exactly as they appear in the CSV; later cells use them
# to label results after the working copy has been renamed.
cols_orig = df_thomas.columns

In [None]:
df_thomas = pd.read_csv(data_dir.joinpath("hipsthomas_vols.csv"), index_col="subid")

# Turn labels of the form "<number>-<name>" into "<name>_<number>" and replace any
# remaining hyphens with underscores (e.g. "12-MD-Pf" -> "MD_Pf_12").
new_colnames = {
    col: re.sub(r"(\d+)-([\w-]+)", r"\2_\1", col).replace("-", "_")
    for col in df_thomas.columns
}
df_thomas = df_thomas.rename(columns=new_colnames)

# Each volume as a fraction of the same row's THALAMUS_1 value.
df_thomas_norm = df_thomas.div(df_thomas['THALAMUS_1'], axis=0)

# Column-wise z-scores of the raw and the normalised volumes; NaNs are skipped.
df_thomas_z = df_thomas.apply(stats.zscore, nan_policy="omit")
df_thomas_norm_z = df_thomas_norm.apply(stats.zscore, nan_policy="omit")

In [None]:
# Sanity check for one subject: "1-THALAMUS" volume minus the summed volumes of the
# structures listed below.
# NOTE(review): this re-reads the CSV into `df_thomas`, replacing the renamed frame
# built earlier with one that carries the raw "<number>-<name>" labels. Any later cell
# that reads `df_thomas` gets the raw labels when the notebook is run top to bottom.
subj = 1001
df_thomas = pd.read_csv(data_dir.joinpath("hipsthomas_vols.csv"), index_col="subid")
structs_to_add = [
    "2-AV", "4-VA", "5-VLa", "6-VLP", "7-VPL", "9-LGN", "10-MGN",
    "11-CM", "12-MD-Pf", "13-Hb", "14-MTT", "26-Acc", "27-Cau", "32-RN",
]
# NOTE(review): "8-Pul" is absent from the list while "26-Acc", "27-Cau" and "32-RN"
# are included - confirm this is the intended set of structures.
df_thomas.loc[subj, "1-THALAMUS"] - df_thomas.loc[subj, structs_to_add].sum()

-1961.495366000001

In [None]:
# Show the "1-THALAMUS" value for the same subject; requires `df_thomas` to still
# carry the raw CSV column labels.
df_thomas.loc[subj, "1-THALAMUS"]

9065.068359

In [48]:
# Main table, indexed by subid; df_full keeps an untouched copy of it as loaded.
df = pd.read_csv(data_dir.joinpath("t1_aschoplex_data_full.csv"), index_col="subid")
df_full = df.copy()

# Copy the pineal and pituitary volumes over from the second table (aligned on the index).
df_tmp = pd.read_csv(data_dir.joinpath("t1_data_full.csv"), index_col="subid")
df = df.assign(
    pineal_volume=df_tmp['pineal_volume'],
    pituitary_volume=df_tmp['pituitary_volume'],
)

# NOTE(review): `helpers` is only imported in a commented-out line in the import cell,
# so on a fresh kernel this call raises NameError until that import is restored.
df = helpers.get_mri_edss_delta(df)

# Columns carried forward into the analysis, in output order; everything else is dropped.
keep_cols = [
    "subject", "age", "sex", "ms_type", "dzdur",
    "extracted_EDSS", "MSSS", "gMSSS", "ARMSS",
    "edss_mri_delta", "edss_date_closest", "mri_date_closest",
    "DMT_score", "DMT_hx_all", "TER", "DMF", "NAT", "INF",
    "flair_contrast", "thalamus", "brain", "white", "grey",
    "cortical_thickness", "lesion_count", "lesion_vol_cubic", "PRL", "tiv",
    "choroid_volume", "pineal_volume", "pituitary_volume",
]
df = df[keep_cols]

# Run the helper steps in their fixed order, rename the two columns that get shorter
# names, then add the sqrt- and log-transformed variables. Each helper receives the
# frame returned by the previous step.
df = (
    df.pipe(helpers.set_dz_type5)
    .pipe(helpers.set_dz_type3)
    .pipe(helpers.set_dz_type2)
    .pipe(helpers.fix_edss)
    .pipe(helpers.clean_df)
    .pipe(helpers.set_has_prl)
    .pipe(helpers.norm_volumes)
    .rename(columns={"lesion_vol_cubic": "t2lv", "extracted_EDSS": "EDSS"})
    .pipe(helpers.do_sqrt_transform, ["EDSS", "MSSS", "ARMSS", "gMSSS"])
    .pipe(helpers.do_log_transform, ["t2lv"])
)

# Variables passed to the centering helper.
vars_to_center = ["EDSS_sqrt", "t2lv_logtrans", "t2lv", "dzdur", "choroid_volume", "PRL"]
df = helpers.do_center(df, vars_to_center)

# Variables passed to the scaling helper (applied after centering).
vars_to_scale = [
    "age", "dzdur", "lesion_count",
    "t2lv", "t2lv_logtrans",
    "PRL", "tiv", "choroid_volume", "thalamus",
]
df = helpers.do_scale(df, vars_to_scale)

# Columns treated as numeric in everything that follows (cast, z-scoring, modelling).
numeric_vars = [
    "age", "dzdur", "Female",
    "EDSS", "EDSS_sqrt",
    "MSSS", "MSSS_sqrt",
    "gMSSS", "gMSSS_sqrt",
    "ARMSS", "ARMSS_sqrt",
    "edss_mri_delta",
    "DMT_score", "DMT_hx_all", "TER", "DMF", "NAT", "INF",
    "thalamus", "brain", "white", "grey", "cortical_thickness",
    "lesion_count",
    "t2lv", "t2lv_logtrans",
    "PRL", "tiv",
    "choroid_volume", "norm_choroid_volume",
    "pineal_volume", "pituitary_volume",
]

# Cast every numeric column to float in one pass; the other columns are left as they are.
df = df.astype({name: "float" for name in numeric_vars})

# Rows whose dz_type2 is "MS".
df_ms = df[df['dz_type2'].eq("MS")]
df_scale = df.copy()  # temporary until df_scale is renamed everywhere
df_scale_ms = df_scale[df['dz_type2'].eq("MS")]

# Same columns and column order as df, with the numeric columns replaced by their
# z-scores (NaNs skipped); non-numeric columns are carried over unchanged.
df_z = df.copy()
df_z[numeric_vars] = df[numeric_vars].astype("float").apply(stats.zscore, nan_policy="omit")

# delete these vars once all later variable references are fixed
data = df[numeric_vars].astype("float")
data_z = data.apply(stats.zscore, nan_policy="omit")

# Numeric columns for rows whose dz_type5 is "RMS" or "PMS", z-scored within that subset.
data_ms = df.loc[df["dz_type5"].isin(["RMS", "PMS"]), numeric_vars].astype("float")
data_ms_z = data_ms.apply(stats.zscore, nan_policy="omit")

In [49]:
# One OLS model per structure: thalamus-normalised, z-scored volume regressed on
# choroid_volume plus the covariates, using the z-scored clinical table.
model_data = df_z.join(df_thomas_norm_z)
# Take the outcome names from the frame that was actually joined into model_data.
# Other cells re-read `df_thomas` with its raw hyphenated labels (e.g. "2-AV"); those
# labels are not columns of model_data and a formula would parse "2-AV" as arithmetic.
outcomes = df_thomas_norm_z.columns[~df_thomas_norm_z.columns.isin(["THALAMUS_1"])]
covariates = "age + Female + tiv"

# Original CSV labels for the report table. They are matched to the outcomes by
# position, which relies on the renamed columns keeping the CSV's column order.
cols_orig2 = cols_orig[~cols_orig.isin(["1-THALAMUS"])]

pvals = {}
coefs = {}

for outcome in outcomes:
    formula = f"{outcome} ~ choroid_volume + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[outcome] = res.pvalues['choroid_volume']
    coefs[outcome] = res.params['choroid_volume']

# Collect the choroid_volume coefficient and p-value per structure, add
# Benjamini-Hochberg adjusted p-values, and sort by coefficient.
regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results.insert(0, "structure", cols_orig2)
regression_results = regression_results.sort_values(by="coef", ascending=True)
print(regression_results)

         structure   coef      pvals      p_fdr
Pul_8        8-Pul -0.347  1.721e-13  6.022e-13
LGN_9        9-LGN -0.169  1.017e-03  1.525e-03
MD_Pf_12  12-MD-Pf -0.130  9.174e-03  1.284e-02
AV_2          2-AV -0.114  1.312e-02  1.621e-02
CM_11        11-CM -0.004  9.303e-01  9.303e-01
Cla_28      28-Cla  0.070  1.520e-01  1.596e-01
Hb_13        13-Hb  0.081  9.988e-02  1.104e-01
VPL_7        7-VPL  0.119  1.537e-02  1.793e-02
MGN_10      10-MGN  0.123  1.244e-02  1.621e-02
Acc_26      26-Acc  0.227  1.767e-06  2.854e-06
Amy_34      34-Amy  0.286  9.178e-10  1.606e-09
GPi_30      30-GPi  0.299  3.109e-13  9.326e-13
Put_31      31-Put  0.308  3.760e-11  7.178e-11
RN_32        32-RN  0.309  2.867e-11  6.020e-11
GPe_29      29-GPe  0.322  9.869e-13  2.591e-12
GP_33        33-GP  0.322  1.298e-13  5.451e-13
Cau_27      27-Cau  0.338  3.422e-12  7.985e-12
MTT_14      14-MTT  0.355  9.418e-15  6.328e-14
VLP_6        6-VLP  0.357  1.205e-14  6.328e-14
VLa_5        5-VLa  0.416  2.999e-19  3.

In [52]:
# One OLS model per structure: z-scored raw volume regressed on choroid_volume, with
# whole-thalamus volume (THALAMUS_1) added to the covariates instead of dividing by it.
model_data = df_z.join(df_thomas_z)
# Take the outcome names from the frame that was actually joined into model_data.
# Other cells re-read `df_thomas` with its raw hyphenated labels (e.g. "2-AV"); those
# labels are not columns of model_data and a formula would parse "2-AV" as arithmetic.
outcomes = df_thomas_z.columns[~df_thomas_z.columns.isin(["THALAMUS_1"])]
covariates = "age + Female + tiv + THALAMUS_1"

# Original CSV labels for the report table. They are matched to the outcomes by
# position, which relies on the renamed columns keeping the CSV's column order.
cols_orig2 = cols_orig[~cols_orig.isin(["1-THALAMUS"])]

pvals = {}
coefs = {}

for outcome in outcomes:
    formula = f"{outcome} ~ choroid_volume + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    pvals[outcome] = res.pvalues['choroid_volume']
    coefs[outcome] = res.params['choroid_volume']

# Collect the choroid_volume coefficient and p-value per structure, add
# Benjamini-Hochberg adjusted p-values, and sort by coefficient.
regression_results = pd.DataFrame({"coef": coefs, "pvals": pvals})
regression_results['p_fdr'] = stats.false_discovery_control(regression_results['pvals'], method='bh')
regression_results.insert(0, "structure", cols_orig2)
regression_results = regression_results.sort_values(by="coef", ascending=True)
print(regression_results)

         structure   coef      pvals      p_fdr
CM_11        11-CM -0.101  2.903e-03  9.276e-03
Cla_28      28-Cla -0.082  2.275e-02  5.309e-02
LGN_9        9-LGN -0.073  6.239e-02  1.310e-01
MD_Pf_12  12-MD-Pf -0.060  3.092e-03  9.276e-03
Hb_13        13-Hb -0.054  2.720e-01  3.569e-01
Pul_8        8-Pul -0.052  4.603e-03  1.208e-02
MGN_10      10-MGN -0.046  1.427e-01  2.306e-01
Put_31      31-Put -0.036  3.358e-01  4.148e-01
Acc_26      26-Acc -0.032  4.013e-01  4.435e-01
RN_32        32-RN -0.005  8.860e-01  8.860e-01
GPe_29      29-GPe  0.020  6.432e-01  6.754e-01
GP_33        33-GP  0.036  3.856e-01  4.435e-01
AV_2          2-AV  0.042  2.543e-01  3.560e-01
Amy_34      34-Amy  0.063  2.069e-01  3.104e-01
Cau_27      27-Cau  0.071  1.004e-01  1.758e-01
GPi_30      30-GPi  0.074  9.106e-02  1.738e-01
VPL_7        7-VPL  0.098  4.088e-04  1.726e-03
VLP_6        6-VLP  0.131  4.734e-08  4.970e-07
VLa_5        5-VLa  0.159  9.261e-06  6.482e-05
MTT_14      14-MTT  0.160  4.111e-04  1.

In [53]:
# Write the table currently held in `regression_results` to a CSV in the working
# directory. index=False drops the row index; the "structure" column is kept.
regression_results.to_csv("hips-thomas_regression_covar.csv", index=False)

In [22]:
# reg_results = regression_results[['structure', 'coef', 'pvals', 'p_fdr']]
# NOTE(review): when the notebook is run top to bottom, `regression_results` at this
# point is the table from the model that includes THALAMUS_1 as a covariate, i.e. the
# same data just written to "hips-thomas_regression_covar.csv". The execution count
# suggests this cell was originally run earlier, presumably after the model without
# that covariate - confirm which results this file is meant to hold.
regression_results[['structure', 'coef', 'pvals', 'p_fdr']].to_csv("hips-thomas_regression.csv", index=False)

In [26]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

In [38]:
# Reload the volume table with its raw CSV labels and rebuild the normalised and
# z-scored versions from it (this overwrites the frames of the same names built earlier).
df_thomas = pd.read_csv(data_dir.joinpath("hipsthomas_vols.csv"), index_col="subid")
df_thomas_norm = df_thomas.div(df_thomas['1-THALAMUS'], axis=0)
df_thomas_z = df_thomas.apply(stats.zscore, nan_policy="omit")
df_thomas_norm_z = df_thomas_norm.apply(stats.zscore, nan_policy="omit")

# Predictors: every normalised, z-scored structure except "1-THALAMUS".
# Target: z-scored choroid_volume. Rows with any missing value are dropped.
data = df_thomas_norm_z.drop(columns=["1-THALAMUS"])
structs = data.columns
data = data.join(df_z['choroid_volume']).dropna()
X, y = data[structs], data['choroid_volume']

# Fixed 80/20 split so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Lasso with a hand-picked regularisation strength; adjust alpha as needed.
alpha = 0.1
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)

# Held-out error, followed by the fitted coefficients (some may be exactly zero).
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print("Coefficients:", lasso.coef_)

Mean Squared Error: 0.8872474190555677
Coefficients: [-0.03822869  0.0788567   0.01934477  0.14889012  0.0629965  -0.
 -0.         -0.         -0.         -0.         -0.          0.13246998
  0.          0.03463133 -0.          0.          0.          0.
  0.          0.          0.01010236]


In [39]:
# Lasso coefficients per structure, largest magnitude first.
check = (
    pd.DataFrame({"coef": lasso.coef_}, index=X.columns)
    .assign(abs=lambda frame: frame['coef'].abs())
    .sort_values(by="abs", ascending=False)
)
check

Unnamed: 0,coef,abs
6-VLP,0.149,0.149
14-MTT,0.132,0.132
4-VA,0.079,0.079
7-VPL,0.063,0.063
2-AV,-0.038,0.038
27-Cau,0.035,0.035
5-VLa,0.019,0.019
34-Amy,0.01,0.01
11-CM,-0.0,0.0
12-MD-Pf,-0.0,0.0


In [40]:
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split

# Predictors: normalised, z-scored structures without "1-THALAMUS".
# Target: z-scored choroid_volume. Rows with any missing value are dropped.
data = df_thomas_norm_z.drop(columns=["1-THALAMUS"], errors="ignore")
structs = data.columns
data = data.join(df_z['choroid_volume']).dropna()
X, y = data[structs], data['choroid_volume']

# Same fixed 80/20 split as the Lasso cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Elastic net with the regularisation strength chosen by 10-fold cross-validation.
regr = ElasticNetCV(cv=10, random_state=0)
regr.fit(X_train, y_train)
print(regr.alpha_)
print(regr.intercept_)

# Mean squared error on the held-out rows.
y_pred = regr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(mse)

0.008443155185560358
0.032755726331618204
0.8741261424463812


In [41]:
# Elastic-net coefficients per structure, largest magnitude first.
check = (
    pd.DataFrame({"coef": regr.coef_}, index=X.columns)
    .assign(abs=lambda frame: frame['coef'].abs())
    .sort_values(by="abs", ascending=False)
)
check

Unnamed: 0,coef,abs
14-MTT,0.197,0.197
6-VLP,0.178,0.178
31-Put,0.166,0.166
29-GPe,-0.158,0.158
7-VPL,0.143,0.143
30-GPi,0.138,0.138
32-RN,-0.115,0.115
27-Cau,0.1,0.1
2-AV,-0.09,0.09
34-Amy,0.09,0.09


In [43]:
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split

# Same model as the previous elastic net, but on z-scored volumes that were not
# divided by "1-THALAMUS". Rows are dropped if any column, including "1-THALAMUS",
# is missing; that column is then left out of the predictors.
data = df_thomas_z.join(df_z['choroid_volume']).dropna()
X = data.drop(columns=["1-THALAMUS", "choroid_volume"], errors="ignore")
y = data['choroid_volume']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Elastic net with the regularisation strength chosen by 10-fold cross-validation.
regr = ElasticNetCV(cv=10, random_state=0)
regr.fit(X_train, y_train)
print(regr.alpha_)
print(regr.intercept_)

# Mean squared error on the held-out rows.
y_pred = regr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(mse)

0.003201317522507642
0.02900795781463406
0.7212198554831487


In [45]:
# Elastic-net coefficients per structure for the non-normalised model, largest magnitude first.
check = (
    pd.DataFrame({"coef": regr.coef_}, index=X.columns)
    .assign(abs=lambda frame: frame['coef'].abs())
    .sort_values(by="abs", ascending=False)
)
check

Unnamed: 0,coef,abs
12-MD-Pf,-0.331,0.331
8-Pul,-0.229,0.229
7-VPL,0.217,0.217
14-MTT,0.163,0.163
30-GPi,0.163,0.163
6-VLP,0.152,0.152
29-GPe,-0.152,0.152
26-Acc,-0.129,0.129
27-Cau,0.116,0.116
11-CM,-0.114,0.114
