In [13]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
import os
from reload_recursive import reload_recursive
from pyprocessmacro import Process
from statsmodels.stats.mediation import Mediation
from statsmodels.miscmodels.ordinal_model import OrderedModel
import sys
import re

from mri_data import file_manager as fm
import helpers

In [2]:
# Refresh the local `helpers` module in the running kernel.
# NOTE(review): presumably this re-imports helpers and its submodules so that
# on-disk edits are picked up without a kernel restart; confirm against
# reload_recursive's implementation.
reload_recursive(helpers)

### Paths and Config

In [3]:
# --- Locations -------------------------------------------------------------
drive_root = fm.get_drive_root()
# NOTE(review): hard-coded absolute path to the project checkout.
msmri_home = Path("/home/srs-9/Projects/ms_mri")
msmri_datadir = msmri_home.joinpath("data")
# The notebook's own input CSVs are resolved relative to the kernel's
# current working directory.
curr_dir = Path.cwd()
data_dir = curr_dir.joinpath("data0")

# --- Display / plotting settings --------------------------------------------
showfigs = False
pd.set_option("display.precision", 3)
colors = helpers.get_colors()

### Load and Prepare Data

In [26]:
# Read the volume table (one row per subid) and rewrite its column names:
# every "<digits>-<word>" run becomes "<word>_<digits>", then any hyphen that
# is left is turned into an underscore (e.g. the printed results below show
# names such as "Pul_8" and "MD_12_Pf").
df_thomas = pd.read_csv(data_dir / "hipsthomas_vols.csv", index_col="subid")
new_colnames = {
    original: re.sub(r"(\d+)-(\w+)", r"\2_\1", original).replace("-", "_")
    for original in df_thomas.columns
}
df_thomas = df_thomas.rename(columns=new_colnames)

# Column-wise z-scores; NaN entries are left out of each column's statistics.
df_thomas_z = df_thomas.apply(stats.zscore, nan_policy="omit")

In [None]:
# Main per-subject table, indexed by subid. df_full is a copy taken right
# after loading, before any column is added or dropped.
df = pd.read_csv(data_dir / "t1_aschoplex_data_full.csv", index_col="subid")
df_full = df.copy()

# Bring in the pineal and pituitary volumes from the second table; the
# values are aligned to df on the shared subid index.
df_tmp = pd.read_csv(data_dir / "t1_data_full.csv", index_col="subid")
df = df.assign(
    pineal_volume=df_tmp["pineal_volume"],
    pituitary_volume=df_tmp["pituitary_volume"],
)

# NOTE(review): presumably adds the "edss_mri_delta" column selected further
# down; confirm in helpers.get_mri_edss_delta.
df = helpers.get_mri_edss_delta(df)

# Columns carried forward into the analysis frame. The order given here is
# the column order of df from this point on.
keep_cols = [
    "subject", "age", "sex", "ms_type", "dzdur",
    "extracted_EDSS", "MSSS", "gMSSS", "ARMSS",
    "edss_mri_delta", "edss_date_closest", "mri_date_closest",
    "DMT_score", "DMT_hx_all", "TER", "DMF", "NAT", "INF",
    "flair_contrast",
    "thalamus", "brain", "white", "grey", "cortical_thickness",
    "lesion_count", "lesion_vol_cubic", "PRL",
    "tiv", "choroid_volume", "pineal_volume", "pituitary_volume",
]
# Raises KeyError if any listed column is missing from df.
df = df[keep_cols]

# Variables handed to the centering and scaling helpers at the end of the
# pipeline below.
vars_to_center = [
    "EDSS_sqrt", "t2lv_logtrans", "t2lv", "dzdur", "choroid_volume", "PRL",
]
vars_to_scale = [
    "age", "dzdur", "lesion_count", "t2lv", "t2lv_logtrans",
    "PRL", "tiv", "choroid_volume", "thalamus",
]

# Run the helper steps in their fixed order. Each step receives the frame
# returned by the previous one.
# NOTE(review): the "_sqrt" / "_logtrans" names used later suggest that the
# transform helpers add suffixed columns; confirm in helpers.
df = (
    df.pipe(helpers.set_dz_type5)
    .pipe(helpers.set_dz_type3)
    .pipe(helpers.set_dz_type2)
    .pipe(helpers.fix_edss)
    .pipe(helpers.clean_df)
    .pipe(helpers.set_has_prl)
    .pipe(helpers.norm_volumes)
    # Short names used by everything downstream.
    .rename(columns={"lesion_vol_cubic": "t2lv", "extracted_EDSS": "EDSS"})
    .pipe(helpers.do_sqrt_transform, ["EDSS", "MSSS", "ARMSS", "gMSSS"])
    .pipe(helpers.do_log_transform, ["t2lv"])
    .pipe(helpers.do_center, vars_to_center)
    .pipe(helpers.do_scale, vars_to_scale)
)

# Columns treated as numeric in the analyses; they are z-scored later on.
numeric_vars = [
    "age", "dzdur", "Female",
    "EDSS", "EDSS_sqrt",
    "MSSS", "MSSS_sqrt",
    "gMSSS", "gMSSS_sqrt",
    "ARMSS", "ARMSS_sqrt",
    "edss_mri_delta",
    "DMT_score", "DMT_hx_all", "TER", "DMF", "NAT", "INF",
    "thalamus", "brain", "white", "grey", "cortical_thickness",
    "lesion_count",
    "t2lv", "t2lv_logtrans",
    "PRL", "tiv",
    "choroid_volume", "norm_choroid_volume",
    "pineal_volume", "pituitary_volume",
]

# Cast every listed column to float in a single pass; columns not listed keep
# their dtype. A missing column raises KeyError.
df = df.astype({name: "float" for name in numeric_vars})

# Rows whose dz_type2 is "MS".
df_ms = df[df["dz_type2"].eq("MS")]
# Temporary copy, kept until the remaining df_scale references are renamed.
df_scale = df.copy()
df_scale_ms = df_scale[df["dz_type2"].eq("MS")]

# Same layout as df (all columns, same order), with the numeric variables
# replaced by their z-scores. NaNs are ignored when computing the statistics.
df_z = df.copy()
df_z[numeric_vars] = (
    df[numeric_vars].astype("float").apply(stats.zscore, nan_policy="omit")
)

# Legacy frames: delete once every later reference has been updated.
data = df[numeric_vars].astype("float")
data_z = data.apply(stats.zscore, nan_policy="omit")

# Numeric variables for the rows whose dz_type5 is RMS or PMS, z-scored
# within that subset.
data_ms = df.loc[df["dz_type5"].isin(["RMS", "PMS"]), numeric_vars].astype("float")
data_ms_z = data_ms.apply(stats.zscore, nan_policy="omit")

In [29]:
# One OLS model per df_thomas column:
#     <column> ~ choroid_volume + age + Female + tiv
# fitted on the z-scored frames joined on their index. Only the
# choroid_volume coefficient and its p-value are kept from each fit.
model_data = df_z.join(df_thomas_z)
outcomes = df_thomas.columns
covariates = "age + Female + tiv"

pvals = {}
coefs = {}

for outcome in outcomes:
    formula = f"{outcome} ~ choroid_volume + {covariates}"
    res = sm.OLS.from_formula(formula, data=model_data).fit()
    coefs[outcome], pvals[outcome] = (
        res.params['choroid_volume'],
        res.pvalues['choroid_volume'],
    )

# Collect the per-outcome results, add Benjamini-Hochberg adjusted p-values
# across all outcomes, and order the table from the most negative
# coefficient upwards.
regression_results = (
    pd.DataFrame({"coef": coefs, "pvals": pvals})
    .assign(
        p_fdr=lambda tbl: stats.false_discovery_control(tbl['pvals'], method='bh')
    )
    .sort_values(by="coef", ascending=True)
)
print(regression_results)

             coef      pvals      p_fdr
Pul_8      -0.427  2.542e-24  3.417e-23
MD_12_Pf   -0.425  3.106e-24  3.417e-23
THALAMUS_1 -0.388  6.512e-23  4.775e-22
CM_11      -0.363  1.207e-18  6.641e-18
LGN_9      -0.360  7.602e-15  2.787e-14
MGN_10     -0.329  4.997e-16  2.199e-15
Cla_28     -0.295  8.555e-14  2.689e-13
VPL_7      -0.247  4.727e-09  1.300e-08
AV_2       -0.228  1.171e-07  2.342e-07
Acc_26     -0.220  3.502e-08  8.562e-08
Put_31     -0.210  4.706e-08  1.035e-07
VLP_6      -0.210  1.368e-07  2.508e-07
Hb_13      -0.201  2.096e-05  3.294e-05
RN_32      -0.195  3.324e-07  5.625e-07
GPe_29     -0.089  2.461e-02  3.610e-02
GP_33      -0.084  3.264e-02  4.488e-02
VLa_5      -0.078  4.736e-02  5.840e-02
VA_4       -0.075  4.778e-02  5.840e-02
Amy_34     -0.059  2.051e-01  2.324e-01
GPi_30     -0.052  2.112e-01  2.324e-01
Cau_27     -0.049  2.286e-01  2.395e-01
MTT_14      0.027  5.296e-01  5.296e-01


In [37]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

In [41]:
# Lasso regression of z-scored choroid volume on the z-scored df_thomas
# columns (every column except THALAMUS_1).
# An earlier draft of this cell mean-imputed missing values with
# SimpleImputer; here, rows with any missing value are dropped instead.
# NOTE(review): this rebinds `data`, a name the data-preparation cell also
# assigns; cells that run afterwards see this version.
data = df_thomas_z.join(df_z['choroid_volume']).dropna()
X = data[[name for name in df_thomas_z.columns if name != "THALAMUS_1"]]
y = data['choroid_volume']

# 80/20 split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Regularization strength of the L1 penalty; adjust as needed.
alpha = 0.1
lasso = Lasso(alpha=alpha).fit(X_train, y_train)

# Held-out error, followed by the fitted coefficients (some may be exactly
# zero).
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print("Coefficients:", lasso.coef_)

Mean Squared Error: 0.8716967830986361
Coefficients: [-0.          0.          0.          0.          0.         -0.11244972
 -0.00559917 -0.01333803 -0.         -0.26050745 -0.          0.12708231
 -0.          0.         -0.          0.          0.          0.
 -0.          0.          0.00116607]


In [47]:
# Lasso coefficients labelled by feature name, largest magnitude first.
check = (
    pd.DataFrame({"coef": lasso.coef_}, index=X.columns)
    .assign(abs=lambda tbl: tbl["coef"].abs())
    .sort_values(by="abs", ascending=False)
)
check

Unnamed: 0,coef,abs
MD_12_Pf,-0.261,0.261
MTT_14,0.127,0.127
Pul_8,-0.112,0.112
MGN_10,-0.013,0.013
LGN_9,-0.006,0.006
Amy_34,0.001,0.001
VPL_7,0.0,0.0
VLa_5,0.0,0.0
VA_4,0.0,0.0
AV_2,-0.0,0.0


In [39]:
# Inspect the predictor columns: every df_thomas_z column except THALAMUS_1.
df_thomas_z.columns[df_thomas_z.columns != "THALAMUS_1"]

Index(['AV_2', 'VA_4', 'VLa_5', 'VLP_6', 'VPL_7', 'Pul_8', 'LGN_9', 'MGN_10',
       'CM_11', 'MD_12_Pf', 'Hb_13', 'MTT_14', 'Acc_26', 'Cau_27', 'Cla_28',
       'GPe_29', 'GPi_30', 'Put_31', 'RN_32', 'GP_33', 'Amy_34'],
      dtype='object')