## Setup

### Imports

In [11]:
import pprint
from warnings import simplefilter

import pandas as pd
from IPython.display import Markdown, display
from statsmodels.stats.multitest import multipletests

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import json
import re
import textwrap
from collections import defaultdict
from datetime import datetime
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pyperclip
import statsmodels.api as sm
from IPython.display import clear_output
from matplotlib import colormaps
from scipy import stats
from statsmodels.genmod.families import Poisson
from pyprocessmacro import Process


from reload_recursive import reload_recursive
from statsmodels.stats.mediation import Mediation
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tqdm.notebook import tqdm
import sys

sys.path.insert(0, "/home/srs-9/Projects/ms_mri/analysis/thalamus/helpers")
from mri_data import file_manager as fm
import helpers

import regression_utils

In [6]:
# Development convenience: re-run this cell after editing the local helper
# modules on disk (reload_recursive presumably reloads the module and its
# submodules — confirm against reload_recursive's own documentation).
reload_recursive(regression_utils)
reload_recursive(helpers)
# Re-import after the reload so the names bound in this notebook refer to the
# current module contents rather than objects imported before the reload.
import helpers
from helpers import load_df, zscore, get_colors
from regression_utils import quick_regression, quick_regression2, residualize_vars

### Load Data

#### Clinical and Volumes

In [13]:
# --- Paths -----------------------------------------------------------------
drive_root = fm.get_drive_root()
dataroot = drive_root / "3Tpioneer_bids"
data_dir = Path("/home/srs-9/Projects/ms_mri/data")
fig_path = Path(
    "/home/srs-9/Projects/ms_mri/analysis/thalamus/figures_tables/3_choroid_thalamus"
)
# mkdir(exist_ok=True) replaces the exists()/makedirs() pair: same result,
# without the window in which the directory could appear between the two calls.
fig_path.mkdir(parents=True, exist_ok=True)

# --- Clinical table joined with the HIPS-THOMAS volumes ---------------------
df = load_df()
df_thomas = helpers.load_hipsthomas(data_dir)

data = df.join(df_thomas)
data = helpers.composite_vars(data)

#! See suggestions from assumption_checks.ipynb
# TODO It would be helpful if the transformed variable name was general so I
# TODO     wouldn't have to remember which transform was applied to each
transformations = {
    "LV": "log",
    "thirdV": "log",
    "fourthV": "log",
    "asegCSF": "log",
    "t2lv": "log",
    "PRL": "log1p",
    "CCF": "log",
    "CCF0": "log",
    "periCSF_ratio": "log",
    "periCSF_frac": "square",
}
data = helpers.transform_variables(data, transformations)
# NOTE(review): dataT is derived from the already-transformed `data` and is
# created BEFORE the x1000 rescaling below, so (assuming transform_variables
# returns a new frame) dataT keeps the unscaled volumes — confirm intended.
dataT = helpers.transform_variables(data, transformations, rename=False)


# these corrections should ultimately be made to the csv file
for struct in ["brain", "white", "grey", "thalamus", "t2lv"]:
    data[struct] = data[struct] * 1000

# --- Plotting palettes -------------------------------------------------------
viridis = colormaps["viridis"].resampled(20)
colors = helpers.get_colors()

# --- Boolean row masks by diagnosis group (indexed like `data`) --------------
MS_patients = data["dz_type2"] == "MS"
NONMS_patients = data["dz_type2"] == "!MS"
NIND_patients = data["dz_type5"] == "NIND"
OIND_patients = data["dz_type5"] == "OIND"
RMS_patients = data["dz_type5"] == "RMS"
PMS_patients = data["dz_type5"] == "PMS"


# --- HIPS-THOMAS structure indices -------------------------------------------
thalamic_nuclei = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12]
deep_grey = [13, 14, 26, 27, 28, 29, 30, 31, 32]

thalamic_nuclei_str = [str(i) for i in thalamic_nuclei]

# Single definition of the lookup-table location; resolves to the same file
# the two hard-coded absolute paths pointed at.
hips_thomas_index_csv = data_dir / "hipsthomas_struct_index.csv"
# index -> struct name
hips_thomas_ref = pd.read_csv(hips_thomas_index_csv, index_col="index")["struct"]
# struct name -> index
hips_thomas_invref = pd.read_csv(hips_thomas_index_csv, index_col="struct")["index"]

### Functions

#### Regression Functions

In [13]:
def run_regressions(
    model_data: pd.DataFrame,
    outcome: str,
    predictors: list[str],
    covariates: list[str] = None,
    robust_cov: str = "HC3",
    fdr_method: str = "fdr_bh",
    fdr_alpha: float = 0.05,
):
    if covariates is None:
        covariates = []
        
    def _get_val_by_name(obj, name, attr):
        import numpy as np

        vals = getattr(obj, attr)
        # pandas Series (has .get)
        if hasattr(vals, "get"):
            return vals.get(name, np.nan)
        # numpy array / list-like: map via model exog names
        try:
            exog_names = list(obj.model.exog_names)
        except Exception:
            exog_names = []
        if name in exog_names:
            idx = exog_names.index(name)
            try:
                return np.asarray(vals)[idx]
            except Exception:
                return np.nan
        return np.nan

    results = {}
    models = {}
    for predictor in predictors:
        independent_vars = [predictor] + covariates
        formula = f"{outcome} ~ {" + ".join(independent_vars)}"
        model = sm.OLS.from_formula(formula, model_data).fit()

        if robust_cov:
            rres = model.get_robustcov_results(cov_type=robust_cov)
        else:
            rres = model
        
        # confidence interval: conf_int() returns DataFrame when names available
        ci_df = rres.conf_int()
        if hasattr(ci_df, "loc") and predictor in ci_df.index:
            llci, ulci = float(ci_df.loc[predictor, 0]), float(ci_df.loc[predictor, 1])
        else:
            # fallback via exog_names -> index
            try:
                exog_names = list(rres.model.exog_names)
                idx = exog_names.index(predictor)
                ci_arr = np.asarray(ci_df)
                llci, ulci = float(ci_arr[idx, 0]), float(ci_arr[idx, 1])
            except Exception:
                llci = ulci = np.nan

        ci_str = f"[{llci:.3}, {ulci:.3}]" if not np.isnan(llci) else ""
        results[predictor] = {
            "beta": _get_val_by_name(rres, predictor, "params"),
            "p_fdr": None,
            "se": _get_val_by_name(rres, predictor, "bse"),
            "llci": llci,
            "ulci": ulci, 
            "ci_str": ci_str,
            "p_raw": _get_val_by_name(rres, predictor, "pvalues"),
            "R2": rres.rsquared_adj,
        }
        models[predictor] = model

    results = pd.DataFrame(results).T

    fdr_method: str = "fdr_bh"
    fdr_alpha = 0.05
    _, p_fdr_vals, _, _ = multipletests(
        results["p_raw"], alpha=fdr_alpha, method=fdr_method
    )
    results["p_fdr"] = p_fdr_vals

    return results


def run_regressions_refactored(
    model_data: pd.DataFrame,
    outcomes,
    predictors,
    covariates: list = [],
    robust_cov: str = "HC3",
    fdr_method: str = "fdr_bh",
    fdr_alpha: float = 0.05,
):
    """
    Run OLS for every (struct, predictor).
    Returns (results_by_struct, results_by_predictor)
    - results_by_struct: dict struct -> DataFrame indexed by predictor
    - results_by_predictor: dict predictor -> DataFrame indexed by struct
    Each DataFrame columns: coef, pval, se, llci, ulci, ci, R2, p_fdr, coef_sig
    """
    if covariates is None:
        covariates = []
    outcomes = list(outcomes)
    predictors = list(predictors)
    # container: per-struct dataframes
    results_by_struct = {}
    
    def _get_val_by_name(obj, name, attr):
        
        import numpy as np
        vals = getattr(obj, attr)
        # pandas Series (has .get)
        if hasattr(vals, "get"):
            return vals.get(name, np.nan)
        # numpy array / list-like: map via model exog names
        try:
            exog_names = list(obj.model.exog_names)
        except Exception:
            exog_names = []
        if name in exog_names:
            idx = exog_names.index(name)
            try:
                return np.asarray(vals)[idx]
            except Exception:
                return np.nan
        return np.nan

    for struct in outcomes:
        rows = []
        for pred in predictors:
            exog = [pred] + covariates
            formula = f"{struct} ~ {" + ".join(exog)}"
            try:
                res = sm.OLS.from_formula(formula, data=model_data).fit()
                if robust_cov:
                    rres = res.get_robustcov_results(cov_type=robust_cov)
                else:
                    rres = res

                coef = _get_val_by_name(rres, pred, "params")
                pval = _get_val_by_name(rres, pred, "pvalues")
                se = _get_val_by_name(rres, pred, "bse")

                # confidence interval: conf_int() returns DataFrame when names available
                ci_df = rres.conf_int()
                if hasattr(ci_df, "loc") and pred in ci_df.index:
                    llci, ulci = float(ci_df.loc[pred, 0]), float(ci_df.loc[pred, 1])
                else:
                    # fallback via exog_names -> index
                    try:
                        exog_names = list(rres.model.exog_names)
                        idx = exog_names.index(pred)
                        ci_arr = np.asarray(ci_df)
                        llci, ulci = float(ci_arr[idx, 0]), float(ci_arr[idx, 1])
                    except Exception:
                        llci = ulci = np.nan

                ci_str = f"[{llci:.3}, {ulci:.3}]" if not np.isnan(llci) else ""
                r2 = res.rsquared_adj
            except Exception as e:
                print(f"Error occurred while processing {pred} for {struct}: {e}")
                coef = pval = se = llci = ulci = np.nan
                ci_str = ""
                r2 = np.nan
                raise e
            rows.append(
                {
                    "predictor": pred,
                    "coef": coef,
                    "pval": pval,
                    "se": se,
                    "llci": llci,
                    "ulci": ulci,
                    "ci": ci_str,
                    "R2": r2,
                }
            )
        df_struct = pd.DataFrame(rows).set_index("predictor")
        # FDR across predictors for this struct
        pvals = df_struct["pval"].fillna(1.0).values 
        _, p_fdr_vals, _, _ = multipletests(pvals, alpha=fdr_alpha, method=fdr_method)
        df_struct.insert(2, "p_fdr", p_fdr_vals)
        df_struct["coef_sig"] = df_struct["coef"].where(df_struct["p_fdr"] < fdr_alpha, 0.0)
        results_by_struct[struct] = df_struct

    # build results_by_predictor for compatibility
    results_by_predictor = {}
    cols = next(iter(results_by_struct.values())).columns
    for pred in predictors:
        rows = []
        for struct in outcomes:
            row = results_by_struct[struct].loc[pred].to_dict()
            row["struct"] = struct
            rows.append(row)
        df_pred = pd.DataFrame(rows).set_index("struct")[cols]
        results_by_predictor[pred] = df_pred

    return results_by_struct, results_by_predictor

### Select variables and subjects to focus on

In [14]:
# MS-only rows of the transformed table: once as-is, once passed through
# zscore (with PRL listed in skip_vars).
model_data = dataT[MS_patients]
model_data_z = zscore(dataT[MS_patients], skip_vars=["PRL"])

# Columns that the cells below loop over when residualizing.
variables_ref = [
    # CSF / ventricular measures
    "LV", "CP", "periCSF", "allCSF", "thirdV", "fourthV", "asegCSF",
    "CCF", "CCF0", "periCSF_ratio", "periCSF_frac", "thirdV_width",
    # thalamus and grouped nuclei
    "THALAMUS_1", "medial", "posterior", "ventral", "anterior",
    # lesion load and global volumes
    "t2lv_log", "brain", "white", "grey", "PRL",
]

## Analysis

In [15]:
# Start from a copy of the z-scored MS table, then overwrite every reference
# variable with the output of helpers.residualize_structs for the covariates
# age, Female and tiv (presumably the residuals — confirm in helpers).
# NOTE(review): the residualization input is the un-standardised `model_data`,
# while the untouched columns of resid_data come from `model_data_z` — confirm
# that mixing the two scalings is intended.
resid_data = model_data_z.copy()
for var in variables_ref:
    resid_data[var] = helpers.residualize_structs(model_data, var, ["age", "Female", "tiv"])

In [26]:
# Grouped thalamic nuclei, each regressed on one predictor at a time while
# controlling for whole-thalamus volume (THALAMUS_1), age, sex and TIV.
outcomes = [
    "medial",
    "posterior",
    "ventral",
    "anterior"
]

predictor = ["CP", "LV", "brain", "white", "grey", "cortical_thickness", "t2lv"]
covariates = ["THALAMUS_1", "age", "Female", "tiv"]

# This calls regression_utils.run_regressions (module version, returns a
# 2-tuple), not the run_regressions defined earlier in this notebook. Only the
# second element, keyed by predictor name, is used here.
_, results = regression_utils.run_regressions(
    model_data,
    outcomes,
    predictor,
    covariates
)

# One Excel sheet per predictor.
save_path = fig_path / "combined_nuclei_thalamus_control.xlsx"
save_cols = ["coef", "ci", "p_fdr"]
with pd.ExcelWriter(save_path) as writer:
    # Distinct loop name so the `predictor` list above is not overwritten
    # with the last sheet name.
    for predictor_name in results:
        save_results = results[predictor_name][save_cols]
        save_results.to_excel(writer, sheet_name=predictor_name)


# Show the CP models, largest absolute coefficient first.
results_to_display = results['CP']
display_order = results_to_display["coef"].abs().sort_values(ascending=False).index
# Joined outside the f-string so the cell also parses before Python 3.12.
covariate_terms = " + ".join(covariates)
display(Markdown(f"`struct ~ CP + {covariate_terms}`"))
display(results_to_display.loc[display_order, :])

`struct ~ CP + THALAMUS_1 + age + Female + tiv`

Unnamed: 0_level_0,coef,pval,p_fdr,se,llci,ulci,ci,R2,formula,coef_sig
struct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ventral,0.102191,1.346145e-10,4.711509e-10,0.01555,0.071632,0.132749,"[0.0716, 0.133]",0.871857,ventral ~ CP + THALAMUS_1 + age + Female + tiv,0.102191
posterior,-0.048061,0.01175638,0.03014032,0.019001,-0.0854,-0.010722,"[-0.0854, -0.0107]",0.903201,posterior ~ CP + THALAMUS_1 + age + Female + tiv,-0.048061
medial,-0.037124,0.0002140643,0.000749225,0.009949,-0.056675,-0.017574,"[-0.0567, -0.0176]",0.891332,medial ~ CP + THALAMUS_1 + age + Female + tiv,-0.037124
anterior,0.003093,0.3997702,0.6995979,0.00367,-0.004119,0.010305,"[-0.00412, 0.0103]",0.561576,anterior ~ CP + THALAMUS_1 + age + Female + tiv,0.0


In [27]:
# Individual thalamic nuclei, each regressed on one predictor at a time while
# controlling for whole-thalamus volume (THALAMUS_1), age, sex and TIV.
# .loc makes the label-based lookup of the nucleus indices explicit.
outcomes = hips_thomas_ref.loc[thalamic_nuclei]

predictor = ["CP", "LV", "brain", "white", "grey", "cortical_thickness", "t2lv"]
covariates = ["THALAMUS_1", "age", "Female", "tiv"]

# This calls regression_utils.run_regressions (module version, returns a
# 2-tuple), not the run_regressions defined earlier in this notebook. Only the
# second element, keyed by predictor name, is used here.
_, results = regression_utils.run_regressions(
    model_data,
    outcomes,
    predictor,
    covariates
)

# One Excel sheet per predictor.
save_path = fig_path / "all_nuclei_thalamus_control.xlsx"
save_cols = ["coef", "ci", "p_fdr"]
with pd.ExcelWriter(save_path) as writer:
    # Distinct loop name so the `predictor` list above is not overwritten
    # with the last sheet name.
    for predictor_name in results:
        save_results = results[predictor_name][save_cols]
        save_results.to_excel(writer, sheet_name=predictor_name)


# Show the CP models, largest absolute coefficient first.
results_to_display = results['CP']
display_order = results_to_display["coef"].abs().sort_values(ascending=False).index
# Joined outside the f-string so the cell also parses before Python 3.12.
covariate_terms = " + ".join(covariates)
display(Markdown(f"`struct ~ CP + {covariate_terms}`"))
display(results_to_display.loc[display_order, :])

`struct ~ CP + THALAMUS_1 + age + Female + tiv`

Unnamed: 0_level_0,coef,pval,p_fdr,se,llci,ulci,ci,R2,formula,coef_sig
struct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
VLP_6,0.051037,6.509353e-06,2.278273e-05,0.011188,0.029051,0.073023,"[0.0291, 0.073]",0.817064,VLP_6 ~ CP + THALAMUS_1 + age + Female + tiv,0.051037
Pul_8,-0.038925,0.0299022,0.1046577,0.017871,-0.074044,-0.003807,"[-0.074, -0.00381]",0.894306,Pul_8 ~ CP + THALAMUS_1 + age + Female + tiv,0.0
MD_Pf_12,-0.029814,0.001550602,0.005427107,0.009364,-0.048215,-0.011413,"[-0.0482, -0.0114]",0.871944,MD_Pf_12 ~ CP + THALAMUS_1 + age + Female + tiv,-0.029814
VA_4,0.026369,4.109224e-09,1.438229e-08,0.004399,0.017725,0.035013,"[0.0177, 0.035]",0.674796,VA_4 ~ CP + THALAMUS_1 + age + Female + tiv,0.026369
VPL_7,0.018178,0.00186658,0.006533032,0.00581,0.006761,0.029595,"[0.00676, 0.0296]",0.751606,VPL_7 ~ CP + THALAMUS_1 + age + Female + tiv,0.018178
CM_11,-0.00731,0.007827863,0.05479504,0.002737,-0.012689,-0.001932,"[-0.0127, -0.00193]",0.62244,CM_11 ~ CP + THALAMUS_1 + age + Female + tiv,0.0
LGN_9,-0.006629,0.0855337,0.1754723,0.003847,-0.014188,0.000931,"[-0.0142, 0.000931]",0.558797,LGN_9 ~ CP + THALAMUS_1 + age + Female + tiv,0.0
VLa_5,0.006607,0.0002241412,0.000784494,0.001776,0.003116,0.010097,"[0.00312, 0.0101]",0.587137,VLa_5 ~ CP + THALAMUS_1 + age + Female + tiv,0.006607
AV_2,0.003093,0.3997702,0.6995979,0.00367,-0.004119,0.010305,"[-0.00412, 0.0103]",0.561576,AV_2 ~ CP + THALAMUS_1 + age + Female + tiv,0.0
MGN_10,-0.002507,0.01898802,0.04430539,0.001065,-0.0046,-0.000414,"[-0.0046, -0.000414]",0.705842,MGN_10 ~ CP + THALAMUS_1 + age + Female + tiv,-0.002507


### Residualize THALAMUS_1 from everything

In [31]:
# Rebuild model_data from `data` (not `dataT`) for the MS rows and z-score it.
# NOTE(review): this rebinds `model_data`, so the regression cells above will
# run on this residualized table if they are re-executed after this cell.
model_data = zscore(data[MS_patients].copy())
# Every reference variable except THALAMUS_1 itself.
variables = [var for var in variables_ref if var != "THALAMUS_1"]

# Overwrite each variable with the output of helpers.residualize_structs for
# the single covariate THALAMUS_1 (presumably the residuals — confirm in helpers).
for var in variables:
    model_data[var] = helpers.residualize_structs(model_data, var, ["THALAMUS_1"])

In [33]:
# Inspect the LV_log column of model_data. "LV_log" is not listed in
# `variables`, so the residualization loop did not overwrite this column.
model_data['LV_log']

subid
1001    0.220589
1002   -0.386234
1003    0.887861
1004   -1.975279
1005   -0.111679
          ...   
3016   -0.716786
3017   -0.642087
3021    0.407933
3023    1.081607
3028    1.189331
Name: LV_log, Length: 468, dtype: float64