In [12]:
import pandas as pd
from reload_recursive import reload_recursive
import os
from loguru import logger
from pathlib import Path
import json
import numpy as np
from tqdm.notebook import tqdm
import re
import sys

import mri_data
import monai_training

In [13]:
reload_recursive(mri_data)
reload_recursive(monai_training)

from mri_data.file_manager import DataSet, scan_3Tpioneer_bids
from mri_data import file_manager as fm
from mri_data import utils
from monai_training.preprocess import DataSetProcesser

In [14]:
logger.remove()
logger.add(sys.stderr, level="INFO")

2

In [15]:
to_load = False
drive_root = fm.get_drive_root()
msmri_home = Path("/home/srs-9/Projects/ms_mri")
inference_root = drive_root / "3Tpioneer_bids_predictions"
dataroot = drive_root / "3Tpioneer_bids"
clinical_data_root = drive_root / "Secure_Data" / "Large"
project_dataroot = msmri_home / "data"

In [16]:
clinical_data_root

PosixPath('/mnt/h/Secure_Data/Large')

In [17]:
def subject_to_subid(subject):
    if not isinstance(subject, str):
        return None
    re_match = re.match(r"ms(\d{4})", subject)
    if re_match:
        return_val = int(re_match[1])
        return return_val
    else:
        return None

### Data Cleaning

- Add column which is subid as int
- Set the index to the new column
- Remove any rows where subid is NaN
- Rename columns
- Check "Working with missing data" on pandas documentation

In [18]:
if not to_load:
    df = pd.read_csv(clinical_data_root / "Clinical_Data_All.csv")
    df = df.convert_dtypes()

    # with open(msmri_home / "data" / "clinical_data_columns_full.txt", 'r') as f:
    #     keep_columns = [line.rstrip() for line in f.readlines()]
    # df = df[keep_columns]
    new_columns = {
        "ID": "subject",
        "age_at_obs_start": "age",
    }
    df.rename(columns=new_columns, inplace=True)
    df["subid"] = df["subject"].apply(subject_to_subid)
    df.drop(df[df["subid"].isna()].index, inplace=True)
    df["subid"] = df["subid"].astype(int)
    df = df.set_index("subid")

    new_columns = {}
    for col in df.columns:
        new_columns[col] = col.replace(" ", "_")
    df.rename(columns=new_columns, inplace=True)
    df.head()

### Additional Data

- ROI size

In [19]:
if to_load:
    df = pd.read_csv(project_dataroot / "clinical_data_full.csv")

In [None]:
if not to_load:
    dataset_proc = DataSetProcesser.new_dataset(dataroot, scan_3Tpioneer_bids, filters=[fm.filter_first_ses])
    full_dataset = dataset_proc.dataset
    dataset_proc.prepare_labels(["choroid_t1_flair", "pineal", "pituitary"], ["CH", "SRS", "ED", "DT"])
    dataset = dataset_proc.dataset

    inference_dataset_proc = DataSetProcesser.new_dataset(inference_root, scan_3Tpioneer_bids, filters=[fm.filter_first_ses])
    inference_dataset_proc.prepare_labels("flair.t1_choroid_pineal_pituitary3_pred")
    inference_dataset = inference_dataset_proc.dataset

In [21]:
try:
    df.insert(7, 'tiv', None)
except ValueError:
    pass
try:
    df.insert(7, 'pituitary_volume', None)
except ValueError:
    pass
try:
    df.insert(7, 'pineal_volume', None)
except ValueError:
    pass
try:
    df.insert(7, 'choroid_volume', None)
except ValueError:
    pass

In [22]:
if not to_load:
    volumes = dict(pituitary=[], pineal=[], choroid=[], tiv=[], subid=[])
    subids = [subid for subid, _ in df.iterrows()]
    for subid, _ in tqdm(df.iterrows(), total=len(df)):
        scan = dataset.find_scan(subid=str(subid))
        if len(scan) == 0:
            scan = inference_dataset.find_scan(subid=str(subid))
        if len(scan) == 0:
            continue
        scan = scan[0]

        try:
            vol_stats = utils.compute_volume(scan.label_path, index_mask_file=scan.label_path)
        except Exception:
            continue
        try:
            roi_vols = [stat[1] for stat in vol_stats]
        except Exception:
            roi_vols = [None, None, None]

        if len(roi_vols) < 3:
            continue

        scan = full_dataset.find_scan(subid=str(subid))[0]
        try:
            tiv = utils.compute_volume(scan.root / "t1.mask.nii.gz")[1]
        except Exception:
            continue

        df.loc[subid, 'choroid_volume'] = roi_vols[0]
        df.loc[subid, 'pineal_volume'] = roi_vols[1]
        df.loc[subid, 'pituitary_volume'] = roi_vols[2]
        df.loc[subid, 'tiv'] = tiv

        volumes['choroid'].append(roi_vols[0])
        volumes['pineal'].append(roi_vols[1])
        volumes['pituitary'].append(roi_vols[2])
        volumes['tiv'].append(tiv)
        volumes['subid'].append(subid)
        
    df.to_csv(project_dataroot / "clinical_data_full.csv")

  0%|          | 0/564 [00:00<?, ?it/s]

In [13]:
df.index.name = "subject_id"
df_full = df
not_nas = (~df_full['pineal_volume'].isna() & ~df_full['choroid_volume'].isna() & ~df_full['pituitary_volume'].isna())
df = df_full.loc[not_nas, :]

### Prepare Data

- Set categorical variables
  - ms_type
  - race
  - ethnicity
  - sex
- Set edss as ordinal variable

Do I need to factorize the categorical variables if I just set their dtype as "category"?

In [14]:
# edss is ordinal variable
edss_type = pd.CategoricalDtype(categories=np.linspace(0, 10, 21), ordered=True)
df['numEDSS'] = df['numEDSS'].astype(edss_type)
df.loc[df['dzdur'] == "#VALUE!", "dzdur"] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['numEDSS'] = df['numEDSS'].astype(edss_type)


In [15]:
try:
    df.insert(8, 'norm_pituitary_volume', df['pituitary_volume']/df['tiv'])
except ValueError:
    pass
try:
    df.insert(8, 'norm_pineal_volume', df['pineal_volume']/df['tiv'])
except ValueError:
    pass
try:
    df.insert(8, 'norm_choroid_volume', df['choroid_volume']/df['tiv'])
except ValueError:
    pass

In [16]:
try:
    df.insert(18, 'dz_type', df['ms_type'])
except Exception:
    pass

df.loc[:, 'dz_type'] = df['ms_type']

df.loc[df['ms_type'] == 'CIS', 'dz_type'] = 'RRMS'
df.loc[df['ms_type'].isin(['PPMS', 'SPMS', 'RPMS', 'PRMS']), 'dz_type'] = 'PMS'
df.loc[df['ms_type'].isin(['NIND', 'OIND', 'HC']), 'dz_type'] = '!MS'

In [17]:
try:
    df.insert(18, 'dz_type2', df['dz_type'])
except Exception:
    pass

df.loc[:, 'dz_type2'] = df['dz_type']

df.loc[df['dz_type'].isin(['RMS', 'PMS']), 'dz_type2'] = 'MS'

## Statistical Analysis

### Descriptive Stats


In [18]:
print(len(df), "patients in total")
print(sum(df['dz_type'] == "RRMS") / len(df), "patients with RRMS")
print(sum(df['dz_type'] == "PMS") / len(df), "patients with PMS")
print(sum(df['dz_type'] == "!MS") / len(df), "patients w/o MS")

403 patients in total
0.6724565756823822 patients with RRMS
0.18362282878411912 patients with PMS
0.13399503722084366 patients w/o MS


In [19]:
print("Proportion of female subjects: {:0.2f}".format(sum(df['sex'] == "Female") / len(df)) )
print("Average age at scan: {:0.2f}".format(df['age'].mean()))

Proportion of female subjects: 0.78
Average age at scan: 47.91


In [20]:
df_rrms = df.loc[df["dz_type"] == "RRMS"]
print("Fraction of female RRMS patients: {:0.2f}".format(sum(df_rrms['sex'] == "Female") / len(df_rrms)))
print("Average age of RRMS patient: {:0.2f}".format(df_rrms['age'].mean()))
print("\n")

df_pms = df.loc[df["dz_type"] == "PMS"]
print("Fraction of female PMS patients: {:0.2f}".format(sum(df_pms['sex'] == "Female") / len(df_pms)))
print("Average age of PMS patient: {:0.2f}".format(df_pms['age'].mean()))
print("\n")

df_notms = df.loc[df["dz_type"] == "!MS"]
print("Fraction of female non-MS patients: {:0.2f}".format(sum(df_notms['sex'] == "Female") / len(df_notms)))
print("Average age of non-MS patient: {:0.2f}".format(df_notms['age'].mean()))

Fraction of female RRMS patients: 0.82
Average age of RRMS patient: 44.78


Fraction of female PMS patients: 0.59
Average age of PMS patient: 58.66


Fraction of female non-MS patients: 0.81
Average age of non-MS patient: 48.97


#### Volumes

In [21]:
# choroid
print("Mean choroid volumes:")
print("RRMS:   {:0.2f}".format(df_rrms['norm_choroid_volume'].mean()*1000))
print("PMS:    {:0.2f}".format(df_pms['norm_choroid_volume'].mean()*1000))
print("not-MS: {:0.2f}".format(df_notms['norm_choroid_volume'].mean()*1000))
print("\n")

# pineal
print("Mean pineal volumes:")
print("RRMS:   {:0.2f}".format(df_rrms['norm_pineal_volume'].mean()*1000))
print("PMS:    {:0.2f}".format(df_pms['norm_pineal_volume'].mean()*1000))
print("not-MS: {:0.2f}".format(df_notms['norm_pineal_volume'].mean()*1000))
print("\n")

# pituitary
print("Mean pituitary volumes:")
print("RRMS:   {:0.2f}".format(df_rrms['norm_pituitary_volume'].mean()*1000))
print("PMS:    {:0.2f}".format(df_pms['norm_pituitary_volume'].mean()*1000))
print("not-MS: {:0.2f}".format(df_notms['norm_pituitary_volume'].mean()*1000))

Mean choroid volumes:
RRMS:   1.09
PMS:    1.12
not-MS: 1.15


Mean pineal volumes:
RRMS:   0.12
PMS:    0.10
not-MS: 0.15


Mean pituitary volumes:
RRMS:   0.43
PMS:    0.43
not-MS: 0.44


### Regression Models

Looking at how volume of ROI predicts ms_type

#### statsmodels usage

- `statsmodels.regression.linear_model.OLS`
- `statsmodels.discrete.discrete_model.Logit`
- `statsmodels.discrete.discrete_model.Probit`
- `statsmodels.miscmodels.ordinal_model.OrderedModel`

Why is OrderedModel not available under `statsmodels.api` unlike all the others? Also, OrderedModel isn't available under `statsmodels.miscmodels.api` either

Linear regression MS vs not-MS

In [25]:
import statsmodels.api as sm
import numpy as np

df_test = df[df['dz_type2'].isin(['MS', '!MS'])]

df_test.loc[:, 'sex_factor'], _ = df_test['sex'].factorize()
df_test.loc[:, 'dz_type2_factor'], _ = df_test['dz_type2'].factorize()
predictors = ["age", "sex_factor", "dz_type2_factor"]
outcome = "norm_choroid_volume"

x = np.asarray(df_test[predictors]).astype(np.float64)
Y = np.asarray(df_test[outcome]).astype(np.float64)

mod = sm.OLS(Y, x, missing='drop')

res = mod.fit()

print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.891
Model:                            OLS   Adj. R-squared (uncentered):              0.889
Method:                 Least Squares   F-statistic:                              342.0
Date:                Thu, 02 Jan 2025   Prob (F-statistic):                    4.64e-60
Time:                        18:38:05   Log-Likelihood:                          821.72
No. Observations:                 128   AIC:                                     -1637.
Df Residuals:                     125   BIC:                                     -1629.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:, 'sex_factor'], _ = df_test['sex'].factorize()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:, 'dz_type2_factor'], _ = df_test['dz_type2'].factorize()


In [30]:
df_test = df[df['dz_type2'].isin(['MS', '!MS'])]

df_test.loc[:, 'sex_factor'], _ = df_test['sex'].factorize()
df_test.loc[:, 'dz_type2_factor'], _ = df_test['dz_type2'].factorize()
predictors = ["age", "sex_factor", "tiv", "pineal_volume"]
outcome = "dz_type2_factor"

x = np.asarray(df_test[predictors]).astype(np.float64)
Y = np.asarray(df_test[outcome]).astype(np.float64)

mod = sm.Logit(Y, x, missing='drop')

res = mod.fit()

print(res.summary())

Optimization terminated successfully.
         Current function value: 0.526666
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  128
Model:                          Logit   Df Residuals:                      124
Method:                           MLE   Df Model:                            3
Date:                Thu, 02 Jan 2025   Pseudo R-squ.:                  0.2265
Time:                        18:44:17   Log-Likelihood:                -67.413
converged:                       True   LL-Null:                       -87.154
Covariance Type:            nonrobust   LLR p-value:                 1.372e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -0.0768      0.019     -3.983      0.000      -0.115      -0.039
x2            -1.0867      0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:, 'sex_factor'], _ = df_test['sex'].factorize()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:, 'dz_type2_factor'], _ = df_test['dz_type2'].factorize()


In [None]:
ms_subtypes2 = ["RRMS", "PPMS", "SPMS"]
df_ms = df[df['ms_type'].isin(ms_subtypes2)]

In [None]:
df_ms.loc[:, 'sex_factor'], _ = df_ms['sex'].factorize()
df_ms.loc[:, 'ms_type_factor'], _ = df_ms['ms_type'].factorize()
predictors = ["age", "sex_factor", "ms_type_factor"]
outcome = "pineal_volume"

x = np.asarray(df_ms[predictors]).astype(np.float64)
Y = np.asarray(df_ms[outcome]).astype(np.float64)

mod = sm.OLS(Y, x, missing='drop')

res = mod.fit()

print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.688
Model:                            OLS   Adj. R-squared (uncentered):              0.683
Method:                 Least Squares   F-statistic:                              149.9
Date:                Mon, 30 Dec 2024   Prob (F-statistic):                    2.52e-51
Time:                        18:29:34   Log-Likelihood:                         -1275.1
No. Observations:                 207   AIC:                                      2556.
Df Residuals:                     204   BIC:                                      2566.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
predictors = ["tiv"]
outcome = "pituitary_volume"

x = np.asarray(df_ms[predictors]).astype(np.float64)
Y = np.asarray(df_ms[outcome]).astype(np.float64)

mod = sm.OLS(Y, x, missing='drop')

res = mod.fit()

print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.885
Model:                            OLS   Adj. R-squared (uncentered):              0.885
Method:                 Least Squares   F-statistic:                              1592.
Date:                Mon, 30 Dec 2024   Prob (F-statistic):                    7.34e-99
Time:                        18:30:39   Log-Likelihood:                         -1413.3
No. Observations:                 207   AIC:                                      2829.
Df Residuals:                     206   BIC:                                      2832.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
import numpy as np 
np.asarray(x)

array([[2.01336035e+01, 1.26161339e+03, 0.00000000e+00, 0.00000000e+00],
       [2.88807972e+01, 1.45832702e+03, 0.00000000e+00, 0.00000000e+00],
       [6.64704594e+01, 1.35252909e+03, 0.00000000e+00, 0.00000000e+00],
       [4.89432185e+01, 1.43492793e+03, 0.00000000e+00, 0.00000000e+00],
       [5.90182336e+01, 1.34423632e+03, 0.00000000e+00, 0.00000000e+00],
       [3.57882057e+01, 1.82958525e+03, 0.00000000e+00, 0.00000000e+00],
       [5.94590155e+01, 1.35972546e+03, 0.00000000e+00, 0.00000000e+00],
       [           nan, 1.41245473e+03, 0.00000000e+00, 0.00000000e+00],
       [4.23178010e+01, 1.37809810e+03, 0.00000000e+00, 0.00000000e+00],
       [6.19640804e+01, 1.59898444e+03, 1.00000000e+00, 1.00000000e+00],
       [3.74801511e+01, 1.49433394e+03, 0.00000000e+00, 0.00000000e+00],
       [7.23484641e+01, 1.26129874e+03, 0.00000000e+00, 1.00000000e+00],
       [4.23506543e+01, 1.40538667e+03, 0.00000000e+00, 0.00000000e+00],
       [4.00208071e+01, 1.40781802e+03, 0.00000000e

In [None]:
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)

x1 = 