In [1]:
import pandas as pd
from reload_recursive import reload_recursive
import os
from loguru import logger
from pathlib import Path
import json
import numpy as np
from tqdm.notebook import tqdm
import re
import sys

import mri_data
import monai_training

In [2]:
reload_recursive(mri_data)
reload_recursive(monai_training)

from mri_data.file_manager import DataSet, scan_3Tpioneer_bids
from mri_data import file_manager as fm
from mri_data import utils
from monai_training.preprocess import DataSetProcesser

In [3]:
logger.remove()
logger.add(sys.stderr, level="INFO")

1

In [4]:
to_load = False
drive_root = fm.get_drive_root()
msmri_home = Path("/home/srs-9/Projects/ms_mri")
inference_root = drive_root / "3Tpioneer_bids_predictions"
dataroot = drive_root / "3Tpioneer_bids"
clinical_data_root = drive_root / "Secure_Data" / "Large"
project_dataroot = msmri_home / "data"

In [5]:
clinical_data_root

PosixPath('/mnt/h/Secure_Data/Large')

In [6]:
def subject_to_subid(subject):
    if not isinstance(subject, str):
        return None
    re_match = re.match(r"ms(\d{4})", subject)
    if re_match:
        return_val = int(re_match[1])
        return return_val
    else:
        return None

### Data Cleaning

- Add column which is subid as int
- Set the index to the new column
- Remove any rows where subid is NaN
- Rename columns
- Check "Working with missing data" on pandas documentation

In [10]:
if not to_load:
    df = pd.read_csv(clinical_data_root / "Clinical_Data_All.csv")
    df = df.convert_dtypes()

    with open(msmri_home / "data" / "clinical_data_columns_full.txt", 'r') as f:
        keep_columns = [line.rstrip() for line in f.readlines()]
    df = df[keep_columns]
    new_columns = {
        "ID": "subject",
        "age_at_obs_start": "age",
    }
    df.rename(columns=new_columns, inplace=True)
    df["subid"] = df["subject"].apply(subject_to_subid)
    df.drop(df[df["subid"].isna()].index, inplace=True)
    df["subid"] = df["subid"].astype(int)
    df = df.set_index("subid")

    new_columns = {}
    for col in df.columns:
        new_columns[col] = col.replace(" ", "_")
    df.rename(columns=new_columns, inplace=True)
    df.head()

In [11]:
dataset_proc = DataSetProcesser.new_dataset(dataroot, scan_3Tpioneer_bids, filters=[fm.filter_first_ses])
dataset = dataset_proc.dataset

### Additional Data

- ROI size

In [None]:
if not to_load:
    dataset_proc = DataSetProcesser.new_dataset(dataroot, scan_3Tpioneer_bids, filters=[fm.filter_first_ses])
    dataset = dataset_proc.dataset

    volumes = dict(pituitary=[], pineal=[], choroid=[], subid=[])
    for subid, _ in tqdm(df.iterrows(), total=len(df)):
        scan = dataset.find_scan(subid=str(subid))[0]
        volumes['subid'].append(subid)
        try:
            pituitary_label = fm.find_label(scan, "pituitary", ["CH", "DT"])
        except FileNotFoundError:
            volumes['pituitary'].append(None)
        else:
            volumes['pituitary'].append(utils.compute_volume(pituitary_label)[1])
        try:
            pineal_label = fm.find_label(scan, "pineal", ["CH", "SRS"])
        except FileNotFoundError:
            volumes['pineal'].append(None)
        else:
            volumes['pineal'].append(utils.compute_volume(pineal_label)[1])
        try:
            choroid_label = fm.find_label(scan, "choroid_t1_flair", ["CH", "ED"])
        except FileNotFoundError:
            volumes['choroid'].append(None)
        else:
            volumes['choroid'].append(utils.compute_volume(choroid_label)[1])


    df.insert(7, 'pituitary_volume', volumes['pituitary'])
    df.insert(8, 'pituitary_vol_frac', df['pituitary_volume'] / df['brain'])
    df.insert(7, 'pineal_volume', volumes['pineal'])
    df.insert(8, 'pineal_vol_frac', df['pineal_volume'] / df['brain'])
    df.insert(7, 'choroid_volume', volumes['choroid'])
    df.insert(8, 'choroid_vol_frac', df['choroid_volume'] / df['brain'])
        
    df.to_csv(project_dataroot / "clinical_data_full.csv")

In [None]:
if to_load:
    df = pd.read_csv(project_dataroot / "clinical_data.csv")

In [62]:
df_full = df
not_nas = (~df_full['pineal_volume'].isna() & ~df_full['choroid_volume'].isna() & ~df_full['pituitary_volume'].isna())
df = df_full.loc[not_nas, :]


### Prepare Data

- Set categorical variables
  - ms_type
  - race
  - ethnicity
  - sex
- Set edss as ordinal variable

Do I need to factorize the categorical variables if I just set their dtype as "category"?

In [65]:
categorical_vars = ['sex', 'ms_type']
for var in categorical_vars:
    df[var] = df[var].astype("category")

# edss is ordinal variable
edss_type = pd.CategoricalDtype(categories=np.linspace(0, 10, 21), ordered=True)
df['numEDSS'] = df['numEDSS'].astype(edss_type)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var] = df[var].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['numEDSS'] = df['numEDSS'].astype(edss_type)


In [66]:
print(df.columns)
df.head()

Index(['subject', 'ms_type', 'age', 'sex', 'sz_onset', 'numEDSS', 'dzdur',
       'choroid_volume', 'choroid_vol_frac', 'pineal_volume',
       'pineal_vol_frac', 'pituitary_volume', 'pituitary_vol_frac', 'bpf',
       'cortical_thickness', 'vscaling', 'pgrey', 'grey', 'white', 'brain',
       'vcsf', 'thalamus', 'lesion_vol', 'lesion_vol_cubic'],
      dtype='object')


Unnamed: 0_level_0,subject,ms_type,age,sex,sz_onset,numEDSS,dzdur,choroid_volume,choroid_vol_frac,pineal_volume,...,cortical_thickness,vscaling,pgrey,grey,white,brain,vcsf,thalamus,lesion_vol,lesion_vol_cubic
subid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010,ms1010,RRMS,20.133603,Female,1/18/2015,10.0,3.058095603,1933.325928,1.4231,150.01709,...,2.520112,1.424417,574.209274,761.14905,597.381813,1358.530863,34.530367,18.864839,1.4367,1.12838
1011,ms1011,RRMS,28.880797,Female,1/1/2010,10.0,8.692438263,2303.349365,1.558331,283.630951,...,2.420146,1.362021,611.632085,812.513441,665.573704,1478.087146,42.174506,19.926914,1.8185,1.220594
1019,ms1019,RRMS,66.470459,Female,1/1/2009,2.0,10.43092592,3353.079346,2.596452,133.119675,...,2.103185,1.38752,531.960248,699.879055,591.529002,1291.408057,89.537255,17.112361,3.7688,1.556208
1029,ms1029,NIND,67.395828,Female,.,3.0,#VALUE!,2429.476318,1.888807,154.114304,...,2.207249,1.433026,518.138503,693.608909,592.640526,1286.249435,51.680725,18.297979,5.4155,1.756088
1033,ms1033,RRMS,48.943219,Female,1/1/1992,1.0,25.87745715,1834.491211,1.244348,312.319214,...,2.364547,1.374984,592.647314,796.912496,677.345968,1474.258463,29.728157,19.121122,1.9574,1.250911


## Statistical Analysis

### Descriptive Stats


In [68]:
print(len(df), "patients in total")
print(sum(df['ms_type'] == "RRMS"), "patients with RRMS")
print(sum(df['ms_type'] == "OIND"), "patients with OIND")
print(sum(df['ms_type'] == "NIND"), "patients with NIND")

40 patients in total
20 patients with RRMS
9 patients with OIND
11 patients with NIND


In [69]:
print("Proportion of female subjects: {:0.2f}".format(sum(df['sex'] == "Female") / len(df)) )
print("Average age at scan: {:0.2f}".format(df['age'].mean()))

Proportion of female subjects: 0.90
Average age at scan: 50.08


In [70]:
df_rrms = df.loc[df["ms_type"] == "RRMS"]
print("Fraction of female RRMS patients: {:0.2f}".format(sum(df_rrms['sex'] == "Female") / len(df_rrms)))
print("Average age of RRMS patient: {:0.2f}".format(df_rrms['age'].mean()))
print("\n")

df_oind = df.loc[df["ms_type"] == "OIND"]
print("Fraction of female OIND patients: {:0.2f}".format(sum(df_oind['sex'] == "Female") / len(df_oind)))
print("Average age of OIND patient: {:0.2f}".format(df_oind['age'].mean()))
print("\n")

df_nind = df.loc[df["ms_type"] == "NIND"]
print("Fraction of female NIND patients: {:0.2f}".format(sum(df_nind['sex'] == "Female") / len(df_nind)))
print("Average age of NIND patient: {:0.2f}".format(df_nind['age'].mean()))

Fraction of female RRMS patients: 0.90
Average age of RRMS patient: 49.58


Fraction of female OIND patients: 0.89
Average age of OIND patient: 49.68


Fraction of female NIND patients: 0.91
Average age of NIND patient: 51.34


#### Volumes

In [72]:
# choroid
print("Mean choroid volumes:")
print("RRMS: {:0.2f}".format(df_rrms['choroid_volume'].mean()))
print("OIND: {:0.2f}".format(df_oind['choroid_volume'].mean()))
print("NIND: {:0.2f}".format(df_nind['choroid_volume'].mean()))
print("\n")

# pineal
print("Mean pineal volumes:")
print("RRMS: {:0.2f}".format(df_rrms['pineal_volume'].mean()))
print("OIND: {:0.2f}".format(df_oind['pineal_volume'].mean()))
print("NIND: {:0.2f}".format(df_nind['pineal_volume'].mean()))
print("\n")

# pituitary
print("Mean pituitary volumes:")
print("RRMS: {:0.2f}".format(df_rrms['pituitary_volume'].mean()))
print("OIND: {:0.2f}".format(df_oind['pituitary_volume'].mean()))
print("NIND: {:0.2f}".format(df_nind['pituitary_volume'].mean()))

Mean choroid volumes:
RRMS: 2049.62
OIND: 1957.32
NIND: 1873.22


Mean pineal volumes:
RRMS: 221.39
OIND: 214.76
NIND: 506.09


Mean pituitary volumes:
RRMS: 758.37
OIND: 676.86
NIND: 612.96


### Regression Models

Looking at how volume of ROI predicts ms_type

#### Pituitary Volume

In [74]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

df['sex_factor'], _ = df['sex'].factorize()
predictors = ["age", "brain", "sex_factor", "pituitary_volume"]
outcome = "ms_type"

x = df[predictors]
Y = df[outcome]

mod_prob = OrderedModel(Y, x, distr='probit')
res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sex_factor'], _ = df['sex'].factorize()


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

#### statsmodels usage

- `statsmodels.regression.linear_model.OLS`
- `statsmodels.discrete.discrete_model.Logit`
- `statsmodels.discrete.discrete_model.Probit`
- `statsmodels.miscmodels.ordinal_model.OrderedModel`

Why is OrderedModel not available under `statsmodels.api` unlike all the others? Also, OrderedModel isn't available under `statsmodels.miscmodels.api` either

In [None]:
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

spector_data = sm.datasets.spector.load()

Y = spector_data.endog
x = spector_data.exog

# Linear regression
mod = sm.OLS(Y, x)

# Regression with discrete dependent variable
logit_mod = sm.Logit(Y, x)
probit_mod = sm.Probit(Y, x)


# Ordinal regression
mod = OrderedModel(Y, x, distr='probit')

Linear regression predicting pituitary size

In [95]:
import statsmodels.api as sm
import numpy as np

df.loc[:, 'sex_factor'], _ = df['sex'].factorize()
df.loc[:, 'ms_type_factor'], _ = df['ms_type'].factorize()
predictors = ["age", "brain", "sex_factor", "ms_type_factor"]
outcome = "choroid_volume"

x = np.asarray(df[predictors]).astype(np.float64)
Y = np.asarray(df[outcome]).astype(np.float64)

mod = sm.OLS(Y, x, missing='drop')

res = mod.fit()

print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.934
Model:                            OLS   Adj. R-squared (uncentered):              0.927
Method:                 Least Squares   F-statistic:                              128.4
Date:                Mon, 16 Dec 2024   Prob (F-statistic):                    8.81e-21
Time:                        15:48:58   Log-Likelihood:                         -307.75
No. Observations:                  40   AIC:                                      623.5
Df Residuals:                      36   BIC:                                      630.3
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [87]:
x

array([[20.13360346, 1358.530863, 0, 0],
       [28.88079724, 1478.087146, 0, 0],
       [66.4704594, 1291.408057, 0, 0],
       [67.39582763, 1286.249435, 0, 1],
       [48.94321853, 1474.258463, 0, 0],
       [58.76362043, 1442.115777, 0, 0],
       [75.8610305, 1206.354359, 0, 0],
       [47.24305974, 1321.368209, 0, 0],
       [59.01823359, 1347.157896, 0, 0],
       [35.78820566, 1598.615826, 0, 0],
       [56.38175546, 1382.525872, 0, 0],
       [38.74226578, 1498.903488, 0, 0],
       [60.59793024, 1404.76219, 1, 0],
       [59.4590155, 1285.548297, 0, 0],
       [65.29595357, 1431.563638, 0, 0],
       [61.96408038, 1371.028524, 1, 2],
       [37.48015113, 1506.950728, 0, 0],
       [72.34846411, 1284.763022, 0, 2],
       [42.35065433, 1448.020589, 0, 0],
       [40.0208071, 1365.799575, 0, 0],
       [58.3036741, 1292.553651, 1, 0],
       [37.07222253, 1470.796433, 0, 0],
       [46.06855391, 1535.215347, 0, 1],
       [54.71444998, 1291.236497, 0, 0],
       [35.52537918, 1

In [23]:
import numpy as np 
np.asarray(x)

array([[2.01336035e+01, 1.26161339e+03, 0.00000000e+00, 0.00000000e+00],
       [2.88807972e+01, 1.45832702e+03, 0.00000000e+00, 0.00000000e+00],
       [6.64704594e+01, 1.35252909e+03, 0.00000000e+00, 0.00000000e+00],
       [4.89432185e+01, 1.43492793e+03, 0.00000000e+00, 0.00000000e+00],
       [5.90182336e+01, 1.34423632e+03, 0.00000000e+00, 0.00000000e+00],
       [3.57882057e+01, 1.82958525e+03, 0.00000000e+00, 0.00000000e+00],
       [5.94590155e+01, 1.35972546e+03, 0.00000000e+00, 0.00000000e+00],
       [           nan, 1.41245473e+03, 0.00000000e+00, 0.00000000e+00],
       [4.23178010e+01, 1.37809810e+03, 0.00000000e+00, 0.00000000e+00],
       [6.19640804e+01, 1.59898444e+03, 1.00000000e+00, 1.00000000e+00],
       [3.74801511e+01, 1.49433394e+03, 0.00000000e+00, 0.00000000e+00],
       [7.23484641e+01, 1.26129874e+03, 0.00000000e+00, 1.00000000e+00],
       [4.23506543e+01, 1.40538667e+03, 0.00000000e+00, 0.00000000e+00],
       [4.00208071e+01, 1.40781802e+03, 0.00000000e

In [None]:
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)

x1 = 