In [19]:
import pandas as pd
from reload_recursive import reload_recursive
import os
from loguru import logger
from pathlib import Path
import json
import numpy as np
from tqdm.notebook import tqdm
import re
import sys

import mri_data
import monai_training

In [20]:
reload_recursive(mri_data)
reload_recursive(monai_training)

from mri_data.file_manager import DataSet, scan_3Tpioneer_bids
from mri_data import file_manager as fm
from mri_data import utils
from monai_training.preprocess import DataSetProcesser

In [21]:
logger.remove()
logger.add(sys.stderr, level="INFO")

3

In [22]:
to_load = True
drive_root = fm.get_drive_root()
msmri_home = Path("/home/srs-9/Projects/ms_mri")
inference_root = drive_root / "3Tpioneer_bids_predictions"
dataroot = drive_root / "3Tpioneer_bids"
clinical_data_root = drive_root / "Secure_Data" / "Large"
project_dataroot = msmri_home / "data"

In [23]:
clinical_data_root

PosixPath('/mnt/h/Secure_Data/Large')

In [24]:
def subject_to_subid(subject):
    if not isinstance(subject, str):
        return None
    re_match = re.match(r"ms(\d{4})", subject)
    if re_match:
        return_val = int(re_match[1])
        return return_val
    else:
        return None

### Data Cleaning

- Add column which is subid as int
- Set the index to the new column
- Remove any rows where subid is NaN
- Rename columns
- Check "Working with missing data" on pandas documentation

In [25]:
if not to_load:
    df = pd.read_csv(clinical_data_root / "Clinical_Data_All.csv")
    df = df.convert_dtypes()

    with open(msmri_home / "data" / "clinical_data_columns_full.txt", 'r') as f:
        keep_columns = [line.rstrip() for line in f.readlines()]
    df = df[keep_columns]
    new_columns = {
        "ID": "subject",
        "age_at_obs_start": "age",
    }
    df.rename(columns=new_columns, inplace=True)
    df["subid"] = df["subject"].apply(subject_to_subid)
    df.drop(df[df["subid"].isna()].index, inplace=True)
    df["subid"] = df["subid"].astype(int)
    df = df.set_index("subid")

    new_columns = {}
    for col in df.columns:
        new_columns[col] = col.replace(" ", "_")
    df.rename(columns=new_columns, inplace=True)
    df.head()

### Additional Data

- ROI size

In [26]:
if not to_load:
    dataset_proc = DataSetProcesser.new_dataset(dataroot, scan_3Tpioneer_bids, filters=[fm.filter_first_ses])
    full_dataset = dataset_proc.dataset
    dataset_proc.prepare_labels(["choroid_t1_flair", "pineal", "pituitary"], ["CH", "SRS", "ED", "DT"])
    dataset = dataset_proc.dataset

    inference_dataset_proc = DataSetProcesser.new_dataset(inference_root, scan_3Tpioneer_bids, filters=[fm.filter_first_ses])
    inference_dataset_proc.prepare_labels("flair.t1_choroid_pineal_pituitary3_pred")
    inference_dataset = inference_dataset_proc.dataset

In [27]:
try:
    df.insert(7, 'tiv', None)
except ValueError:
    pass
try:
    df.insert(7, 'pituitary_volume', None)
except ValueError:
    pass
try:
    df.insert(7, 'pineal_volume', None)
except ValueError:
    pass
try:
    df.insert(7, 'choroid_volume', None)
except ValueError:
    pass

In [28]:
if not to_load:
    volumes = dict(pituitary=[], pineal=[], choroid=[], tiv=[], subid=[])
    subids = [subid for subid, _ in df.iterrows()]
    for subid, _ in tqdm(df.iterrows(), total=len(df)):
        scan = dataset.find_scan(subid=str(subid))
        if len(scan) == 0:
            scan = inference_dataset.find_scan(subid=str(subid))
        if len(scan) == 0:
            continue
        scan = scan[0]

        try:
            vol_stats = utils.compute_volume(scan.label_path, index_mask_file=scan.label_path)
        except Exception:
            continue
        try:
            roi_vols = [stat[1] for stat in vol_stats]
        except Exception:
            continue

        if len(roi_vols) < 3:
            continue

        scan = full_dataset.find_scan(subid=str(subid))[0]
        try:
            tiv = utils.compute_volume(scan.root / "t1.mask.nii.gz")[1]
        except Exception:
            continue

        df.loc[subid, 'choroid_volume'] = roi_vols[0]
        df.loc[subid, 'pineal_volume'] = roi_vols[1]
        df.loc[subid, 'pituitary_volume'] = roi_vols[2]
        df.loc[subid, 'tiv'] = tiv

        volumes['choroid'].append(roi_vols[0])
        volumes['pineal'].append(roi_vols[1])
        volumes['pituitary'].append(roi_vols[2])
        volumes['tiv'].append(tiv)
        volumes['subid'].append(subid)
        
    df.to_csv(project_dataroot / "clinical_data_full.csv")

In [29]:
if to_load:
    df = pd.read_csv(project_dataroot / "clinical_data_full.csv")

In [30]:
df_full = df
not_nas = (~df_full['pineal_volume'].isna() & ~df_full['choroid_volume'].isna() & ~df_full['pituitary_volume'].isna())
df = df_full.loc[not_nas, :]

### Prepare Data

- Set categorical variables
  - ms_type
  - race
  - ethnicity
  - sex
- Set edss as ordinal variable

Do I need to factorize the categorical variables if I just set their dtype as "category"?

In [32]:
categorical_vars = ['sex', 'ms_type']
for var in categorical_vars:
    df[var] = df[var].astype("category")

# edss is ordinal variable
edss_type = pd.CategoricalDtype(categories=np.linspace(0, 10, 21), ordered=True)
df['numEDSS'] = df['numEDSS'].astype(edss_type)
df.loc[df['dzdur'] == "#VALUE!", "dzdur"] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var] = df[var].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['numEDSS'] = df['numEDSS'].astype(edss_type)


In [33]:
df['ms_type'].unique()

['RRMS', 'PPMS', 'NIND', 'UNK', 'SPMS', 'RPMS', 'PRMS', 'HC', 'OIND', 'CIS']
Categories (10, object): ['CIS', 'HC', 'NIND', 'OIND', ..., 'RPMS', 'RRMS', 'SPMS', 'UNK']

In [None]:
print(df.columns)
df.head()

Index([           'subject',            'ms_type',                'age',
                      'sex',           'sz_onset',            'numEDSS',
                    'dzdur',     'choroid_volume',      'pineal_volume',
         'pituitary_volume',                'tiv',                'bpf',
       'cortical_thickness',           'vscaling',              'pgrey',
                     'grey',              'white',              'brain',
                     'vcsf',           'thalamus',         'lesion_vol',
         'lesion_vol_cubic',        ('tiv', 1001)],
      dtype='object')


Unnamed: 0_level_0,subject,ms_type,age,sex,sz_onset,numEDSS,dzdur,choroid_volume,pineal_volume,pituitary_volume,...,vscaling,pgrey,grey,white,brain,vcsf,thalamus,lesion_vol,lesion_vol_cubic,"(tiv, 1001)"
subid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,ms1001,RRMS,50.057493,Female,2/1/2003,,14.03931446,1762.325195,37.888454,685.064148,...,1.368794,594.168618,793.050463,656.104788,1449.15525,46.571322,17.636198,2.9486,1.433965,8
1002,ms1002,RRMS,27.079341,Female,4/1/2019,10.0,1.138914746,1711.622437,344.577332,471.041748,...,1.517059,711.733447,946.13135,733.628024,1679.759374,30.769859,21.952066,1.7162,1.197262,8
1003,ms1003,PPMS,68.04468,Female,1/1/2003,,14.2391721,2240.688232,210.448578,942.154175,...,1.281489,533.740848,669.666823,654.020676,1323.687499,53.882858,16.765896,6.8224,1.896614,8
1004,ms1004,RRMS,28.18814,Female,10/1/2015,10.0,3.931446093,620.542419,146.943634,233.47139,...,1.653051,595.517415,772.225656,652.317795,1424.54345,30.127786,21.176831,1.3281,1.099201,8
1005,ms1005,PPMS,47.059629,Female,1/1/2001,9.0,19.51486612,2380.852051,135.682953,1026.582397,...,1.444747,543.994298,700.592183,632.319278,1332.911461,72.107552,13.087697,18.5138,2.645444,8


## Statistical Analysis

### Descriptive Stats


In [None]:
print(len(df), "patients in total")
print(sum(df['ms_type'] == "RRMS"), "patients with RRMS")
print(sum(df['ms_type'] == "PPMS"), "patients with PPMS")
print(sum(df['ms_type'] == "SPMS"), "patients with SPMS")
print(sum(df['ms_type'] == "RPMS"), "patients with RPMS")
print(sum(df['ms_type'] == "PRMS"), "patients with PRMS")
print(sum(df['ms_type'] == "OIND"), "patients with OIND")
print(sum(df['ms_type'] == "NIND"), "patients with NIND")
print(sum(df['ms_type'] == "HC"), "HC")
print(sum(df['ms_type'] == "UNK"), "UNK")
print(sum(df['ms_type'] == "CIS"), "CIS")

248 patients in total
172 patients with RRMS
16 patients with PPMS
19 patients with SPMS
5 patients with RPMS
1 patients with PRMS
11 patients with OIND
20 patients with NIND
1 HC
2 UNK
1 CIS


In [None]:
print("Proportion of female subjects: {:0.2f}".format(sum(df['sex'] == "Female") / len(df)) )
print("Average age at scan: {:0.2f}".format(df['age'].mean()))

Proportion of female subjects: 0.79
Average age at scan: 48.55


In [None]:
df_rrms = df.loc[df["ms_type"] == "RRMS"]
print("Fraction of female RRMS patients: {:0.2f}".format(sum(df_rrms['sex'] == "Female") / len(df_rrms)))
print("Average age of RRMS patient: {:0.2f}".format(df_rrms['age'].mean()))
print("\n")

df_oind = df.loc[df["ms_type"] == "OIND"]
print("Fraction of female OIND patients: {:0.2f}".format(sum(df_oind['sex'] == "Female") / len(df_oind)))
print("Average age of OIND patient: {:0.2f}".format(df_oind['age'].mean()))
print("\n")

df_nind = df.loc[df["ms_type"] == "NIND"]
print("Fraction of female NIND patients: {:0.2f}".format(sum(df_nind['sex'] == "Female") / len(df_nind)))
print("Average age of NIND patient: {:0.2f}".format(df_nind['age'].mean()))

Fraction of female RRMS patients: 0.82
Average age of RRMS patient: 46.34


Fraction of female OIND patients: 0.82
Average age of OIND patient: 49.42


Fraction of female NIND patients: 0.85
Average age of NIND patient: 48.19


#### Volumes

In [None]:
# choroid
print("Mean choroid volumes:")
print("RRMS: {:0.2f}".format(df_rrms['choroid_volume'].mean()))
print("OIND: {:0.2f}".format(df_oind['choroid_volume'].mean()))
print("NIND: {:0.2f}".format(df_nind['choroid_volume'].mean()))
print("\n")

# pineal
print("Mean pineal volumes:")
print("RRMS: {:0.2f}".format(df_rrms['pineal_volume'].mean()))
print("OIND: {:0.2f}".format(df_oind['pineal_volume'].mean()))
print("NIND: {:0.2f}".format(df_nind['pineal_volume'].mean()))
print("\n")

# pituitary
print("Mean pituitary volumes:")
print("RRMS: {:0.2f}".format(df_rrms['pituitary_volume'].mean()))
print("OIND: {:0.2f}".format(df_oind['pituitary_volume'].mean()))
print("NIND: {:0.2f}".format(df_nind['pituitary_volume'].mean()))

Mean choroid volumes:
RRMS: 1688.83
OIND: 1925.77
NIND: 1860.47


Mean pineal volumes:
RRMS: 183.63
OIND: 211.41
NIND: 285.26


Mean pituitary volumes:
RRMS: 625.91
OIND: 674.54
NIND: 684.56


In [None]:
df.head()

Unnamed: 0_level_0,subject,ms_type,age,dz_type,sex,sz_onset,numEDSS,dzdur,choroid_volume,pineal_volume,...,white,brain,vcsf,thalamus,lesion_vol,lesion_vol_cubic,"(tiv, 1001)",sex_factor,ms_type_factor,dz_type_factor
subid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,ms1001,RRMS,50.057493,MS,Female,2/1/2003,,14.03931446,1762.325195,37.888454,...,656.104788,1449.15525,46.571322,17.636198,2.9486,1.433965,8,0,0,0
1002,ms1002,RRMS,27.079341,MS,Female,4/1/2019,10.0,1.138914746,1711.622437,344.577332,...,733.628024,1679.759374,30.769859,21.952066,1.7162,1.197262,8,0,0,0
1003,ms1003,PPMS,68.04468,MS,Female,1/1/2003,,14.2391721,2240.688232,210.448578,...,654.020676,1323.687499,53.882858,16.765896,6.8224,1.896614,8,0,1,0
1004,ms1004,RRMS,28.18814,MS,Female,10/1/2015,10.0,3.931446093,620.542419,146.943634,...,652.317795,1424.54345,30.127786,21.176831,1.3281,1.099201,8,0,0,0
1005,ms1005,PPMS,47.059629,MS,Female,1/1/2001,9.0,19.51486612,2380.852051,135.682953,...,632.319278,1332.911461,72.107552,13.087697,18.5138,2.645444,8,0,1,0


In [40]:
try:
    df.insert(3, "dz_type", None)
except Exception:
    pass

ms_subtypes = ['RRMS', 'PPMS', 'SPMS', 'RPMS', 'PRMS']
for subid, _ in df.iterrows():
    if df.loc[subid, 'ms_type'] in ms_subtypes:
        df.loc[subid, 'dz_type'] = "MS"
    else:
        df.loc[subid, 'dz_type'] = "Not MS"

In [None]:
import matplotlib.pyplot as plt

c = []
for 

### Regression Models

Looking at how volume of ROI predicts ms_type

#### statsmodels usage

- `statsmodels.regression.linear_model.OLS`
- `statsmodels.discrete.discrete_model.Logit`
- `statsmodels.discrete.discrete_model.Probit`
- `statsmodels.miscmodels.ordinal_model.OrderedModel`

Why is OrderedModel not available under `statsmodels.api` unlike all the others? Also, OrderedModel isn't available under `statsmodels.miscmodels.api` either

Linear regression predicting pituitary size

In [85]:
check = df.loc[~df['dzdur'].isna(), 'dzdur'].astype('float64')
check.median()

10.43092592

In [72]:
df['dzdur'].isna()

0      False
1      False
2      False
3      False
4      False
       ...  
447     True
449     True
461     True
480     True
484     True
Name: dzdur, Length: 248, dtype: bool

In [46]:
import statsmodels.api as sm
import numpy as np

df.loc[:, 'sex_factor'], _ = df['sex'].factorize()
df.loc[:, 'dz_type_factor'], _ = df['dz_type'].factorize()
predictors = ["age", "sex_factor", "dz_type_factor"]
outcome = "choroid_volume"

x = np.asarray(df[predictors]).astype(np.float64)
Y = np.asarray(df[outcome]).astype(np.float64)

mod = sm.OLS(Y, x, missing='drop')

res = mod.fit()

print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.870
Model:                            OLS   Adj. R-squared (uncentered):              0.868
Method:                 Least Squares   F-statistic:                              544.6
Date:                Tue, 31 Dec 2024   Prob (F-statistic):                   4.87e-108
Time:                        14:12:52   Log-Likelihood:                         -1957.6
No. Observations:                 248   AIC:                                      3921.
Df Residuals:                     245   BIC:                                      3932.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
ms_subtypes2 = ["RRMS", "PPMS", "SPMS"]
df_ms = df[df['ms_type'].isin(ms_subtypes2)]

In [None]:
df_ms.loc[:, 'sex_factor'], _ = df_ms['sex'].factorize()
df_ms.loc[:, 'ms_type_factor'], _ = df_ms['ms_type'].factorize()
predictors = ["age", "sex_factor", "ms_type_factor"]
outcome = "pineal_volume"

x = np.asarray(df_ms[predictors]).astype(np.float64)
Y = np.asarray(df_ms[outcome]).astype(np.float64)

mod = sm.OLS(Y, x, missing='drop')

res = mod.fit()

print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.688
Model:                            OLS   Adj. R-squared (uncentered):              0.683
Method:                 Least Squares   F-statistic:                              149.9
Date:                Mon, 30 Dec 2024   Prob (F-statistic):                    2.52e-51
Time:                        18:29:34   Log-Likelihood:                         -1275.1
No. Observations:                 207   AIC:                                      2556.
Df Residuals:                     204   BIC:                                      2566.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
predictors = ["tiv"]
outcome = "pituitary_volume"

x = np.asarray(df_ms[predictors]).astype(np.float64)
Y = np.asarray(df_ms[outcome]).astype(np.float64)

mod = sm.OLS(Y, x, missing='drop')

res = mod.fit()

print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.885
Model:                            OLS   Adj. R-squared (uncentered):              0.885
Method:                 Least Squares   F-statistic:                              1592.
Date:                Mon, 30 Dec 2024   Prob (F-statistic):                    7.34e-99
Time:                        18:30:39   Log-Likelihood:                         -1413.3
No. Observations:                 207   AIC:                                      2829.
Df Residuals:                     206   BIC:                                      2832.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
import numpy as np 
np.asarray(x)

array([[2.01336035e+01, 1.26161339e+03, 0.00000000e+00, 0.00000000e+00],
       [2.88807972e+01, 1.45832702e+03, 0.00000000e+00, 0.00000000e+00],
       [6.64704594e+01, 1.35252909e+03, 0.00000000e+00, 0.00000000e+00],
       [4.89432185e+01, 1.43492793e+03, 0.00000000e+00, 0.00000000e+00],
       [5.90182336e+01, 1.34423632e+03, 0.00000000e+00, 0.00000000e+00],
       [3.57882057e+01, 1.82958525e+03, 0.00000000e+00, 0.00000000e+00],
       [5.94590155e+01, 1.35972546e+03, 0.00000000e+00, 0.00000000e+00],
       [           nan, 1.41245473e+03, 0.00000000e+00, 0.00000000e+00],
       [4.23178010e+01, 1.37809810e+03, 0.00000000e+00, 0.00000000e+00],
       [6.19640804e+01, 1.59898444e+03, 1.00000000e+00, 1.00000000e+00],
       [3.74801511e+01, 1.49433394e+03, 0.00000000e+00, 0.00000000e+00],
       [7.23484641e+01, 1.26129874e+03, 0.00000000e+00, 1.00000000e+00],
       [4.23506543e+01, 1.40538667e+03, 0.00000000e+00, 0.00000000e+00],
       [4.00208071e+01, 1.40781802e+03, 0.00000000e

In [None]:
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)

x1 = 