In [1]:
import pandas as pd
from reload_recursive import reload_recursive
import os
from loguru import logger
from pathlib import Path
import json
import numpy as np
from tqdm.notebook import tqdm
import re
import sys

import mri_data
import monai_training

In [2]:
# Reload the two project packages before importing names from them below.
# NOTE(review): presumably reload_recursive also reloads their submodules so that
# on-disk edits are picked up without a kernel restart — confirm against the helper.
reload_recursive(mri_data)
reload_recursive(monai_training)

from mri_data.file_manager import DataSet, scan_3Tpioneer_bids
from mri_data import file_manager as fm
from mri_data import utils
from monai_training.preprocess import DataSetProcesser

In [3]:
# Drop loguru's existing handlers, then log to stderr at INFO level and above.
logger.remove()
# logger.add returns the id of the new handler; as the last expression of the
# cell it is echoed as the cell output.
logger.add(sys.stderr, level="INFO")

1

In [4]:
# When True, later cells load the cached clinical_data.csv; when False they
# rebuild it from the raw spreadsheet and the label files.
to_load = True
drive_root = fm.get_drive_root()
# Project checkout. Set the MSMRI_HOME environment variable to run this
# notebook on a machine where the checkout lives somewhere else.
msmri_home = Path(os.environ.get("MSMRI_HOME", "/home/srs-9/Projects/ms_mri"))
inference_root = drive_root / "3Tpioneer_bids_predictions"
dataroot = drive_root / "3Tpioneer_bids"
clinical_data_root = drive_root / "Secure_Data"
project_dataroot = msmri_home / "data"

In [5]:
# Echo the resolved directory that the raw clinical spreadsheet is read from.
clinical_data_root

PosixPath('/media/smbshare/Secure_Data')

In [6]:
def subject_to_subid(subject):
    """Convert a subject label such as ``"ms1010"`` into its integer id.

    Returns ``None`` when ``subject`` is not a string, or when it does not
    begin with ``ms`` followed by four digits. Only the first four digits
    after the prefix are used.
    """
    if isinstance(subject, str):
        found = re.match(r"ms(\d{4})", subject)
        if found is not None:
            return int(found.group(1))
    return None

### Data Cleaning

- Add column which is subid as int
- Set the index to the new column
- Remove any rows where subid is NaN
- Rename columns
- Check "Working with missing data" on pandas documentation

In [19]:
if not to_load:
    # Rebuild the clinical table from the raw spreadsheet; skipped when the
    # cached CSV is loaded instead.
    df = pd.read_excel(
        clinical_data_root / "manual_labeling_clinicaldata (UNSECURE).xlsx"
    ).convert_dtypes()

    # One column name per line. Blank lines are skipped so they cannot turn
    # into a lookup for an empty column name (KeyError).
    with open(project_dataroot / "clinical_data_columns.txt", 'r') as f:
        keep_columns = [line.rstrip() for line in f if line.strip()]

    new_columns = {
        "ID#": "subject",
        "FLAIR contrast?": "FLAIR_contrast",
        "age_at_scan": "age",
    }
    # A single chain instead of repeated inplace edits, so the cell never
    # mutates a column-subset of the raw frame.
    df = (
        df[keep_columns]
        .rename(columns=new_columns)
        # Integer subject id parsed from labels like "ms1010"; None if unparseable.
        .assign(subid=lambda frame: frame["subject"].apply(subject_to_subid))
        # Rows whose subject label could not be parsed are discarded.
        .dropna(subset=["subid"])
        .astype({"subid": int})
        .set_index("subid")
        # Remaining column names: spaces -> underscores.
        .rename(columns=lambda name: name.replace(" ", "_"))
    )
    # The original ended with `df.head()` here; nested inside the `if` it was
    # never displayed by Jupyter, so it has been removed.

### Additional Data

- ROI size

In [21]:
def _label_volume(scan, label_name, raters):
    """Return the volume of one labelled structure for ``scan``, or NaN if it has no label.

    ``fm.find_label`` raising ``FileNotFoundError`` is treated as "no label".
    Otherwise the volume is element ``[1]`` of ``utils.compute_volume``'s result.
    """
    try:
        label = fm.find_label(scan, label_name, raters)
    except FileNotFoundError:
        # NaN rather than None keeps the column numeric even when every
        # subject is missing this label, so the later division still works.
        return np.nan
    return utils.compute_volume(label)[1]


if not to_load:
    dataset_proc = DataSetProcesser.new_dataset(dataroot, scan_3Tpioneer_bids, filters=[fm.filter_first_ses])
    dataset = dataset_proc.dataset

    # structure -> (label name, list passed as find_label's third argument).
    # NOTE(review): the two-/three-letter codes look like rater initials — confirm.
    structures = {
        "pituitary": ("pituitary", ["CH", "DT"]),
        "pineal": ("pineal", ["CH", "SRS"]),
        "choroid": ("choroid_t1_flair", ["CH", "ED"]),
    }

    volumes = dict(pituitary=[], pineal=[], choroid=[], subid=[])
    for subid in tqdm(df.index, total=len(df)):
        # NOTE(review): assumes find_scan returns at least one scan per subid;
        # an empty result would raise IndexError here.
        scan = dataset.find_scan(subid=str(subid))[0]
        volumes['subid'].append(subid)
        for structure, (label_name, raters) in structures.items():
            volumes[structure].append(_label_volume(scan, label_name, raters))

    # DataFrame.insert refuses to add a column that already exists, so drop any
    # columns left over from a previous run of this cell first.
    derived_columns = [
        f"{structure}_{suffix}"
        for structure in structures
        for suffix in ("volume", "vol_frac")
    ]
    df = df.drop(columns=derived_columns, errors="ignore")

    # Each pair is inserted at positions 7/8, pushing earlier pairs to the right,
    # so the final order is choroid, pineal, pituitary.
    for structure in structures:
        df.insert(7, f"{structure}_volume", volumes[structure])
        df.insert(8, f"{structure}_vol_frac", df[f"{structure}_volume"] / df['vol_TIV'])

    df.to_csv(project_dataroot / "clinical_data.csv")

In [13]:
if to_load:
    # The cache was written by df.to_csv with `subid` as the index. Restore it
    # on load so the frame has the same shape as the freshly built one
    # (otherwise `subid` comes back as an ordinary column).
    df = pd.read_csv(project_dataroot / "clinical_data.csv", index_col="subid")

### Prepare Data

- Set categorical variables
  - phenotype
  - race
  - ethnicity
  - sex
- Set edss as ordinal variable

**TODO:** Do the categorical variables still need to be factorized if their dtype is already set to `category`?

In [22]:
categorical_vars = ['race','ethnicity', 'sex', 'phenotype', 'FLAIR_contrast']

# edss is an ordinal variable: half-point steps from 0 to 10, in increasing order
edss_type = pd.CategoricalDtype(categories=np.linspace(0, 10, 21), ordered=True)

# Cast every nominal column and edss in a single astype call.
column_dtypes = {name: "category" for name in categorical_vars}
column_dtypes['edss'] = edss_type
df = df.astype(column_dtypes)

In [28]:
# Push the contiguous run of columns from "BPF" through "Brainstem" (the other
# assorted volumes) to the end of the frame, keeping everything else in place.
cols = df.columns.to_list()
first_moved = cols.index("BPF")
after_last_moved = cols.index("Brainstem") + 1
moved_block = cols[first_moved:after_last_moved]
cols = cols[:first_moved] + cols[after_last_moved:] + moved_block
df = df[cols]

In [30]:
# Print the full column list (the rich preview below elides the middle columns),
# then show the first rows as the cell output.
print(df.columns)
df.head()

Index(['subject', 'FLAIR_contrast', 'phenotype', 'age_at_scan', 'race',
       'ethnicity', 'sex', 'choroid_volume', 'choroid_vol_frac',
       'pineal_volume', 'pineal_vol_frac', 'pituitary_volume',
       'pituitary_vol_frac', 'symptom_onset', 'edss', 'dzdur', 'num_lesions',
       'lesion_vol', 'BPF', 'vol_GM', 'vol_WM', 'vol_CSF', 'vol_WMH',
       'vol_TIV', 'cortical_thickness', 'cortical_thickness_SD', 'VSCALING',
       'pgrey', 'grey', 'white', 'brain', 'vcsf', 'Lthal', 'Rthal', 'Lhippo',
       'Rhippo', 'Lamy', 'Ramy', 'Lputam', 'Rputam', 'Lcaud', 'Rcaud',
       'Laccumb', 'Raccumb', 'Lpall', 'Rpall', 'Brainstem'],
      dtype='object')


Unnamed: 0_level_0,subject,FLAIR_contrast,phenotype,age_at_scan,race,ethnicity,sex,choroid_volume,choroid_vol_frac,pineal_volume,...,Ramy,Lputam,Rputam,Lcaud,Rcaud,Laccumb,Raccumb,Lpall,Rpall,Brainstem
subid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010,ms1010,no,RRMS,20.133603,White,Hispanic or Latino,Female,1933.325928,1.532423,150.01709,...,1.570915,6.533811,6.654875,5.372034,5.79357,0.883184,0.773059,2.308968,2.323554,29.891876
1011,ms1011,yes,RRMS,28.880797,Other,Hispanic or Latino,Female,2303.349365,1.579446,283.630951,...,2.057197,6.46169,6.791539,5.113006,5.948437,0.974902,0.72246,2.410756,2.308942,30.081795
1019,ms1019,no,RRMS,66.470459,White,Not Hispanic or Latino,Female,3353.079346,2.479118,133.119675,...,1.924501,4.778929,4.688707,3.812061,3.999609,0.628003,0.110114,1.035068,1.307155,31.498878
1033,ms1033,yes,RRMS,48.943219,White,Not Hispanic or Latino,Female,1834.491211,1.278455,312.319214,...,0.748343,6.117689,5.956475,4.562571,4.787144,0.864502,0.663864,2.187303,1.713516,28.571508
1065,ms1065,yes,RRMS,59.018234,White,Not Hispanic or Latino,Female,3014.648193,2.242647,161.279587,...,1.964459,5.314401,6.116707,3.193821,3.477359,0.60768,0.52927,1.988262,2.130381,26.807235


## Statistical Analysis

### Descriptive Stats


In [57]:
# Cohort size overall, then one line per phenotype.
print(len(df), "patients in total")
for phenotype_label, caption in (
    ("RRMS", "patients with RRMS"),
    ("OIND", "patients with OIND"),
    ("NIND", "patients with NIND"),
):
    print(sum(df['phenotype'] == phenotype_label), caption)

36 patients in total
15 patients with RRMS
9 patients with OIND
12 patients with NIND


In [None]:
# Share of female subjects across the whole cohort.
female_share = sum(df['sex'] == "Female") / len(df)
print("Proportion of female subjects: {:0.2f}".format(female_share))

# Mean age over all subjects.
mean_age = df['age'].mean()
print("Average age at scan: {:0.2f}".format(mean_age))

Proportion of female subjects: 0.92
Average age at scan: 46.75


In [76]:
def _print_cohort_summary(cohort, female_template, age_template):
    """Print the female fraction and the mean age of ``cohort`` using the given templates."""
    female_fraction = sum(cohort['sex'] == "Female") / len(cohort)
    print(female_template.format(female_fraction))
    print(age_template.format(cohort['age'].mean()))


# One sub-frame per phenotype; these are reused by the volume summaries below.
df_rrms = df.loc[df["phenotype"] == "RRMS"]
_print_cohort_summary(
    df_rrms,
    "Fraction of female RRMS patients: {:0.2f}",
    "Average age of RRMS patient: {:0.2f}",
)
print("\n")

df_oind = df.loc[df["phenotype"] == "OIND"]
_print_cohort_summary(
    df_oind,
    "Fraction of female OIND patients: {:0.2f}",
    "Average age of OIND patient: {:0.2f}",
)
print("\n")

df_nind = df.loc[df["phenotype"] == "NIND"]
_print_cohort_summary(
    df_nind,
    "Fraction of female NIND patients: {:0.2f}",
    "Average age of NIND patient: {:0.2f}",
)

Fraction of female RRMS patients: 0.93
Average age of RRMS patient: 45.07


Fraction of female OIND patients: 0.89
Average age of OIND patient: 49.68


Fraction of female NIND patients: 0.92
Average age of NIND patient: 46.50


#### Volumes

In [80]:
# Mean volume of each structure (choroid, pineal, pituitary) per phenotype group.
cohort_rows = (
    ("RRMS: {:0.2f}", df_rrms),
    ("OIND: {:0.2f}", df_oind),
    ("NIND: {:0.2f}", df_nind),
)
volume_sections = (
    ("Mean choroid volumes:", 'choroid_volume'),
    ("Mean pineal volumes:", 'pineal_volume'),
    ("Mean pituitary volumes:", 'pituitary_volume'),
)

for section_number, (heading, volume_column) in enumerate(volume_sections):
    # Blank separator between sections, but not before the first one.
    if section_number > 0:
        print("\n")
    print(heading)
    for row_template, cohort in cohort_rows:
        print(row_template.format(cohort[volume_column].mean()))

Mean choroid volumes:
RRMS: 1849.70
OIND: 1957.32
NIND: 1814.22


Mean pineal volumes:
RRMS: 258.73
OIND: 203.84
NIND: 487.55


Mean pituitary volumes:
RRMS: 783.61
OIND: 712.28
NIND: 594.03


### Regression Models

Looking at how volume of ROI predicts phenotype

#### Pituitary Volume

In [None]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

# The cleaning cell renames "age_at_scan" to "age", but a CSV cached before that
# rename still carries the old name. Use whichever one this frame has.
age_col = "age" if "age" in df.columns else "age_at_scan"

# Integer codes for sex. Indexing [0] instead of tuple-unpacking into `_`
# avoids clobbering IPython's last-output variable.
df['sex_factor'] = df['sex'].factorize()[0]
predictors = [age_col, "vol_TIV", "sex_factor", "pituitary_volume"]
outcome = "phenotype"

# Subjects without a pituitary label have a missing volume, and statsmodels does
# not drop missing rows by default, so fit only on complete rows. `sex` is
# checked explicitly because factorize encodes a missing value as -1, not NaN.
complete_rows = df[predictors + [outcome, "sex"]].notna().all(axis=1)
model_df = df.loc[complete_rows]

x = model_df[predictors]
Y = model_df[outcome]

# NOTE(review): phenotype is cast to an unordered category earlier in the notebook;
# OrderedModel will treat the category order as an ordinal scale — confirm that
# this is intended.
mod_prob = OrderedModel(Y, x, distr='probit')
res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()


#### statsmodels usage

- `statsmodels.regression.linear_model.OLS`
- `statsmodels.discrete.discrete_model.Logit`
- `statsmodels.discrete.discrete_model.Probit`
- `statsmodels.miscmodels.ordinal_model.OrderedModel`

Why is `OrderedModel` not available under `statsmodels.api`, unlike all the others? It isn't available under `statsmodels.miscmodels.api` either.

In [None]:
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Example dataset bundled with statsmodels, used only to show how each model
# class is constructed (nothing is fitted in this cell).
spector_data = sm.datasets.spector.load()
Y, x = spector_data.endog, spector_data.exog

# Linear regression
mod = sm.OLS(endog=Y, exog=x)

# Regression with discrete dependent variable
logit_mod = sm.Logit(endog=Y, exog=x)
probit_mod = sm.Probit(endog=Y, exog=x)

# Ordinal regression
mod = OrderedModel(endog=Y, exog=x, distr='probit')