In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import numpy as np
import re
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import mannwhitneyu 

from mri_data import file_manager as fm

In [2]:
# Locations used throughout the notebook.
# NOTE(review): msmri_home is an absolute, machine-specific path; the notebook
# will only run unchanged where this directory exists.
drive_root = fm.get_drive_root()
msmri_home = Path("/home/srs-9/Projects/ms_mri")
msmri_datadir = msmri_home.joinpath("data")
curr_dir = msmri_home.joinpath("monai_analysis", "choroid_pineal_pituitary_T1-1")

## Data Cleaning

Load the volume dataset, keeping only subjects that have choroid, pineal, and pituitary volumes.

In [3]:
# Read the per-subject volume table and index it by subject id.
df_vols = pd.read_csv(curr_dir / "clinical_data_full.csv").set_index("subid")

# Columns carried forward from the volume table.
keep_cols = [
    "choroid_volume",
    "pineal_volume",
    "pituitary_volume",
    "tiv",
    "flair_contrast",
    "label",
    "scan_folder",
    "age",
]

# A row is kept only when all three structure volumes are present.
volume_cols = ["pineal_volume", "choroid_volume", "pituitary_volume"]
not_nas = df_vols[volume_cols].notna().all(axis=1)
df_vols = df_vols.loc[not_nas, keep_cols]

In [4]:
def subject_to_subid(subject):
    """Convert a subject label such as ``"ms1234"`` into its integer id.

    The label must be a string that starts with ``ms`` followed by at least
    four digits; the first four digits are returned as an ``int``.

    Returns ``None`` when ``subject`` is not a string or does not start with
    that pattern.
    """
    if isinstance(subject, str):
        found = re.match(r"ms(\d{4})", subject)
        if found is not None:
            return int(found.group(1))
    return None

In [5]:
# Load the full clinical table and index it by the numeric subject id parsed
# from the "ID" column (rows whose ID does not parse get a missing subid).
df_full = pd.read_csv(msmri_datadir / "Clinical_Data_All_updated.csv")
df_full.insert(0, "subid", df_full["ID"].map(subject_to_subid))
df_full = df_full.set_index("subid")

# Outer-join the volume table with the clinical table on subid. Clinical
# columns whose names already exist in df_vols are dropped first, so the
# df_vols version of any shared column is the one that survives.
df = pd.merge(
    df_vols,
    df_full.loc[:, ~df_full.columns.isin(df_vols.columns)],
    how="outer",
    on="subid",
)

# Convert extracted_EDSS to floats; the "." placeholder becomes None.
df.loc[:, "extracted_EDSS"] = [
    float(val) if val != "." else None for val in df["extracted_EDSS"]
]  #! figure out what to do with "."
# Ordered-categorical view of the EDSS score.
# NOTE(review): assigning a Categorical through .loc may not keep the
# categorical dtype on every pandas version - check df["EDSS"].dtype.
df.loc[:, ["EDSS"]] = pd.Categorical(df["extracted_EDSS"], ordered=True)
# "#VALUE!" entries in PRL are treated as missing.
df.loc[df["PRL"] == "#VALUE!", "PRL"] = None
# Convert the remaining PRL entries to int.
# NOTE(review): int() raises on a float NaN and `val is not None` does not
# filter NaN, so this presumably relies on PRL having no NaN entries after
# the outer merge - confirm.
df.loc[:, "PRL"] = [
    int(val) if val != "#VALUE!" and val is not None else None for val in df["PRL"]
]
# "#VALUE!" entries in dzdur are treated as missing; the column is not
# otherwise converted here.
df.loc[df["dzdur"] == "#VALUE!", "dzdur"] = None

In [6]:
# Bin the PRL count into ordinal levels at three granularities. Each target
# column maps to a list of ranges; a row's level is the position of the range
# that contains its PRL value, and rows matching no range stay None.
prl_max = df["PRL"].max()
prl_binnings = {
    # 0 / 1-2 / 3-4 / 5+
    "PRL_LEVEL": [range(0, 1), range(1, 3), range(3, 5), range(5, prl_max + 1)],
    # 0 / 1-3 / 4+
    "PRL_LEVEL2": [range(0, 1), range(1, 4), range(4, prl_max + 1)],
    # 0 / 1+
    "IS_PRL": [range(0, 1), range(1, prl_max + 1)],
}

for level_col, prl_levels in prl_binnings.items():
    df.loc[:, [level_col]] = None
    for code, prl_counts in enumerate(prl_levels):
        df.loc[df["PRL"].isin(prl_counts), [level_col]] = code
    df.loc[:, [level_col]] = pd.Categorical(df[level_col], ordered=True)

In [11]:
# Blank out any remaining "." placeholders in extracted_EDSS.
# NOTE(review): the earlier cleaning cell already maps "." to None while
# converting this column to float, so on a top-to-bottom run this likely
# matches no rows - confirm whether the cell is still needed.
edss_placeholder_rows = df["extracted_EDSS"].eq(".")
df.loc[edss_placeholder_rows, "extracted_EDSS"] = None

In [7]:
# Build "dz_type5", a collapsed disease-type label derived from "ms_type".
#
# The column is inserted at position 18 the first time the cell runs. On a
# re-run it already exists, so the insert is skipped explicitly instead of
# relying on a swallowed exception (DataFrame.insert raises ValueError for a
# duplicate column). The position is clamped so a frame with fewer than 18
# columns gets the new column appended at the end.
if "dz_type5" not in df.columns:
    df.insert(min(18, df.shape[1]), "dz_type5", df["ms_type"])

# Always start from the raw ms_type labels so re-running the cell is idempotent.
df.loc[:, "dz_type5"] = df["ms_type"]

# Collapse relapsing phenotypes into RMS and progressive phenotypes into PMS;
# every other label is left as it is in ms_type.
df.loc[df["ms_type"].isin(["CIS", "RRMS"]), "dz_type5"] = "RMS"
df.loc[df["ms_type"].isin(["PPMS", "SPMS", "RPMS", "PRMS"]), "dz_type5"] = "PMS"
# df.loc[(df['dz_type5'] == "NIND") & (df['subtype'].map(check_nind2)), "dz_type5"] = "HC"
print(df["dz_type5"].unique())

['RMS' 'PMS' 'NIND' 'UNK' 'HC' 'OIND' 'RIS']


In [9]:
def mean_sd(df, column, cond=None):
    """Return ``(mean, std)`` of ``column`` over the rows selected by ``cond``.

    Parameters
    ----------
    df : pandas.DataFrame
    column : label of the column to summarise.
    cond : optional row selector handed to ``df.loc``: a boolean mask, or a
        callable that takes ``df`` and returns one. When omitted, a mask built
        from the frame's own index is used, i.e. no filtering is applied.

    Returns
    -------
    tuple
        ``(selected.mean(), selected.std())``.
    """
    row_selector = df.index.isin(df.index) if cond is None else cond
    selected = df.loc[row_selector, column]
    return selected.mean(), selected.std()

In [27]:
def percent_cat(df, column, cat, cond=None):
    """Return the fraction of selected rows whose ``column`` equals ``cat``.

    Despite the name, the result is a fraction in ``[0, 1]``, not a percentage.
    The denominator is the number of selected rows, including rows where
    ``column`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
    column : label of the column to inspect.
    cat : value to count.
    cond : optional row selector: a callable that takes ``df`` and returns a
        boolean mask, or a boolean mask (Series / array) itself. When omitted,
        every row is selected.

    Returns
    -------
    float
        ``nan`` when no rows are selected.
    """
    if cond is None:
        # Same all-rows default mask that mean_sd uses.
        mask = df.index.isin(df.index)
    elif callable(cond):
        mask = cond(df)
    else:
        mask = cond

    n_selected = mask.sum()
    if n_selected == 0:
        # Empty group: report "undefined" rather than dividing by zero.
        return float("nan")

    n_matching = (mask & (df[column] == cat)).sum()
    return n_matching / n_selected

In [19]:
def rms_cond(df):
    """Boolean mask of the rows whose ``dz_type5`` is ``"RMS"``."""
    return df["dz_type5"].eq("RMS")

def pms_cond(df):
    """Boolean mask of the rows whose ``dz_type5`` is ``"PMS"``."""
    return df["dz_type5"].eq("PMS")

def oind_cond(df):
    """Boolean mask of the rows whose ``dz_type5`` is ``"OIND"``."""
    return df["dz_type5"].eq("OIND")

def nind_cond(df):
    """Boolean mask of the rows whose ``dz_type5`` is ``"NIND"``."""
    return df["dz_type5"].eq("NIND")

In [46]:
# Per-group descriptive statistics, printed one section per variable.
# Row labels are padded so the values line up under each other.
summary_groups = [
    ("RMS:  ", rms_cond),
    ("PMS:  ", pms_cond),
    ("NIND: ", nind_cond),
    ("OIND: ", oind_cond),
]

# (section header, column, category). A category of None means the section
# reports mean ± SD; otherwise it reports the fraction of rows equal to it.
summary_sections = [
    ("Age\n---", "age", None),
    ("Sex, Female\n-----------", "sex", "Female"),
    ("EDSS\n----", "extracted_EDSS", None),
    ("TIV\n----", "tiv", None),
    ("lesion_vol_cubic\n----------------", "lesion_vol_cubic", None),
    ("PRL\n----------------", "PRL", None),
]

for section_idx, (header, column, category) in enumerate(summary_sections):
    if section_idx > 0:
        # Blank separator between sections (none after the last one).
        print("\n")
    print(header)
    for label, group_cond in summary_groups:
        if category is None:
            row = "{}{:0.2f} ± {:0.2f}".format(
                label, *mean_sd(df, column, cond=group_cond)
            )
        else:
            row = "{}{:0.2f}".format(
                label, percent_cat(df, column, category, cond=group_cond)
            )
        print(row)

Age
---
RMS:  44.64 ± 11.94
PMS:  58.24 ± 9.35
NIND: 48.93 ± 11.78
OIND: 51.60 ± 11.60


Sex, Female
-----------
RMS:  0.84
PMS:  0.59
NIND: 0.82
OIND: 0.80


EDSS
----
RMS:  2.16 ± 1.48
PMS:  4.89 ± 1.92
NIND: 2.50 ± 1.75
OIND: 2.57 ± 1.71


TIV
----
RMS:  1493321.35 ± 130080.31
PMS:  1506403.56 ± 132494.95
NIND: 1536062.90 ± 151071.44
OIND: 1512073.80 ± 139567.19


lesion_vol_cubic
----------------
RMS:  1.57 ± 0.93
PMS:  2.02 ± 0.93
NIND: 1.36 ± 0.81
OIND: 1.19 ± 0.52


PRL
----------------
RMS:  0.96 ± 1.91
PMS:  0.65 ± 1.24
NIND: 0.02 ± 0.14
OIND: 0.05 ± 0.31
