In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import funcs.utils as utils
import funcs.plotting as plot
import funcs.amyloid as amyloid
import scipy
from tableone import TableOne


In [2]:
# Directory containing the processed TSV inputs; the loading cell joins it with file names via os.path.join.
PROCESSED_DIR = "data/processed"

In [90]:
# --- Clustered cohort --------------------------------------------------------
# Read the processed table, rename columns through `amyloid.ddict_unclean`,
# and keep only the rows that carry a value in "cluster".
clustered_path = os.path.join(PROCESSED_DIR, "AL_with_ccp_03.tsv")
data_df = (
    pd.read_csv(clustered_path, sep="\t", index_col=0)
    .rename(columns=amyloid.ddict_unclean)
    .dropna(subset="cluster")
)

# --- Date parsing ------------------------------------------------------------
# Convert each column named in `amyloid.dates` to datetime. The parsed columns
# are assembled first and the remaining columns are joined on afterwards, so
# the date columns end up at the front of the frame.
date_cols = amyloid.dates
parsed_dates = pd.concat(
    [pd.to_datetime(data_df[name], format="mixed") for name in date_cols],
    axis=1,
    keys=date_cols,
)
data_df = parsed_dates.join(data_df.drop(columns=date_cols))

# --- Feature matrix before imputation ----------------------------------------
X = pd.read_csv(
    os.path.join(PROCESSED_DIR, "AL_for_ccp_02.tsv"),
    sep="\t",
    index_col=0,
).rename(columns=amyloid.ddict_unclean)

# --- Imputed feature matrices ------------------------------------------------
Xi_median = pd.read_csv(
    "data/imputed/median_qvars_01.tsv", sep="\t", index_col=0
).rename(columns=amyloid.ddict_unclean)

Xi_knn = pd.read_csv(
    "data/imputed/knn_qvars_01.tsv", sep="\t", index_col=0
).rename(columns=amyloid.ddict_unclean)

# NOTE(review): unlike the two files above, this one is read without
# index_col=0 -- confirm the file layout makes that intentional.
# The "X24_hr_UTP" header is renamed to "24_hr_UTP" before the shared
# `amyloid.ddict_unclean` renaming is applied.
Xi_mice = (
    pd.read_csv("data/imputed/mice_qvars_05.tsv", sep="\t")
    .rename(columns={"X24_hr_UTP": "24_hr_UTP"})
    .rename(columns=amyloid.ddict_unclean)
)

In [91]:
# Order cluster subtypes
from pandas.api.types import CategoricalDtype

# Give the cluster labels an explicit Low -> High ordering. Any value that is
# not one of the listed categories becomes NaN under this cast.
data_df["cluster"] = data_df["cluster"].astype(
    CategoricalDtype(categories=["Low", "Intermediate", "High"], ordered=True)
)
data_df["cluster4"] = data_df["cluster4"].astype(
    CategoricalDtype(
        categories=["Low", "Low-Intermediate", "Intermediate", "High"],
        ordered=True,
    )
)

# Sex: recode to "F" / "M".
# * Accepting the already-recoded "F" makes this cell safe to re-run; the
#   previous `"F" if x == "female" else "M"` turned every row into "M" on a
#   second execution.
# * Missing values stay missing instead of being silently counted as "M".
sex_raw = data_df["Sex"]
sex_code = pd.Series(
    np.where(sex_raw.isin(["female", "F"]), "F", "M"),
    index=sex_raw.index,
)
data_df["Sex"] = sex_code.where(sex_raw.notna().to_numpy())

# Collapse Race: fold the listed labels into a single "Other" level and leave
# every other value (including missing values) untouched.
race_labels_to_collapse = [
    "American_Indian_Alaska_Native",
    "Multiracial",
    "Native_Hawaiian_Pacific",
    "Unknown/other",
]
race_raw = data_df["Race"]
data_df["Race"] = race_raw.where(~race_raw.isin(race_labels_to_collapse), "Other")

# Involvement / symptom flags: True when the raw value is "involved" or "yes",
# False for anything else (missing values included).
# Columns that are already boolean are skipped so that re-running the cell does
# not flip every previously computed True to False.
for flag_col in amyloid.amyloid_ros + amyloid.amyloid_symptoms:
    if pd.api.types.is_bool_dtype(data_df[flag_col]):
        continue
    data_df[flag_col] = data_df[flag_col].isin(["involved", "yes"])

In [92]:
# Overall (ungrouped) summary table of baseline characteristics.
columns = [
    "Age",
    "Sex",
    "Race",
    "Kappa or lambda PCD",
    "dFLC",
    "eGFR",
    "Bone marrow plasma cells (%)",
    "Troponin",
    "BNP",
    "BU (BNP-based) cardiac staging",
]
categorical = ["Sex", "Race", "Kappa or lambda PCD", "BU (BNP-based) cardiac staging"]

# Continuous variables to be summarised as non-normal.
# The list previously contained "eGFR" twice and no "BNP"; "BNP" is treated as
# non-normal in the abstract table of this notebook, so the duplicate entry is
# taken to be a typo for it.
# NOTE(review): confirm BNP should be reported as non-normal here.
nonnormal = ["dFLC", "eGFR", "Bone marrow plasma cells (%)", "Troponin", "BNP"]

# Keyword arguments keep the call independent of TableOne's positional
# parameter order.
mytable = TableOne(
    data_df,
    columns=columns,
    categorical=categorical,
    groupby=None,
    nonnormal=nonnormal,
)
print(mytable.tabulate(tablefmt="github"))


|                                              |            | Missing   | Overall           |
|----------------------------------------------|------------|-----------|-------------------|
| n                                            |            |           | 2074              |
| Age, mean (SD)                               |            | 0         | 60.9 (10.4)       |
| Sex, n (%)                                   | F          | 0         | 810 (39.1)        |
|                                              | M          |           | 1264 (60.9)       |
| Race, n (%)                                  | Asian      | 0         | 47 (2.3)          |
|                                              | Black      |           | 172 (8.3)         |
|                                              | Other      |           | 12 (0.6)          |
|                                              | White      |           | 1843 (88.9)       |
| Kappa or lambda PCD, n (%)                   | K          



In [97]:
# Cluster-stratified summary: demographics plus every column of Xi_mice.
# NOTE(review): assumes Xi_mice.columns holds only variables that also exist in
# data_df and none of the five demographic names listed here -- confirm.
columns = ["Age", "Sex", "Race", "Kappa or lambda PCD", "Primary organ"] + list(Xi_mice.columns)
categorical = ["Sex", "Race", "Kappa or lambda PCD", "Primary organ"]

groupby = ["cluster"]

nonnormal = ["Bone marrow plasma cells (%)"]

# Pass groupby / nonnormal by keyword rather than by position: the positional
# slots after `categorical` are not stable across tableone releases
# (NOTE(review): newer versions appear to insert a `continuous` parameter
# there -- confirm against the installed version), and the ungrouped table
# cell in this notebook already uses keywords.
mytable = TableOne(
    data_df,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    nonnormal=nonnormal,
    pval=True,
)
print(mytable.tabulate(tablefmt="github"))

  self._groupbylvls = sorted(data.groupby(groupby).groups.keys())  # type: ignore
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,


|                                              |             | Missing   | Overall         | Low             | Intermediate    | High            | P-Value   |
|----------------------------------------------|-------------|-----------|-----------------|-----------------|-----------------|-----------------|-----------|
| n                                            |             |           | 2074            | 689             | 822             | 563             |           |
| Age, mean (SD)                               |             | 0         | 60.9 (10.4)     | 61.0 (10.0)     | 60.7 (11.0)     | 61.0 (10.2)     | 0.824     |
| Sex, n (%)                                   | F           | 0         | 810 (39.1)      | 276 (40.1)      | 342 (41.6)      | 192 (34.1)      | 0.015     |
|                                              | M           |           | 1264 (60.9)     | 413 (59.9)      | 480 (58.4)      | 371 (65.9)      |           |
| Race, n (%)                                 

In [113]:
# Table for abstract
# Cluster-stratified summary restricted to a hand-picked set of variables,
# written to "onetable.html" in the current working directory.
columns = ["Age", "Sex", "Race", "Kappa or lambda PCD", "Primary organ"] + [
    "dFLC",
    "Bone marrow plasma cells (%)",
    "BNP",
    "Troponin",
    "eGFR",
    "24-hr UTP",
    "Albumin",
    "Uric acid",
    "LDH",
    "Alk phos",
]
categorical = ["Sex", "Race", "Kappa or lambda PCD", "Primary organ"]

groupby = ["cluster"]

# Continuous variables to be summarised as non-normal.
nonnormal = ["dFLC", "Bone marrow plasma cells (%)", "BNP", "Troponin", "eGFR", "24-hr UTP"]

# Pass groupby / nonnormal by keyword rather than by position: the positional
# slots after `categorical` are not stable across tableone releases
# (NOTE(review): newer versions appear to insert a `continuous` parameter
# there -- confirm against the installed version).
mytable = TableOne(
    data_df,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    nonnormal=nonnormal,
    pval=True,
)
mytable.to_html("onetable.html")

  self._groupbylvls = sorted(data.groupby(groupby).groups.keys())  # type: ignore
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,


In [114]:
# Print the TableOne summary currently bound to `mytable`, rendered with the "github" table format.
print(mytable.tabulate(tablefmt="github"))

|                                              |             | Missing   | Overall               | Low                  | Intermediate            | High                 | P-Value   |
|----------------------------------------------|-------------|-----------|-----------------------|----------------------|-------------------------|----------------------|-----------|
| n                                            |             |           | 2074                  | 689                  | 822                     | 563                  |           |
| Age, mean (SD)                               |             | 0         | 60.9 (10.4)           | 61.0 (10.0)          | 60.7 (11.0)             | 61.0 (10.2)          | 0.824     |
| Sex, n (%)                                   | F           | 0         | 810 (39.1)            | 276 (40.1)           | 342 (41.6)              | 192 (34.1)           | 0.015     |
|                                              | M           |           | 1264 (60.9