In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from pandas.api.types import CategoricalDtype
from tableone import TableOne

import funcs.amyloid as amyloid
import funcs.plotting as plot
import funcs.utils as utils


In [2]:
# Directory (relative to the working directory) that holds the processed TSV
# inputs read with pd.read_csv in the data-loading cell.
PROCESSED_DIR = "data/processed"

### 1. Load Data
---

In [8]:
# Raw Data: read the clustered cohort table and rename its columns through
# amyloid.ddict_unclean.
cohort_path = os.path.join(PROCESSED_DIR, "AL_with_ccp_03.tsv")
data_df = pd.read_csv(cohort_path, sep="\t", index_col=0)
data_df = data_df.rename(columns=amyloid.ddict_unclean)

# Copy the cluster assignments to shorter column names and keep only the rows
# that have a non-missing `cluster` value.
data_df = data_df.assign(
    cluster=data_df["fna3_cluster_n"],
    cluster4=data_df["m02q4_cluster_n"],
).dropna(subset=["cluster"])

# Fix Dates: parse every column listed in amyloid.dates as datetime. The
# parsed columns come first in the result, followed by all remaining columns.
parsed_dates = pd.concat(
    [pd.to_datetime(data_df[name], format="mixed") for name in amyloid.dates],
    axis=1,
    keys=amyloid.dates,
)
data_df = parsed_dates.join(data_df.drop(columns=amyloid.dates))

# Not imputed
X = pd.read_csv(os.path.join(PROCESSED_DIR, "AL_for_ccp_02.tsv"), sep="\t", index_col=0)
X = X.rename(columns=amyloid.ddict_unclean)

In [9]:
# Order cluster subtypes. Values that are not one of the listed categories
# become NaN after the cast.
data_df["cluster"] = data_df["cluster"].astype(
    CategoricalDtype(categories=["Low", "Intermediate", "High"], ordered=True)
)
data_df["cluster4"] = data_df["cluster4"].astype(
    CategoricalDtype(
        categories=["Low", "Low-Intermediate", "Intermediate", "High"],
        ordered=True,
    )
)

# Sex: "female" -> "F", every other non-missing value -> "M".
# Accepting the already-recoded "F" keeps this cell safe to re-run (otherwise
# a second execution would relabel every patient as "M"), and missing values
# stay missing instead of being counted as "M".
sex_raw = data_df["Sex"]
data_df["Sex"] = (
    sex_raw.isin(["female", "F"])
    .map({True: "F", False: "M"})
    .where(sex_raw.notna())
)

# Collapse Race: fold the listed categories into a single "Other" level.
collapsed_races = ['American_Indian_Alaska_Native', 'Multiracial', 'Native_Hawaiian_Pacific', 'Unknown/other']
data_df["Race"] = data_df["Race"].mask(data_df["Race"].isin(collapsed_races), "Other")

# Organ-involvement / symptom columns -> boolean flags ("involved"/"yes" are
# True, everything else is False).
# NOTE(review): missing values therefore count as False — confirm intended.
for flag_col in amyloid.amyloid_ros + amyloid.amyloid_symptoms:
    # Columns that are already boolean are skipped: re-applying the test to
    # True/False values would turn every flag into False on a re-run.
    if not pd.api.types.is_bool_dtype(data_df[flag_col]):
        data_df[flag_col] = data_df[flag_col].isin(["involved", "yes"])

In [10]:
# Shorter display names, passed as `rename=` when building a summary table.
rename = {
    "Bone marrow plasma cells (%)":"BMPC (%)", 
    "Kappa or lambda PCD":"LC Isotype",
    "BU (BNP-based) cardiac staging":"BU Stage (2019)",
    "time":"OS (yr)"}

# Short stage labels. A value without an entry (a missing value, or a label
# that is already in short form) passes through unchanged. The previous
# dict-subscript lookup raised KeyError in those cases, e.g. whenever this
# cell was executed a second time. Missing values now stay NaN rather than
# being converted to None; pandas treats both as missing.
renal_stage_short = {"Stage I": "I", "Stage II": "II", "Stage III": "III"}
cardiac_stage_short = {"stage I": "I", "stage II": "II", "stage III": "III", "stage IIIb": "IIIb"}

data_df["Renal Stage (Palladini)"] = data_df["Renal Stage (Palladini)"].map(
    lambda stage: renal_stage_short.get(stage, stage)
)
data_df["BU (BNP-based) cardiac staging"] = data_df["BU (BNP-based) cardiac staging"].map(
    lambda stage: cardiac_stage_short.get(stage, stage)
)


In [15]:
# Median overall survival from utils.get_median_os: whole cohort, then split
# by cluster, then split by Era. The tuple is the cell's displayed output.
(
    utils.get_median_os(data_df, duration="time"),
    utils.get_median_os(data_df, groupby="cluster", duration="time"),
    utils.get_median_os(data_df, groupby="Era", duration="time"),
)

(3.989048596851472,
 {'High': 1.2375085557837098,
  'Intermediate': 3.7207392197125255,
  'Low': 6.132785763175907},
 {'Era_1-2': 2.97056810403833,
  'Era_3': 3.3894592744695413,
  'Era_4': 5.938398357289528})

### 2. Abstract Table
---

In [20]:
# # Table for abstract
# columns = ["OS (yr)","LC Isotype","dFLC","BNP","Troponin","eGFR","24-hr UTP","Albumin","BU Stage (2019)"]
# categorical = ["LC Isotype","BU Stage (2019)",] 
# groupby = ["cluster"]
# nonnormal = ["OS (yr)","dFLC","BNP", "Troponin","eGFR","24-hr UTP"]

# mytable = TableOne(data_df, columns, categorical, groupby, nonnormal, pval=True, rename=rename)
# mytable.to_html("onetable.html")
# print(mytable.tabulate(tablefmt="github"))

### 3. Table 1
---

In [21]:
# Variables summarised in Table 1, stratified by cluster.
columns = (
    ["Age", "Sex", "Race", "Kappa or lambda PCD", "Primary organ", "cluster"]
    + amyloid.qvars
    + ["eGFR"]
    + amyloid.amyloid_symptoms
)

categorical = ["Sex", "Race", "Kappa or lambda PCD", "Primary organ"] + amyloid.amyloid_symptoms
groupby = ["cluster"]
# No variable is declared non-normal, so none is reported as median [IQR].
nonnormal = []

# Everything after the data frame is passed by keyword so the call does not
# depend on the positional order of TableOne's parameters.
mytable = TableOne(
    data_df,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    nonnormal=nonnormal,
    pval=True,
    overall=True,
    decimals={'WBC': 2, 'Hemoglobin': 2, 'Troponin': 3, 'Calcium': 2,
              'Bone marrow plasma cells (%)': 2, 'Uric acid': 2, 'Albumin': 2, 'kappa:lambda ratio': 2},
    rename=amyloid.tableone_names,
)

print(mytable.tabulate(tablefmt="github"))

# Create the output folder first so the export works from a fresh checkout.
os.makedirs("tables", exist_ok=True)
mytable.to_excel('tables/Table1.xlsx')

  self._groupbylvls = sorted(data.groupby(groupby).groups.keys())  # type: ignore


|                                  |             | Missing   | Overall         | Low             | Intermediate    | High             | P-Value   |
|----------------------------------|-------------|-----------|-----------------|-----------------|-----------------|------------------|-----------|
| n                                |             |           | 2067            | 914             | 757             | 396              |           |
| Age, mean (SD)                   |             | 0         | 60.9 (10.4)     | 61.6 (9.9)      | 60.4 (11.2)     | 60.1 (10.2)      | 0.011     |
| Sex, n (%)                       | F           | 0         | 806 (39.0)      | 351 (38.4)      | 312 (41.2)      | 143 (36.1)       | 0.214     |
|                                  | M           |           | 1261 (61.0)     | 563 (61.6)      | 445 (58.8)      | 253 (63.9)       |           |
| Race, n (%)                      | Asian       | 0         | 47 (2.3)        | 23 (2.5)        | 14 (1.8)     

  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,
  df_cont = pd.pivot_table(cont_data,


### 4. Table 2
---

In [19]:
# Cardiac and renal staging distributions, stratified by cluster.
columns = ["BU (BNP-based) cardiac staging", "Renal Stage (Palladini)", "cluster"]

categorical = ["BU (BNP-based) cardiac staging", "Renal Stage (Palladini)"]
groupby = ["cluster"]
nonnormal = []

# Everything after the data frame is passed by keyword so the call does not
# depend on the positional order of TableOne's parameters.
mytable = TableOne(
    data_df,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    nonnormal=nonnormal,
    pval=True,
    overall=True,
    rename={
        "BU (BNP-based) cardiac staging": "Cardiac Stage (BU 2019)",
        "Renal Stage (Palladini)": "Renal Stage (Palladini)",
    },
)

print(mytable.tabulate(tablefmt="github"))

# Create the output folder first so the export works from a fresh checkout.
# NOTE(review): the section heading calls this Table 2 but the file is named
# Table1B — confirm which name is intended.
os.makedirs("tables", exist_ok=True)
mytable.to_excel('tables/Table1B.xlsx')

|                                |      | Missing   | Overall    | Low        | Intermediate   | High       | P-Value   |
|--------------------------------|------|-----------|------------|------------|----------------|------------|-----------|
| n                              |      |           | 2067       | 914        | 757            | 396        |           |
| Cardiac Stage (BU 2019), n (%) | I    | 987       | 305 (28.2) | 200 (36.4) | 99 (31.2)      | 6 (2.8)    | <0.001    |
|                                | II   |           | 458 (42.4) | 254 (46.3) | 131 (41.3)     | 73 (34.1)  |           |
|                                | III  |           | 151 (14.0) | 50 (9.1)   | 58 (18.3)      | 43 (20.1)  |           |
|                                | IIIb |           | 166 (15.4) | 45 (8.2)   | 29 (9.1)       | 92 (43.0)  |           |
| Renal Stage (Palladini), n (%) | I    | 264       | 788 (43.7) | 418 (53.3) | 136 (19.3)     | 234 (74.3) | <0.001    |
|                       

  self._groupbylvls = sorted(data.groupby(groupby).groups.keys())  # type: ignore
