In [1]:
import numpy as np
from collections import Counter
from scipy.stats import entropy
import os
from itertools import combinations, product
import pandas as pd
from tslearn.metrics import dtw
from myf import calculate_f_jk, calculate_g_x
from joblib import Parallel, delayed
import pickle
from scipy import stats
from tableone import TableOne

In [2]:
# `metrics` can carry extra per-file columns, e.g. entropy or average
# intra-cluster distance.
def create_features_df(file_list, metrics=None):
    """Decode filename-embedded attributes into a DataFrame, one row per name.

    Column values are single characters taken straight from each name:

    * ``trial``     -- first character of the name
    * ``synthetic`` -- third character of the name
    * ``sex``, ``age``, ``treatment``, ``tir`` -- characters 0..3 of the
      fifth ``"_"``-separated field of the name

    Parameters
    ----------
    file_list : sequence of str
        File names to decode.
    metrics : optional
        Extra columns, in any form accepted by ``dict.update``. They are
        merged after the filename-derived columns, so a key that matches one
        of those columns replaces it.

    Returns
    -------
    pandas.DataFrame
        The six filename-derived columns followed by any ``metrics`` columns.

    Raises
    ------
    IndexError
        If a name is shorter than three characters, has fewer than five
        ``"_"``-separated fields, or its fifth field has fewer than four
        characters.
    """
    # Split each name once; the fifth field holds one character per attribute.
    attribute_codes = [name.split("_")[4] for name in file_list]

    table = {
        "trial": [name[0] for name in file_list],
        "synthetic": [name[2] for name in file_list],
    }
    for position, attribute in enumerate(("sex", "age", "treatment", "tir")):
        table[attribute] = [code[position] for code in attribute_codes]

    if metrics is not None:
        table.update(metrics)

    return pd.DataFrame(table)

In [5]:
# Load the file lists for single and multi.
# os.listdir returns entries in arbitrary order and also reports hidden
# entries (e.g. ".ipynb_checkpoints", ".DS_Store"). Sort so the row order is
# reproducible across runs, and drop dot-entries, which would make the
# split("_")[4] filename parsing in create_features_df raise IndexError.
single_folder = "final_single_test"
single_file_list = sorted(
    name for name in os.listdir(single_folder) if not name.startswith(".")
)
multi_folder = "final_multi_test"
multi_file_list = sorted(
    name for name in os.listdir(multi_folder) if not name.startswith(".")
)

In [6]:
# Decode the filename-embedded attributes of each file list into a DataFrame.
single_df = create_features_df(single_file_list)
multi_df = create_features_df(multi_file_list)

In [7]:
# Add a combined "<trial>_<synthetic>" key column to both frames.
single_df["trial_syn"] = single_df["trial"].str.cat(single_df["synthetic"], sep="_")
multi_df["trial_syn"] = multi_df["trial"].str.cat(multi_df["synthetic"], sep="_")

In [16]:
def features_table(df):
    """Build a TableOne summary of ``df`` stratified by ``"trial_syn"``.

    The selection is positional: the first two columns of ``df`` are left
    out of the summary, and every summarised column except the last one is
    declared categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Must contain a ``"trial_syn"`` column, which is
        used as the grouping variable.

    Returns
    -------
    TableOne
        The table, built with label suffixes, p-values, and the dip,
        normality and Tukey test flags switched on.
    """
    # Everything from the third column onwards goes into the table ...
    summary_columns = df.columns[2:].tolist()
    # ... and all of those but the last are treated as categorical.
    categorical_columns = summary_columns[:-1]

    options = {
        "columns": summary_columns,
        "categorical": categorical_columns,
        # categorical variable used for stratification
        "groupby": "trial_syn",
        "label_suffix": True,
        "pval": True,
        "dip_test": True,
        "normal_test": True,
        "tukey_test": True,
    }
    return TableOne(df, **options)

In [19]:
# Stack the single and multi frames row-wise, renumbering the index from 0.
concat_df = pd.concat([single_df, multi_df], axis="index", ignore_index=True)

In [20]:
# Summarise the combined frame, grouped by trial_syn.
features_combined = features_table(df=concat_df)

In [21]:
# Bare last expression so the notebook renders the combined summary table.
features_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by trial_syn,Grouped by trial_syn,Grouped by trial_syn,Grouped by trial_syn,Grouped by trial_syn,Grouped by trial_syn,Grouped by trial_syn,Grouped by trial_syn,Grouped by trial_syn,Grouped by trial_syn,Grouped by trial_syn
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,a_0,b_0,g_0,l_0,s_0,s_1,t_0,w_0,P-Value
n,,,40067,6173,2101,22192,2736,94,3296,21,3454,
"sex, n (%)",1.0,,20956 (52.3),3097 (50.2),1118 (53.2),11360 (51.2),1104 (40.4),66 (70.2),2261 (68.6),11 (52.4),1939 (56.1),<0.001
"sex, n (%)",2.0,,19111 (47.7),3076 (49.8),983 (46.8),10832 (48.8),1632 (59.6),28 (29.8),1035 (31.4),10 (47.6),1515 (43.9),
"age, n (%)",0.0,,4040 (10.1),0 (0.0),0 (0.0),0 (0.0),170 (6.2),10 (10.6),405 (12.3),1 (4.8),3454 (100.0),<0.001
"age, n (%)",1.0,,4345 (10.8),0 (0.0),352 (16.8),2333 (10.5),801 (29.3),22 (23.4),827 (25.1),10 (47.6),0 (0.0),
"age, n (%)",2.0,,29403 (73.4),5519 (89.4),1708 (81.3),18563 (83.6),1690 (61.8),56 (59.6),1857 (56.3),10 (47.6),0 (0.0),
"age, n (%)",3.0,,2279 (5.7),654 (10.6),41 (2.0),1296 (5.8),75 (2.7),6 (6.4),207 (6.3),0 (0.0),0 (0.0),
"treatment, n (%)",0.0,,676 (1.7),0 (0.0),0 (0.0),0 (0.0),369 (13.5),0 (0.0),0 (0.0),3 (14.3),304 (8.8),<0.001
"treatment, n (%)",1.0,,6442 (16.1),6173 (100.0),0 (0.0),0 (0.0),25 (0.9),0 (0.0),0 (0.0),18 (85.7),226 (6.5),
"treatment, n (%)",2.0,,786 (2.0),0 (0.0),529 (25.2),0 (0.0),0 (0.0),0 (0.0),0 (0.0),0 (0.0),257 (7.4),
