In [None]:
import os.path, pickle, math
import numpy as np
import scipy as sp
import pandas as pd
from constants import PROCESSED_PATH, RAW_PATH
from runTraditionalModels import runTraditionalModels
from runAutoML import runAutoML, runTPot
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
from sklearn.utils import shuffle, resample
from sklearn.preprocessing import normalize, OneHotEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split

In [None]:
# File name of the processed CSV to analyse; it is joined onto PROCESSED_PATH when loaded.
datafn = 'HOUR_00003.csv'

In [None]:
# Load the processed table; cells that are exactly '?' or '!' are parsed as missing.
csv_path = os.path.join(PROCESSED_PATH, datafn)
df = pd.read_csv(csv_path, na_values=['?', '!'])
# Any remaining string cell matching the regex '!.+' is also turned into NaN.
df = df.replace(to_replace='!.+', value=np.nan, regex=True)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Identifier, demographic and stage columns keep their parsed dtype; every other
# column is cast to float64 so that NaN checks and numeric statistics work on it.
non_feature_columns = {'SUBJECT_ID', 'HADM_ID', 'AGE', 'GENDER', 'ETHNICITY','P TSTAGE','P STAGE','TSTAGE','STAGE'}
check_for_nan_columns = set(df.columns) - non_feature_columns
# DataFrame.astype has no `inplace` argument (passing one raises TypeError on
# current pandas); the converted frame is simply rebound to `df`.
df = df.astype({k: np.float64 for k in check_for_nan_columns})

In [None]:
# Keep only rows that have at least one non-NaN value in the feature cells
# (column positions 5 up to, but not including, the last column).
feature_cells = df.iloc[:, 5:-1]
row_nan_bool = ~np.isnan(feature_cells).all(axis=1)
df = df.loc[row_nan_bool]

In [None]:
# Order rows by subject, then by admission.
df = df.sort_values(by=['SUBJECT_ID', 'HADM_ID'])

In [None]:
# Path (under RAW_PATH) of the pickle file holding the ID split.
ids_fn = os.path.join(RAW_PATH, 'd_ids_split.pickle')

In [None]:
# Load the ID split. The file is opened in a context manager so the handle is
# closed deterministically instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code -- only load split files
# that come from a trusted source.
with open(ids_fn, 'rb') as ids_file:
    split_ids = pickle.load(ids_file)

In [None]:
# Build one frame per dataset named in split_ids. Column 0 of each ID array is
# matched against SUBJECT_ID and column 1 against HADM_ID.
# NOTE(review): the two membership tests are independent (not a pairwise
# (SUBJECT_ID, HADM_ID) match) -- confirm this is intended.
split_df = {}
for dataset in split_ids:
    ids = split_ids[dataset]
    in_subjects = df['SUBJECT_ID'].isin(ids[:, 0])
    in_admissions = df['HADM_ID'].isin(ids[:, 1])
    # Explicit copy: these frames are modified in place further down, which would
    # otherwise raise SettingWithCopyWarning on a mask-derived selection of df.
    split_df[dataset] = df[in_subjects & in_admissions].copy()
# `devel` is the same object as split_df['devel'], not a second copy.
devel = split_df['devel']

In [None]:
# Report the size of every split next to the size of the full frame.
for name, frame in split_df.items():
    print(name, frame.shape)
print('total', df.shape)

In [None]:
# Remove columns of `devel` in which every row is NaN.
nan_counts = devel.isna().sum()
fully_missing = nan_counts[nan_counts == len(devel)].index
# Done in place so that split_df['devel'] (the same object) sees the change too.
devel.drop(columns=fully_missing, inplace=True)

In [None]:
# Preview `devel` after the all-NaN columns were removed.
devel.head()

In [None]:
# Move the column at position 4 in front of the columns at positions 2 and 3;
# all columns from position 5 onwards keep their order.
leading_positions = [0, 1, 4, 2, 3]
remaining_positions = list(range(5, len(devel.columns)))
reordered_labels = devel.columns[leading_positions + remaining_positions]
devel = devel[reordered_labels]

In [None]:
# Preview `devel` with the reordered columns.
devel.head()

In [None]:
# Candidate feature columns: everything from position 3 up to (not including)
# the last two columns of `devel`. These are the columns tested per class below.
data3 = devel.iloc[:,3:-2]
data3.head()

In [None]:
# Kruskal-Wallis H-test per candidate feature: the null hypothesis is that the
# feature has the same distribution in every STAGE class (0-3), so a SMALL
# p-value means the feature differs between classes and is worth keeping.
KRUSKAL_ALPHA = 0.05
dfs_by_class = [data3.loc[devel['STAGE'] == c] for c in [0,1,2,3]]
kruskals = {}
for col in data3.columns:
    # NaNs are dropped per class so the test only sees observed values.
    col_in_classes = [np.asarray(c[col].dropna()) for c in dfs_by_class]
    try:
        kruskals[col] = sp.stats.kruskal(*col_in_classes)[1]
    except ValueError:
        # The test could not be computed for this column; record p = 1.0 so the
        # feature is treated as non-discriminative and left out.
        kruskals[col] = 1.0

# Keep the three leading columns, the features that differ significantly across
# classes, and the two trailing columns.
# NOTE(review): the previous filter kept p > 0.05, i.e. the features that do NOT
# differ between classes; this changes the selected feature set -- confirm.
significant_features = [k for k, v in kruskals.items() if v < KRUSKAL_ALPHA]
devel_kruskal = devel[list(devel.columns[:3]) + significant_features + list(devel.columns[-2:])]

In [None]:
# Rows x columns remaining after the Kruskal-Wallis selection.
devel_kruskal.shape

In [None]:
# Column means of the development set, used to impute missing values here and
# reused later for the test/valid splits.
# numeric_only=True: non-numeric columns would make mean() raise on current
# pandas (older versions silently skipped them, which is the same result).
means = devel_kruskal.mean(numeric_only=True)
# Assign instead of mutating in place: devel_kruskal is derived from `devel`
# and in-place fillna on it triggers SettingWithCopyWarning.
devel_kruskal = devel_kruskal.fillna(means)
devel_kruskal.head()

In [None]:
# Iteratively prune multicollinear features: compute the variance inflation
# factor (VIF) of each candidate column, drop the one with the largest VIF if it
# is at or above VIF_THRESHOLD, and repeat until no candidate reaches it.
VIF_THRESHOLD = 5
features = devel_kruskal[devel_kruskal.columns[3:-2]]
print(features.columns)
done = False
while not done:
    # Converted once per pass; it must be rebuilt because `features` shrinks.
    design = np.asarray(features)
    # Only columns at position >= 3 of `features` are candidates for removal.
    # NOTE(review): the first three feature columns are never scored or dropped;
    # kept as in the original -- confirm this exemption is intended.
    vifs = {
        name: variance_inflation_factor(design, idx)
        for idx, name in enumerate(features.columns)
        if idx >= 3
    }

    # Highest VIF first; each item is (column name, VIF).
    drop_items = sorted(vifs.items(), reverse=True, key=lambda kv: kv[1])
    if drop_items and drop_items[0][1] >= VIF_THRESHOLD:
        # Drop by column NAME (element 0 of the pair). The previous code referenced
        # an undefined variable and indexed the VIF value instead of the name.
        worst_feature = drop_items[0][0]
        features = features.drop(columns=[worst_feature])
    else:
        done = True

In [None]:
# Reassemble the development frame: the three leading columns, the features
# that survived VIF pruning, then the two trailing columns.
leading_cols = list(devel_kruskal.columns[:3])
trailing_cols = list(devel_kruskal.columns[-2:])
kept_features = list(features.columns)
devel_vif = devel_kruskal[leading_cols + kept_features + trailing_cols]
devel_vif.head()

In [None]:
# Fraction of development rows that fall into each STAGE class.
n_devel_rows = len(devel_vif)
devel_dist = {
    s: len(devel_vif[devel_vif['STAGE'] == s]) / n_devel_rows
    for s in [0,1,2,3]
}
devel_dist

In [None]:
# Per-class row counts for the test and valid splits.
counts = {
    subset: {
        s: split_df[subset][split_df[subset]['STAGE'] == s].shape[0]
        for s in [0,1,2,3]
    }
    for subset in ['test', 'valid']
}

# Same counts expressed as a fraction of each split's rows.
dists = {
    subset: {
        s: counts[subset][s] / split_df[subset].shape[0]
        for s in [0,1,2,3]
    }
    for subset in ['test', 'valid']
}

# Ratio of each split's class fraction to the development-set class fraction.
ratios = {}
for subset, class_dist in dists.items():
    print(subset, class_dist)
    ratios[subset] = {c: class_dist[c]/devel_dist[c] for c in devel_dist}
    print('ratio', ratios[subset])

In [None]:
# Impute the held-out splits with the DEVELOPMENT-set means, then restrict them
# to the columns selected on the development set.
# Assignment replaces the former in-place fillna, which mutated a mask-derived
# selection of `df` (SettingWithCopyWarning) and was duplicated per split.
for subset in ['test', 'valid']:
    imputed = split_df[subset].fillna(means)
    split_df[subset] = imputed[devel_vif.columns]

In [None]:
# For every class that is under-represented in a split relative to the
# development set (ratio < 1), work out how many synthetic rows to add.
synthesize = {subset: {} for subset in ['test', 'valid']}
for subset, class_ratios in ratios.items():
    for c, ratio in class_ratios.items():
        if ratio < 1:
            existing = counts[subset][c]
            num_syn = math.ceil((1 - ratio) * existing)
            print(subset, c, existing, num_syn)
            synthesize[subset][c] = num_syn
synthesize

In [None]:
# Preview the imputed, column-restricted test split.
split_df['test'].head()


In [None]:
def mode_or_mean(x):
    """Collapse one column of sampled rows into a single value.

    The rule is chosen from the column name (``x.name``):

    * ``SUBJECT_ID`` / ``HADM_ID``: the fixed placeholder ``99999``.
    * ``ETHNICITY`` / ``TSTAGE`` / ``STAGE``: a most-frequent value; when several
      values tie, one of them is picked at random with ``np.random.choice``.
    * any other column: the arithmetic mean of the column.

    Args:
        x: a pandas Series (one column); its ``name`` selects the rule.

    Returns:
        The single aggregated value for that column.
    """
    column = x.name
    if column in ("SUBJECT_ID", "HADM_ID"):
        return 99999
    if column in ("ETHNICITY", "TSTAGE", "STAGE"):
        most_frequent = x.mode().dropna()
        return np.random.choice(most_frequent)
    return x.mean()

In [None]:
# Add synthetic rows to under-represented classes: each synthetic row is built by
# bootstrap-sampling n_samples rows of that class and collapsing every column
# with mode_or_mean.
n_samples = 5
for subset in synthesize:
    for c in synthesize[subset]:
        for _ in range(synthesize[subset][c]):
            # The pool is re-read every iteration, so it also contains synthetic
            # rows appended earlier in this loop (unchanged from the original).
            class_rows = split_df[subset][split_df[subset]['STAGE'] == c]
            samples = resample(class_rows, n_samples=n_samples)
            synth = samples.apply(mode_or_mean)
            # DataFrame.append was removed in pandas 2.0. Turn the aggregated
            # Series into a one-row frame, restore proper dtypes (the transpose
            # is object-typed), and concatenate instead.
            synth_row = synth.to_frame().T.infer_objects()
            split_df[subset] = pd.concat([split_df[subset], synth_row], ignore_index=True)
        print(subset, split_df[subset].shape)


In [None]:
# Class sizes after synthesis, for the classes that received synthetic rows;
# `counts` (displayed last) holds the sizes from before synthesis.
for subset, per_class in synthesize.items():
    for c in per_class:
        class_rows = split_df[subset][split_df[subset]['STAGE'] == c]
        print(subset, c, class_rows.shape[0])
counts

In [None]:
test = split_df['test']
valid = split_df['valid']

# Identifier / categorical columns that are not model inputs.
non_model_columns = ['SUBJECT_ID','HADM_ID','ETHNICITY','TSTAGE']
train = devel_vif.drop(non_model_columns, axis=1)
testv = test.drop(non_model_columns, axis=1)

# After the drop only the LAST column is a non-feature (the label), so the
# features are every column except the last one. The previous `[:, :-2]` slice
# also discarded the final feature column.
# NOTE(review): assumes devel_vif ends with TSTAGE, STAGE so that STAGE is the
# last remaining column -- confirm against the CSV layout.
x_train = train.values[:, :-1]
y_train = train.values[:, -1]
# normalize(axis=0) rescales each feature column to unit norm.
# NOTE(review): train and test are normalized independently, so the same raw
# value maps to different scaled values in each split -- consider fitting a
# scaler on train only.
x_train = normalize(x_train, axis=0)
ohe = LabelBinarizer()
ohe.fit(y_train.reshape(-1, 1))

x_test = testv.values[:, :-1]
x_test = normalize(x_test, axis=0)
y_test = testv.values[:, -1]

# Binarized labels using the classes seen in the training labels.
ohe_y_train = ohe.transform(y_train.reshape(-1,1))
ohe_y_test = ohe.transform(y_test.reshape(-1,1))
print(ohe_y_train.shape)
print(ohe_y_test.shape)

In [None]:
# runTraditionalModels(x_train, ohe_y_train, x_test, ohe_y_test, datafn)

In [None]:
# Run the project-local runAutoML helper on the train/test frames (features plus
# the label column) and print whatever it returns.
# NOTE(review): the meaning of the returned value is defined in runAutoML.py.
aMLScore = runAutoML(train, testv)
print(aMLScore)

In [None]:
# Run the project-local runTPot helper on the normalized feature arrays and the
# raw (non-binarized) labels; see runAutoML.py for what it fits and reports.
runTPot(x_train, x_test, y_train, y_test)