In [1]:
import os.path, pickle, math
import numpy as np
import scipy as sp
import pandas as pd
from constants import PROCESSED_PATH, RAW_PATH
from runTraditionalModels import runTraditionalModels
from runAutoML import runAutoML, runTPot
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
from sklearn.utils import shuffle, resample
from sklearn.preprocessing import normalize, OneHotEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split

In [3]:
datafn = 'HOUR_00003.csv'

In [4]:
df = pd.read_csv(os.path.join(PROCESSED_PATH, datafn), na_values=['?', '!'])
df.replace('!.+', np.nan, regex=True, inplace=True)

In [5]:
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,GENDER,ETHNICITY,P WEIGHT,P HEIGHT,P SYSTOLIC BP,P DIASTOLIC BP,P TEMPERATURE,...,MACROCYTES,PEEP,ATYPICAL LYMPHOCYTES,METAMYELOCYTES,MYELOCYTES,ANISOCYTOSIS,MICROCYTES,SODIUM.2,TSTAGE,STAGE
0,3,145834,76,1,WHITE,,,,,,...,,,,,,,,,8,2
1,4,185777,47,0,WHITE,53.6,,116.0,63.0,37.444422,...,,,,,,,,,0,0
2,9,150750,41,1,UNKNOWN/NOT SPECIFIED,104.0,182.88,168.0,88.75,35.277789,...,,,,,,,,,37,1
3,11,194540,50,0,WHITE,,,110.5,52.0,37.055553,...,,,,,,,,,0,0
4,13,143045,39,0,WHITE,73.5,144.78,149.0,72.5,37.277789,...,,,,,,,,,0,0


In [6]:
check_for_nan_columns = set(df.columns) - {'SUBJECT_ID', 'HADM_ID', 'AGE', 'GENDER', 'ETHNICITY','P TSTAGE','P STAGE','TSTAGE','STAGE'}
df = df.astype({k: np.float64 for k in check_for_nan_columns}, inplace=True)

In [7]:
# drop rows where all features=nan
row_nan_bool = np.logical_not(np.all(np.isnan(df.iloc[:,5:-1]), axis=1))
df = df[row_nan_bool]

In [8]:
df.sort_values(['SUBJECT_ID', 'HADM_ID'], inplace=True)

In [9]:
ids_fn = os.path.join(RAW_PATH, 'd_ids_split.pickle')

In [10]:
split_ids = pickle.load(open(ids_fn, 'rb'))

In [11]:
split_df = {}
for dataset in split_ids:
    split_df[dataset] = df[(df['SUBJECT_ID'].isin(split_ids[dataset][:,0])) & (df['HADM_ID'].isin(split_ids[dataset][:,1]))]
devel = split_df['devel']

In [12]:
for k in split_df:
    print(k, split_df[k].shape)
print('total', df.shape)

test (5579, 205)
devel (16609, 205)
valid (5492, 205)
total (27680, 205)


In [13]:
#drop columns where all rows=nan
check_nan = devel.isna().sum()
devel.drop(labels=check_nan[(check_nan == devel.shape[0])].keys(), axis=1, inplace=True)

In [14]:
devel.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,GENDER,ETHNICITY,P WEIGHT,P HEIGHT,P SYSTOLIC BP,P DIASTOLIC BP,P TEMPERATURE,...,WEIGHT,HEIGHT,SYSTOLIC BP,DIASTOLIC BP,TEMPERATURE,RESPIRATORY RATE,HEART RATE,SPO2,TSTAGE,STAGE
1,4,185777,47,0,WHITE,53.6,,116.0,63.0,37.444422,...,53.6,,116.0,63.0,37.444422,,105.5,98.0,0,0
2,9,150750,41,1,UNKNOWN/NOT SPECIFIED,104.0,182.88,168.0,88.75,35.277789,...,104.0,182.88,160.75,82.5,36.055578,10.625,87.0,98.8,37,1
4,13,143045,39,0,WHITE,73.5,144.78,149.0,72.5,37.277789,...,73.5,144.78,147.0,72.0,37.277789,15.0,77.0,98.0,0,0
6,18,188822,50,1,WHITE,,,151.4,82.2,37.666683,...,,,152.0,75.0,37.666683,21.0,116.0,98.666667,0,0
7,21,111970,87,1,WHITE,64.942857,175.26,130.144828,51.62069,36.383338,...,64.0,,115.0,32.0,37.611106,14.0,68.0,100.0,8,3


In [15]:
devel = devel[devel.columns[[0,1,4,2,3] + list(range(5,len(devel.columns)))]]

In [16]:
devel.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ETHNICITY,AGE,GENDER,P WEIGHT,P HEIGHT,P SYSTOLIC BP,P DIASTOLIC BP,P TEMPERATURE,...,WEIGHT,HEIGHT,SYSTOLIC BP,DIASTOLIC BP,TEMPERATURE,RESPIRATORY RATE,HEART RATE,SPO2,TSTAGE,STAGE
1,4,185777,WHITE,47,0,53.6,,116.0,63.0,37.444422,...,53.6,,116.0,63.0,37.444422,,105.5,98.0,0,0
2,9,150750,UNKNOWN/NOT SPECIFIED,41,1,104.0,182.88,168.0,88.75,35.277789,...,104.0,182.88,160.75,82.5,36.055578,10.625,87.0,98.8,37,1
4,13,143045,WHITE,39,0,73.5,144.78,149.0,72.5,37.277789,...,73.5,144.78,147.0,72.0,37.277789,15.0,77.0,98.0,0,0
6,18,188822,WHITE,50,1,,,151.4,82.2,37.666683,...,,,152.0,75.0,37.666683,21.0,116.0,98.666667,0,0
7,21,111970,WHITE,87,1,64.942857,175.26,130.144828,51.62069,36.383338,...,64.0,,115.0,32.0,37.611106,14.0,68.0,100.0,8,3


In [17]:
data3 = devel.iloc[:,3:-2]
data3.head()

Unnamed: 0,AGE,GENDER,P WEIGHT,P HEIGHT,P SYSTOLIC BP,P DIASTOLIC BP,P TEMPERATURE,P RESPIRATORY RATE,P HEART RATE,P SPO2,...,P TSTAGE,P STAGE,WEIGHT,HEIGHT,SYSTOLIC BP,DIASTOLIC BP,TEMPERATURE,RESPIRATORY RATE,HEART RATE,SPO2
1,47,0,53.6,,116.0,63.0,37.444422,,105.5,98.0,...,0,0,53.6,,116.0,63.0,37.444422,,105.5,98.0
2,41,1,104.0,182.88,168.0,88.75,35.277789,12.8,84.0,98.625,...,37,1,104.0,182.88,160.75,82.5,36.055578,10.625,87.0,98.8
4,39,0,73.5,144.78,149.0,72.5,37.277789,17.0,78.5,98.0,...,0,0,73.5,144.78,147.0,72.0,37.277789,15.0,77.0,98.0
6,50,1,,,151.4,82.2,37.666683,22.0,104.0,99.2,...,0,0,,,152.0,75.0,37.666683,21.0,116.0,98.666667
7,87,1,64.942857,175.26,130.144828,51.62069,36.383338,19.422535,76.583333,98.887324,...,8,3,64.0,,115.0,32.0,37.611106,14.0,68.0,100.0


In [18]:
# calculate Kruskal-Wallis H-test for each feature
dfs_by_class = [data3.loc[devel['STAGE'] == c] for c in [0,1,2,3]]
kruskals = {}
for col in data3.columns:
    col_in_classes = [np.asarray(c[col].dropna()) for c in dfs_by_class]
    try:
        kruskals[col] = sp.stats.kruskal(*col_in_classes)[1]
    except ValueError:
        kruskals[col] = 0
           
devel_kruskal = devel[list(devel.columns[:3])+[k for k, v in kruskals.items() if v > 0.05]+list(devel.columns[-2:])]

In [19]:
devel_kruskal.shape

(16609, 11)

In [20]:
means = devel_kruskal.mean()
devel_kruskal.fillna(means, inplace=True)
means = devel.mean()
devel.fillna(means, inplace=True)
print(devel_kruskal.head())

   SUBJECT_ID  HADM_ID              ETHNICITY       P PO2  P FREE CALCIUM  \
1           4   185777                  WHITE   99.508842        1.119181   
2           9   150750  UNKNOWN/NOT SPECIFIED   99.508842        1.119181   
4          13   143045                  WHITE   99.508842        1.119181   
6          18   188822                  WHITE   99.508842        1.160000   
7          21   111970                  WHITE  106.090909        1.231667   

   P CREATINE KINASE (CK)  P LIPASE  P LENGTH OF URINE COLLECTION  HEART RATE  \
1               98.652439  268.2714                     23.558282       105.5   
2              112.000000  268.2714                     23.558282        87.0   
4               74.000000  268.2714                     23.558282        77.0   
6              177.000000  268.2714                     23.558282       116.0   
7               98.652439  268.2714                     23.558282        68.0   

   TSTAGE  STAGE  
1       0      0  
2      37   

In [21]:
# calculate VIFs
features = devel_kruskal[devel_kruskal.columns[3:-2]]
print(features.columns)
done = False
while not done:
    vifs = {}
    for i, n in enumerate(features):
        if i in range(3,features.shape[1]):
            vifs[n] = variance_inflation_factor(np.asarray(features), i)
    
    drop_items = sorted(vifs.items(), reverse=True, key=lambda kv: kv[1])
    if len(drop_items) > 0 and drop_items[0][1] >= 5:
        print(drop_items[0])
        features.drop(labels=[drop_items[0][0]], axis=1, inplace=True)
    else:
        print(drop_items)
        done = True

Index(['P PO2', 'P FREE CALCIUM', 'P CREATINE KINASE (CK)', 'P LIPASE',
       'P LENGTH OF URINE COLLECTION', 'HEART RATE'],
      dtype='object')
('P LENGTH OF URINE COLLECTION', 405.7736788081544)
('HEART RATE', 20.317362508111817)
[('P LIPASE', 1.8036056061372452)]


In [22]:
devel_vif = devel_kruskal[list(devel_kruskal.columns[:3]) + list(features.columns) + list(devel_kruskal.columns[-2:])]
devel_vif.head()
# devel = devel_vif

Unnamed: 0,SUBJECT_ID,HADM_ID,ETHNICITY,P PO2,P FREE CALCIUM,P CREATINE KINASE (CK),P LIPASE,TSTAGE,STAGE
1,4,185777,WHITE,99.508842,1.119181,98.652439,268.2714,0,0
2,9,150750,UNKNOWN/NOT SPECIFIED,99.508842,1.119181,112.0,268.2714,37,1
4,13,143045,WHITE,99.508842,1.119181,74.0,268.2714,0,0
6,18,188822,WHITE,99.508842,1.16,177.0,268.2714,0,0
7,21,111970,WHITE,106.090909,1.231667,98.652439,268.2714,8,3


In [23]:
devel_dist = {}
for s in [0,1,2,3]:
    devel_dist[s] = devel[(devel['STAGE'] == s)].shape[0] / devel.shape[0]
devel_dist

{0: 0.7371906797519417,
 1: 0.10614726955265218,
 2: 0.06562706966102716,
 3: 0.09103498103437895}

In [24]:
counts = {subset: dict() for subset in ['test', 'valid']}
for subset in counts:
    for s in [0,1,2,3]:
        counts[subset][s] = split_df[subset][(split_df[subset]['STAGE'] == s)].shape[0]

dists = {subset: dict() for subset in ['test', 'valid']}
for subset in dists:
    for s in [0,1,2,3]:
        dists[subset][s] = counts[subset][s] / split_df[subset].shape[0]
ratios = {}
for subset in dists:
    print(subset, dists[subset])
    ratios[subset] = {c: dists[subset][c]/devel_dist[c] for c in devel_dist}
    print('ratio', ratios[subset])

test {0: 0.7397383043556193, 1: 0.10503674493636853, 2: 0.05968811614984764, 3: 0.09553683455816454}
ratio {0: 1.0034558556878863, 1: 0.9895378880590725, 2: 0.9095045148007517, 3: 1.0494519081855522}
valid {0: 0.7408958485069191, 1: 0.10342316096139839, 2: 0.06900946831755281, 3: 0.08667152221412965}
ratio {0: 1.0050260656526804, 1: 0.9743365175314044, 2: 1.0515396874185639, 3: 0.9520683283429097}


In [25]:
split_df['test'].fillna(means, inplace=True)
split_df['valid'].fillna(means, inplace=True)

split_df['test'] = split_df['test'][devel.columns]
split_df['valid'] = split_df['valid'][devel.columns]

In [26]:
synthesize = {subset: dict() for subset in ['test', 'valid']}
for subset in ratios:
    for c in ratios[subset]:
        if ratios[subset][c] < 1:
            num_syn = math.ceil((1-ratios[subset][c])*counts[subset][c])
            print(subset, c, counts[subset][c], num_syn)
            synthesize[subset][c] = num_syn
synthesize

test 1 586 7
test 2 333 31
valid 1 568 15
valid 3 476 23


{'test': {1: 7, 2: 31}, 'valid': {1: 15, 3: 23}}

In [27]:
split_df['test'].head()


Unnamed: 0,SUBJECT_ID,HADM_ID,ETHNICITY,AGE,GENDER,P WEIGHT,P HEIGHT,P SYSTOLIC BP,P DIASTOLIC BP,P TEMPERATURE,...,WEIGHT,HEIGHT,SYSTOLIC BP,DIASTOLIC BP,TEMPERATURE,RESPIRATORY RATE,HEART RATE,SPO2,TSTAGE,STAGE
3,11,194540,WHITE,50,0,81.843768,168.812596,110.5,52.0,37.055553,...,81.210406,169.685091,106.0,50.0,36.888894,18.0,95.0,96.0,0,0
12,33,176176,UNKNOWN/NOT SPECIFIED,82,1,79.6,168.812596,103.0,42.0,37.0,...,79.6,169.685091,120.0,44.0,37.0,16.0,67.0,97.0,0,0
29,75,112086,WHITE,76,0,81.843768,168.812596,124.208222,62.959688,36.658122,...,88.8,160.02,110.0,41.0,36.1,6.666667,89.0,100.0,0,0
30,77,142768,UNKNOWN/NOT SPECIFIED,45,1,98.0,172.72,133.666667,80.0,36.277789,...,98.0,172.72,124.0,70.0,36.277789,16.0,88.0,96.0,0,0
40,103,130744,UNKNOWN/NOT SPECIFIED,60,0,61.5,165.1,152.666667,47.333333,36.166683,...,61.5,165.1,158.0,41.0,36.166683,21.0,42.0,96.0,0,0


In [28]:
def mode_or_mean(x):
    if x.name in ["SUBJECT_ID", "HADM_ID"]:
        return 99999
    elif x.name in ["ETHNICITY", "TSTAGE", "STAGE"]:
        return np.random.choice(x.mode().dropna())
    else:
        return x.mean()

In [29]:
n_samples = 5
for subset in synthesize:
    for c in synthesize[subset]:
        for s in range(synthesize[subset][c]):
            samples = resample(
                    split_df[subset][(split_df[subset]['STAGE'] == c)],
                    n_samples=n_samples,
                )
            synth = samples.apply(mode_or_mean)
            split_df[subset] = split_df[subset].append(synth, ignore_index=True)
        print(subset, split_df[subset].shape)


test (5586, 102)
test (5617, 102)
valid (5507, 102)
valid (5530, 102)


In [30]:
for subset in synthesize:
    for c in synthesize[subset]:
        print(subset, c, split_df[subset][(split_df[subset]['STAGE'] == c)].shape[0])
counts

test 1 593
test 2 364
valid 1 583
valid 3 499


{'test': {0: 4127, 1: 586, 2: 333, 3: 533},
 'valid': {0: 4069, 1: 568, 2: 379, 3: 476}}

In [31]:
test = split_df['test']
valid = split_df['valid']

train = devel.drop(['SUBJECT_ID','HADM_ID','ETHNICITY','TSTAGE','P TSTAGE','P STAGE'], axis=1)
testv = test.drop(['SUBJECT_ID','HADM_ID','ETHNICITY','TSTAGE','P TSTAGE','P STAGE'], axis=1)

x_train = train.values[:, :-2]
y_train = train.values[:, -1]
x_train = normalize(x_train, axis=0)
ohe = LabelBinarizer()
ohe.fit(y_train.reshape(-1, 1))

x_test = testv.values[:, :-2]
x_test = normalize(x_test, axis=0)
y_test = testv.values[:, -1]

ohe_y_train = ohe.transform(y_train.reshape(-1,1))
ohe_y_test = ohe.transform(y_test.reshape(-1,1))
print(ohe_y_train.shape)
print(ohe_y_test.shape)

(16609, 4)
(5617, 4)


In [32]:
# runTraditionalModels(x_train, ohe_y_train, x_test, ohe_y_test, datafn)

In [33]:
aMLScore = runAutoML(train, testv)
print(aMLScore)

Welcome to auto_ml! We're about to go through and make sense of your data using machine learning, and give you a production-ready pipeline to get predictions with.

If you have any issues, or new feature ideas, let us know at http://auto.ml
You are running on version 2.9.10
Now using the model training_params that you passed in:
{}
After overwriting our defaults with your values, here are the final params that will be used to initialize the model:
{}
Running basic data cleaning
Performing feature scaling
Fitting DataFrameVectorizer
Now using the model training_params that you passed in:
{}
After overwriting our defaults with your values, here are the final params that will be used to initialize the model:
{}


AttributeError: 'NoneType' object has no attribute 'score'

In [None]:
# runTPot(x_train, x_test, y_train, y_test)