In [1]:
%load_ext autotime

In [2]:
# GROUP: Graphkrone
# Members: Marcello Negri     19-945-450
#          Riccardo Uslenghi  19-954-262

import numpy as np 
import pandas as pd
import matplotlib as mpl 
import matplotlib.pyplot as plt 
from tqdm import tqdm

from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR, LinearSVR
from sklearn.linear_model import SGDRegressor, LinearRegression, ElasticNet, Lasso
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, r2_score

import warnings
warnings.filterwarnings("ignore")

data_trainX = np.loadtxt('train_features.csv', delimiter=",", skiprows=1)
data_trainY = np.loadtxt('train_labels.csv', delimiter=",", skiprows=1)
data_testX = np.loadtxt('test_features.csv', delimiter=",", skiprows=1)

pid = []
for c in range(int(len(data_testX)/12)):
    pid.append(int(data_testX[c*12,0]))

trainY1 = data_trainY[:,1]
trainY2 = data_trainY[:,2]
trainY3 = data_trainY[:,3]
trainY4 = data_trainY[:,4]
trainY5 = data_trainY[:,5]
trainY6 = data_trainY[:,6]
trainY7 = data_trainY[:,7]
trainY8 = data_trainY[:,8]
trainY9 = data_trainY[:,9]
trainY10 = data_trainY[:,10]
trainY11 = data_trainY[:,11]
trainY12 = data_trainY[:,12]
trainY13 = data_trainY[:,13]
trainY14 = data_trainY[:,14]
trainY15 = data_trainY[:,15]
trainY1_11 = data_trainY[:,1:12]
trainY12_16 = data_trainY[:,12:16]

time: 18.4 s


In [3]:
def preprocessing_derivative(data_train):
    train_diff_X = []
    for r in tqdm(range(int(len(data_train)/12))):
        differences = []
        data_matrix = data_train[r*12:(r+1)*12,1:]
        
        for c in range(len(data_matrix[0])):
            column = data_matrix[:,c]
            a, b = 0, 0
            found = False
            for i in range(len(column)):
                if np.isnan(column[i]) == False and found == False:
                    a = column[i]
                    found = True
                if np.isnan(column[i]) == False:
                    b = column[i]
            diff = b-a
            differences.append(diff)
        train_diff_X.append(differences)        
        
    return train_diff_X

def preprocessing_nan(data_train):
    trainX = []
    
    for r in range(int(len(data_train)/12)):
        data_matrix = data_train[r*12:(r+1)*12,1:]
        data_array = np.nanmean(data_matrix, axis=0)
        trainX.append(data_array)

    return trainX

def preprocessing_measures(data_train):
    trainX = []
    
    for r in tqdm(range(int(len(data_train)/12))):
        data_matrix = data_train[r*12:(r+1)*12,1:]
        n_measures = []
        for i in range(len(data_matrix[0])):
            column = data_matrix[:,i]
            n_measures.append(np.count_nonzero(~np.isnan(column)))
        trainX.append(n_measures)

    return trainX

def nans_to_zeros(trainX):
    trainX = np.array(trainX)
    trainX[np.isnan(trainX)] = 100000
    return trainX


def preprocessing_diff(data_train):
    train_mean_X = np.zeros((int(len(data_train)/12),len(data_train[0])-1))
    for r in tqdm(range(int(len(data_train)/12))):
        data_matrix = data_train[r*12:(r+1)*12,1:]
        for c in range(len(data_matrix[0])):
            avg = np.nanmean(data_matrix[:,c], axis=0)
            if np.isnan(avg) == False: train_mean_X[r,c] = avg
            else: train_mean_X[r,c] = np.nan
    
    M = np.zeros((int(len(data_train)/12),len(data_train[0])-1))
    for c in tqdm(range(len(train_mean_X[0]))):
        column = train_mean_X[:,c]
        column_mean = np.nanmean(column)
        column_std = np.nanstd(column)
        for r in range(len(train_mean_X)):  
            if np.isnan(train_mean_X[r,c]-column_mean): M[r,c] = 0
            else: M[r,c] = train_mean_X[r,c]-column_mean 
    return M


train_diff_X = preprocessing_diff(data_trainX)                 # deviations from mean columns values
train_measures_X = preprocessing_measures(data_trainX)         # number of measures over 12 hours
train_derivative_X = preprocessing_derivative(data_trainX)     # difference from first measurement to last one
train_mean_X = preprocessing_nan(data_trainX)                  # mean values over the 12 hours
train_mean_X = nans_to_zeros(train_mean_X)                     # impute nans with the number zero
trainX = np.c_[train_mean_X, train_diff_X, train_derivative_X, train_measures_X] 
# merging the three matrices together

test_diff_X = preprocessing_diff(data_testX)
test_measures_X = preprocessing_measures(data_testX) 
test_derivative_X = preprocessing_derivative(data_testX)
test_mean_X = preprocessing_nan(data_testX)
test_mean_X = nans_to_zeros(test_mean_X)
testX = np.c_[test_mean_X, test_diff_X, test_derivative_X, test_measures_X]

trainX = StandardScaler().fit_transform(trainX)
testX = StandardScaler().fit_transform(testX)

100%|██████████| 18995/18995 [00:57<00:00, 331.23it/s]
100%|██████████| 36/36 [00:03<00:00, 10.85it/s]
100%|██████████| 18995/18995 [00:03<00:00, 4981.18it/s]
100%|██████████| 18995/18995 [01:35<00:00, 198.38it/s]
100%|██████████| 12664/12664 [00:32<00:00, 388.64it/s]
100%|██████████| 36/36 [00:01<00:00, 21.15it/s]
100%|██████████| 12664/12664 [00:01<00:00, 6727.35it/s]
100%|██████████| 12664/12664 [01:04<00:00, 195.32it/s]


time: 4min 24s


In [6]:
classifier = MLPClassifier(alpha=3, n_iter_no_change=30, max_iter=500, verbose=1000)
classifier.fit(trainX, trainY1_11)
M_class = np.array(classifier.predict_proba(testX))

Iteration 1, loss = 5.91779308
Iteration 2, loss = 4.30433178
Iteration 3, loss = 4.02221120
Iteration 4, loss = 3.85293874
Iteration 5, loss = 3.73948157
Iteration 6, loss = 3.66014949
Iteration 7, loss = 3.60971103
Iteration 8, loss = 3.56678426
Iteration 9, loss = 3.53713095
Iteration 10, loss = 3.51593540
Iteration 11, loss = 3.49939886
Iteration 12, loss = 3.48591674
Iteration 13, loss = 3.47604141
Iteration 14, loss = 3.46763861
Iteration 15, loss = 3.45740239
Iteration 16, loss = 3.45451238
Iteration 17, loss = 3.44805414
Iteration 18, loss = 3.44468367
Iteration 19, loss = 3.43729927
Iteration 20, loss = 3.43582043
Iteration 21, loss = 3.43247561
Iteration 22, loss = 3.43307602
Iteration 23, loss = 3.42696897
Iteration 24, loss = 3.42483924
Iteration 25, loss = 3.42255173
Iteration 26, loss = 3.42132157
Iteration 27, loss = 3.41776462
Iteration 28, loss = 3.41752504
Iteration 29, loss = 3.41645041
Iteration 30, loss = 3.41686635
Iteration 31, loss = 3.41299398
Iteration 32, los

Iteration 253, loss = 3.37663127
Iteration 254, loss = 3.37781830
Iteration 255, loss = 3.37845816
Iteration 256, loss = 3.37598722
Iteration 257, loss = 3.37627515
Iteration 258, loss = 3.37840484
Iteration 259, loss = 3.37617740
Iteration 260, loss = 3.37585814
Iteration 261, loss = 3.37838579
Iteration 262, loss = 3.37743826
Training loss did not improve more than tol=0.000100 for 30 consecutive epochs. Stopping.
time: 46.6 s


In [7]:
regressor = ElasticNet(alpha=0.01, l1_ratio=1)
regressor.fit(trainX, trainY12_16)
M_regr = regressor.predict(testX)

time: 1.77 s


In [8]:
VITALS = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']

M = np.c_[pid, M_class]
M = np.c_[M, M_regr]

df_predictions = pd.DataFrame(M, columns=['pid']+TESTS+['LABEL_Sepsis']+VITALS)
df_predictions.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')

time: 599 ms
