In [1]:
import pandas as pd
import pickle
import numpy as np
import os
import io
import time
from functools import lru_cache

In [2]:
# Check if notebook is run in google colab

baseDir = os.getcwd()

inColab = False

if 'google.colab' in str(get_ipython()):
    inColab = True
    from google.colab import files
    from google.colab import drive
    drive.mount('/content/drive')
    baseDir = input('Input the base directory in you google drive: ')

In [3]:
# inputs to either compute new models ore load existing models
confirmation = ['y', 'yes', '1']
while True:
    try:
        runLabel = str(input('Label for this run: '))
        compRegModel = input('Compute regression model instead of loading it y/n: ').lower() in confirmation
        compClasModel = input('Compute classification model instead of loading it y/n: ').lower() in confirmation
        
        ph_r = "NEW" if compRegModel else "EXISTING"
        ph_c = "NEW" if compClasModel else "EXISTING"
        
        print(f"{ph_r} regression model will be used")
        print(f"{ph_c} classification model will be used")      
        
        break
    except ValueError:
        print("Wrong input, please correct your input.")

Label for this run: comAllModels
Compute regression model instead of loading it: yes
Compute classification model instead of loading it: yes
NEW regression model will be used
NEW classification model will be used


## Importing data

In [4]:
# Importing & sorting data
D_X_df = pd.read_csv(os.path.join(baseDir,'data','train_features.csv'), index_col = 'pid')
D_y_df = pd.read_csv(os.path.join(baseDir,'data','train_labels.csv'), index_col = 'pid')
D_test_df = pd.read_csv(os.path.join(baseDir,'data','test_features.csv'), index_col = 'pid')

# sorting rows by increasing index
# D_X_df = D_X_df.sort_index()
# D_y_df = D_y_df.sort_index()
# D_test_df  = D_test_df.sort_index()

# indices_train = list(D_X_df.index.unique())
# indices_test = list(D_test_df.index.unique())

## Aggregating values of same patient and extending feature vector

In [5]:
path_X = os.path.join(baseDir,'results','agg_X.csv')
path_test = os.path.join(baseDir,'results','agg_test.csv')

if not (os.path.exists(path_X) and os.path.exists(path_test)):  # only create file if the file doesn't exist yet

    print("Computing data ... ")

    # grouping by index
    D_X_df_min = D_X_df.groupby(['pid'],sort=False).min()
    D_test_df_min = D_test_df.groupby(['pid'],sort=False).min()

    D_X_df_mean = D_X_df.groupby(['pid'],sort=False).mean()
    D_test_df_mean = D_test_df.groupby(['pid'],sort=False).mean()

    D_X_df_max = D_X_df.groupby(['pid'],sort=False).max()
    D_test_df_max = D_test_df.groupby(['pid'],sort=False).max()

    D_X_df_diff = D_X_df_max - D_X_df_min
    D_test_df_diff = D_test_df_max - D_test_df_min

    D_X_df = pd.concat([D_X_df_min, D_X_df_mean, D_X_df_max,D_X_df_diff], axis=1, sort=False)
    D_test_df = pd.concat([D_test_df_min,D_test_df_mean, D_test_df_max,D_test_df_diff], axis=1, sort=False)

    D_X_df.to_csv(path_X, index=True, header = True, float_format='%.3f') #, compression='zip')
    D_test_df.to_csv(path_test, index=True, header = True, float_format='%.3f') #, compression='zip')
else:
    print("Loading data ... ")
    D_X_df = pd.read_csv(path_X, index_col = 'pid')
    D_test_df = pd.read_csv(path_test, index_col = 'pid')

# Visualizing data
D_X_percNan = np.round(pd.DataFrame(D_X_df.isna().sum()/D_X_df.shape[0]),3).T
display(D_X_df.head(20))
print("Percent nan values: ")
display(D_X_percNan)

Loading data ... 


Unnamed: 0_level_0,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,BaseExcess,...,Alkalinephos.3,SpO2.3,Bilirubin_direct.3,Chloride.3,Hct.3,Heartrate.3,Bilirubin_total.3,TroponinI.3,ABPs.3,pH.3
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,34.0,,,12.0,,36.0,8.5,24.0,-2.0,...,,0.0,,3.0,2.4,41.0,,,42.0,0.08
10,1,71.0,,27.8,12.0,,36.0,14.6,,,...,0.0,3.0,,,0.0,19.0,0.0,0.0,39.0,
100,2,68.0,,20.9,21.0,,35.0,12.5,27.0,,...,,8.0,,0.0,0.0,37.0,,,61.0,
1000,1,79.0,26.0,,22.0,3.66,36.0,9.2,,,...,,5.0,,,0.0,41.0,,,67.0,0.0
10000,1,76.0,,25.7,22.0,,36.0,10.4,25.0,0.0,...,,3.0,,1.0,2.0,26.0,,,57.0,0.07
10002,1,73.0,19.0,31.3,18.0,1.78,36.0,10.4,,,...,0.0,4.0,,2.0,0.0,19.0,0.0,,65.0,0.09
10006,1,51.0,,,,,37.0,,,,...,,8.0,,,,12.0,,,59.0,
10007,1,60.0,,,,,38.0,,,,...,,4.0,,,,34.0,,0.0,46.0,
10009,1,69.0,,85.9,15.0,,37.0,12.2,21.0,,...,0.0,5.0,,0.0,0.0,23.0,0.0,,28.0,
1001,1,36.0,,31.2,10.0,1.4,37.0,10.4,31.0,4.0,...,,0.0,,0.0,1.5,11.0,,,37.0,0.26


Percent nan values: 


Unnamed: 0,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,BaseExcess,...,Alkalinephos.3,SpO2.3,Bilirubin_direct.3,Chloride.3,Hct.3,Heartrate.3,Bilirubin_total.3,TroponinI.3,ABPs.3,pH.3
0,0.0,0.0,0.93,0.589,0.265,0.744,0.023,0.264,0.587,0.7,...,0.75,0.001,0.967,0.56,0.229,0.0,0.752,0.853,0.021,0.579


## Imputing data

In [6]:
# imputing nan data - changing to median
D_test_df = D_test_df.fillna(D_test_df.median())
D_X_df = D_X_df.fillna(D_X_df.median())

#Same as :
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(missing_values=np.nan, strategy='median').fit(D_X_df)
D_X_df = pd.DataFrame(imp_mean.transform(D_X_df), index = D_X_df.index)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='median').fit(D_test_df)
D_test_df = pd.DataFrame(imp_mean.transform(D_test_df), index = D_test_df.index)

# visualizing
print("Train data: ")
display(D_X_df.head())
print("Labels for train data")
display(D_y_df.head())
print("Test data: ")
display(D_test_df.head())



Train data: 


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.0,34.0,29.5,31.0,12.0,1.64,36.0,8.5,24.0,-2.0,...,0.0,0.0,0.0,3.0,2.4,41.0,0.0,0.0,42.0,0.08
10,1.0,71.0,29.5,27.8,12.0,1.64,36.0,14.6,23.0,-2.0,...,0.0,3.0,0.0,0.0,0.0,19.0,0.0,0.0,39.0,0.04
100,2.0,68.0,29.5,20.9,21.0,1.64,35.0,12.5,27.0,-2.0,...,0.0,8.0,0.0,0.0,0.0,37.0,0.0,0.0,61.0,0.04
1000,1.0,79.0,26.0,31.0,22.0,3.66,36.0,9.2,23.0,-2.0,...,0.0,5.0,0.0,0.0,0.0,41.0,0.0,0.0,67.0,0.0
10000,1.0,76.0,29.5,25.7,22.0,1.64,36.0,10.4,25.0,0.0,...,0.0,3.0,0.0,1.0,2.0,26.0,0.0,0.0,57.0,0.07


Labels for train data


Unnamed: 0_level_0,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.1,85.4,100.0,59.9
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.8,100.6,95.5,85.5
100,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,16.5,88.3,96.5,108.1
1000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,19.4,77.2,98.3,80.9
10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.6,76.8,97.7,95.3


Test data: 


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,39.0,30.0,38.5,17.0,1.63,36.0,9.1,13.0,-9.0,...,10.0,0.0,0.0,4.0,5.1,12.0,4.6,0.0,25.0,0.06
10001,1.0,62.0,30.0,30.8,17.0,1.63,37.0,10.5,24.0,-2.0,...,0.0,5.0,0.0,0.0,0.0,31.0,0.0,0.0,24.0,0.03
10003,1.0,72.0,30.0,30.8,17.0,2.0,36.0,10.5,24.0,-2.0,...,0.0,2.0,0.0,0.0,0.0,12.0,0.0,0.0,32.0,0.12
10004,1.0,44.0,30.0,30.8,10.0,1.63,36.0,12.7,24.0,-2.0,...,0.0,8.0,0.0,0.0,0.0,14.0,0.0,0.0,37.0,0.03
10005,1.0,88.0,30.0,39.9,42.0,1.63,36.0,8.8,25.0,-2.0,...,0.0,6.0,0.0,0.0,0.0,20.0,0.0,0.0,33.0,0.03


In [7]:
# ALTERNATIVE IMPUTING METHOD:

# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer

# imp_mean = IterativeImputer(random_state=0).fit(D_X_df)
# D_X_df = pd.DataFrame(imp_mean.transform(D_X_df),index=D_X_df.index)

# imp_mean = IterativeImputer(random_state=0).fit(D_test_df)
# D_test_df = pd.DataFrame(imp_mean.transform(D_test_df),index=D_test_df.index)

# D_test_df.head(20)

In [8]:
# defining numpy arrays for computing
D_X = np.array(D_X_df)
D_y = np.array(D_y_df)
D_test = np.array(D_test_df)

# labels
D_y_c_df = D_y_df.iloc[:,0:11]
D_y_r_df = D_y_df.iloc[:,11:]

D_y_c = np.array(D_y_c_df)
D_y_r = np.array(D_y_r_df)

print(f'The percentage of administered tests is : \n{np.round(D_y_c_df.mean()*100,1)}')

The percentage of administered tests is : 
LABEL_BaseExcess          26.8
LABEL_Fibrinogen           7.4
LABEL_AST                 24.0
LABEL_Alkalinephos        23.6
LABEL_Bilirubin_total     24.1
LABEL_Lactate             20.0
LABEL_TroponinI           10.0
LABEL_SaO2                23.4
LABEL_Bilirubin_direct     3.4
LABEL_EtCO2                6.6
LABEL_Sepsis               5.7
dtype: float64


## Normalizing data

In [9]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()

scaler.fit_transform(D_X)
D_X_stand = scaler.transform(D_X)
D_test_stand = scaler.transform(D_test)

# save standardization
name = os.path.join(baseDir, 'results', f'standardized_data.csv')

print('File will be saved ... ')
pd.DataFrame(D_X_stand).to_csv(name, index=True, header = True, float_format='%.3f') #, compression='zip')
print(f'File was saved under {name}')

display(pd.DataFrame(D_test_stand).head())

File will be saved ... 
File was saved under /home/sebas/Documents/ETHZ/IML/IML-projects/Project_2/results/standardized_data.csv


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
0,-0.160088,-1.402508,0.219735,0.349183,-0.228271,-0.144938,-0.401101,-0.765758,-3.589828,-2.863013,...,1.226002,-1.044044,-0.032137,2.649333,1.537779,-0.68505,29.772859,-0.050963,-0.736679,0.331745
1,-0.160088,-0.004486,0.219735,-0.182083,-0.228271,-0.144938,0.859057,-0.008376,0.282772,-0.006475,...,-0.05122,0.131623,-0.032137,-0.270078,-0.455239,0.915724,-0.092594,-0.050963,-0.791702,-0.385829
2,-0.160088,0.603349,0.219735,-0.182083,-0.228271,0.317117,-0.401101,-0.008376,0.282772,-0.006475,...,-0.05122,-0.573777,-0.032137,-0.270078,-0.455239,-0.68505,-0.092594,-0.050963,-0.351514,1.766893
3,-0.160088,-1.09859,0.219735,-0.182083,-0.638763,-0.144938,-0.401101,1.181795,0.282772,-0.006475,...,-0.05122,0.837022,-0.032137,-0.270078,-0.455239,-0.516547,-0.092594,-0.050963,-0.076396,-0.385829
4,-0.160088,1.575886,0.219735,0.445777,1.237773,-0.144938,-0.401101,-0.928054,0.634827,-0.006475,...,-0.05122,0.366756,-0.032137,-0.270078,-0.455239,-0.01104,-0.092594,-0.050963,-0.29649,-0.385829


## K-fold data splits

In [10]:
# for regression portion
from sklearn.model_selection import KFold

kf = KFold(3,shuffle=True,random_state=13)

ind = np.arange(D_X.shape[0])

ind_splits = list(kf.split(ind))

In [11]:
# for classification portion
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=3, shuffle=True,random_state=13)

ind_splits_skf = [list(skf.split(D_X,D_y[:,i])) for i in range(D_y_c.shape[1])]
# len(ind_splits_skf)

## Downselection of features - optional

In [12]:
# # using nan value thresholds to filter data, this could be a greedy selection however moving forward
# thresh = 1.0
# # print(D_X_percNan<thresh)
# selection = D_X_percNan <= thresh
# D_X = D_X[:,selection]
# D_test = D_test[:,selection]

# selection = selection[0:D_X_stand.shape[1]]
# D_X_stand = D_X_stand[:,selection]
# D_test_stand = D_test_stand[:,selection]

# print(D_X_stand.shape)
# print(D_X.shape)
# print(D_test.shape)
# print(D_test_stand.shape)


## OPTION2 ##
# from sklearn.svm import LinearSVC
# from sklearn.linear_model import LogisticRegression
# from sklearn.feature_selection import SelectFromModel

# lsvc = LinearSVC(C=0.01).fit(D_X_stand,D_y[:,2])
# selector = SelectFromModel(estimator=LogisticRegression(),max_features=4).fit(D_X_stand,D_y[:,3])

# D_X_stand_row2 = selector.transform(D_X_stand)
# D_X_stand_row2.shape

## PCA visualization - optional

In [13]:
# from sklearn.decomposition import PCA
# from matplotlib import pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D

# def pcaPlotter(X,y, title='pca plotter', first=500):
#   pca = PCA(n_components = 3)
#   X_2d = pca.fit_transform(X)

#   colors = ['#1F77B4', '#FF7F0E']
#   markers = ['o', 's']
#   fig = plt.figure()
#   ax = fig.add_subplot(111, projection='3d')

#   for count, val in enumerate(y):
#     val=bool(val)
#     ax.scatter(X_2d[count,0],X_2d[count,1],X_2d[count,2],marker=markers[val],c=colors[val])

#     if count > first:
#       break

#   plt.title(title)

#   plt.show()

# pcaPlotter(D_X_stand_row2,D_y[:,3],title='Visualize',first=500)

# Train classification models; TO DO: replace model by ANN?

In [14]:
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
import sklearn.metrics as metrics

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

from sklearn.utils import shuffle

bestModels_c_seps = []
bestSelector_c_seps = []
bestKFold_c_seps = []
bestROCScore_c_seps = []

mult = 1.00
n_features = 60

# getting time when starting training
import datetime
now = datetime.datetime.now()
unixTime = round(now.timestamp())

# create a new model for each label
if compClasModel: # loadM == 'c':

  for idx_label in range(D_y_c.shape[1]):

    #idx_label = D_y_c.shape[1]-1

    # for stratified k-fold
    ind_splits = ind_splits_skf[idx_label]

    ## only pick best features

#     selector = SelectFromModel(estimator=LinearSVC(max_iter = 1e5, C=1),max_features=n_features).fit(D_X_stand,D_y[:,idx_label])
    #selector = SelectFromModel(estimator=svm.SVC(class_weight='balanced',kernel='linear'),max_features=n_features).fit(D_X_stand,D_y[:,idx_label])
    # plotting PCA selector for each label
    #pcaPlotter(selector.transform(D_X_stand),D_y[:,idx_label],title=str(D_y_c_df.columns[idx_label]),first=200)
    #####

    ######## classification per label

    f1_best = 0
    bestKFold = 0
    bestROCScore = 0
    
    for idx, (train_index, val_index) in enumerate(ind_splits):

      D_X_train = D_X_stand[train_index,:]
      D_y_train = D_y_c[train_index,idx_label]

      D_X_val = D_X_stand[val_index,:]
      D_y_val = D_y_c[val_index,idx_label]

      # print(idx)

      # balancing data set

      idx_ones = np.where(D_y_train == 1)[0]
      idx_zeroes = np.where(D_y_train == 0)[0]

      # print(idx_zeroes)

      out_vec = np.random.choice(idx_zeroes, len(idx_ones))

      D_X_train = D_X_train[np.concatenate((out_vec, idx_ones)),:]
      D_y_train = D_y_train[np.concatenate((out_vec, idx_ones))]
      
      D_X_train, D_y_train = shuffle(D_X_train, D_y_train, random_state=0)

      ####

      # only best features
      #D_X_train = selector.transform(D_X_train)
      #D_X_val = selector.transform(D_X_val)
      ####

      relWeight = round(D_y_train.shape[0]/np.sum(D_y_train),2)
      print(f'The relative weight of 1 to 0 is {relWeight}')

      svc = svm.SVC(gamma = 'scale', max_iter = 1e7, tol = 1e-4, decision_function_shape = 'ovo', C = 2, kernel='rbf', class_weight='balanced' , verbose=True, probability=True, cache_size=7500) #class_weight = {0:1,1:(mult*relWeight)}
      #svr = svm.SVR(gamma = 'scale', C = 0.1)
      
      model = svc.fit(D_X_train,D_y_train)
      #model_r = svr.fit(D_X_train,D_y_train)

      # validate
      y_pred = model.predict(D_X_val)
      
      f1 = precision_recall_fscore_support(y_pred, D_y_val , average='weighted')[2]

      # receiver curve score
      # w = np.array(model.coef_)
      # b = np.array(model.intercept_)

      #real_prediction = D_X_val.dot(np.transpose(w)) + b
      real_prediction = model.predict_proba(D_X_val)[:,1]
      #real_prediction_r = np.clip(model_r.predict(D_X_val),0,1)
      roc_score = metrics.roc_auc_score(D_y_val,real_prediction)
      print(f'\nFor fold {idx} the roc_score is {roc_score}')

      ####
    
      if roc_score > bestROCScore:
        bestROCScore = roc_score
        bestModel = model
        bestKFold = idx
        print(f'\nBest model for label {D_y_c_df.columns[idx_label]} is kfold {idx}. This model has  {np.round(D_y_c_df.mean()*100,1)[idx_label]} % tests')
        print(classification_report(D_y_val,y_pred))

      # print(precision_recall_fscore_support(y_pred, D_y_val , average='weighted'))  

    bestModels_c_seps.append(bestModel)
#     bestSelector_c_seps.append(selector)
    bestKFold_c_seps.append(bestKFold)
    bestROCScore_c_seps.append(bestROCScore)
    ######### end of calssification per label
    
    bestKFold_c_seps.append(bestKFold)
    bestROCScore_c_seps.append(bestROCScore)
    real_prediction = model.predict_proba(D_X_val)[:,1]
    real_prediction
    
    display(np.mean([np.mean(bestROCScore_c_seps[0:-2]),bestROCScore_c_seps[-1]]))

  print(f'All the ROC scores are: {bestROCScore_c_seps}')

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.8803897672441491

Best model for label LABEL_BaseExcess is kfold 0. This model has  26.8 % tests
              precision    recall  f1-score   support

         0.0       0.93      0.80      0.86      4633
         1.0       0.61      0.83      0.70      1699

    accuracy                           0.81      6332
   macro avg       0.77      0.81      0.78      6332
weighted avg       0.84      0.81      0.82      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.8922290470124565

Best model for label LABEL_BaseExcess is kfold 1. This model has  26.8 % tests
              precision    recall  f1-score   support

         0.0       0.93      0.83      0.88      4633
         1.0       0.65      0.82      0.72      1699

    accuracy                           0.83      6332
   macro avg       0.79      0.83      0.80      6332
weighted avg       0.85      0.83      0.84      6332



  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


nan

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.7884934217612192

Best model for label LABEL_Fibrinogen is kfold 0. This model has  7.4 % tests
              precision    recall  f1-score   support

         0.0       0.96      0.85      0.90      5865
         1.0       0.23      0.57      0.33       467

    accuracy                           0.83      6332
   macro avg       0.59      0.71      0.61      6332
weighted avg       0.91      0.83      0.86      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.7747856390484692
The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 2 the roc_score is 0.7576084212375005


0.8403612343868379

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.7048232455513755

Best model for label LABEL_AST is kfold 0. This model has  24.0 % tests
              precision    recall  f1-score   support

         0.0       0.85      0.71      0.77      4814
         1.0       0.40      0.60      0.48      1518

    accuracy                           0.69      6332
   macro avg       0.62      0.66      0.63      6332
weighted avg       0.74      0.69      0.70      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.7110224323763639

Best model for label LABEL_AST is kfold 1. This model has  24.0 % tests
              precision    recall  f1-score   support

         0.0       0.85      0.72      0.78      4814
         1.0       0.40      0.58      0.47      1518

    accuracy                           0.69      6332
   macro avg       0.62      0.65      0.63      6332
weighted avg       0.74      0.69      0.71      6332

The relative w

0.7756918333816009

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.7106642665525493

Best model for label LABEL_Alkalinephos is kfold 0. This model has  23.6 % tests
              precision    recall  f1-score   support

         0.0       0.85      0.72      0.78      4836
         1.0       0.39      0.59      0.47      1496

    accuracy                           0.69      6332
   macro avg       0.62      0.65      0.63      6332
weighted avg       0.74      0.69      0.71      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.7028759902336752
The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 2 the roc_score is 0.7074587610756561


0.7539562834679479

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.7068484936115761

Best model for label LABEL_Bilirubin_total is kfold 0. This model has  24.1 % tests
              precision    recall  f1-score   support

         0.0       0.85      0.71      0.77      4809
         1.0       0.39      0.60      0.48      1523

    accuracy                           0.68      6332
   macro avg       0.62      0.65      0.62      6332
weighted avg       0.74      0.68      0.70      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.6989866244360887
The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 2 the roc_score is 0.7034951732885549


0.7412253927686117

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.7637093780056912

Best model for label LABEL_Lactate is kfold 0. This model has  20.0 % tests
              precision    recall  f1-score   support

         0.0       0.89      0.77      0.82      5064
         1.0       0.40      0.61      0.48      1268

    accuracy                           0.74      6332
   macro avg       0.64      0.69      0.65      6332
weighted avg       0.79      0.74      0.75      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.762875104031177
The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 2 the roc_score is 0.7576268436467829


0.7627804551342621

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.7868895736175884

Best model for label LABEL_TroponinI is kfold 0. This model has  10.0 % tests
              precision    recall  f1-score   support

         0.0       0.96      0.72      0.82      5700
         1.0       0.21      0.70      0.33       632

    accuracy                           0.71      6332
   macro avg       0.59      0.71      0.57      6332
weighted avg       0.88      0.71      0.77      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.8072185209860093

Best model for label LABEL_TroponinI is kfold 1. This model has  10.0 % tests
              precision    recall  f1-score   support

         0.0       0.96      0.70      0.81      5700
         1.0       0.22      0.77      0.34       632

    accuracy                           0.70      6332
   macro avg       0.59      0.73      0.58      6332
weighted avg       0.89      0.70      0.76      6332

Th

0.7846898471029927

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.8017125843898308

Best model for label LABEL_SaO2 is kfold 0. This model has  23.4 % tests
              precision    recall  f1-score   support

         0.0       0.89      0.80      0.84      4852
         1.0       0.50      0.66      0.57      1480

    accuracy                           0.77      6332
   macro avg       0.69      0.73      0.71      6332
weighted avg       0.80      0.77      0.78      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.7952044712684654
The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 2 the roc_score is 0.7927979902197682


0.7851552607881915

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.7168607502537724

Best model for label LABEL_Bilirubin_direct is kfold 0. This model has  3.4 % tests
              precision    recall  f1-score   support

         0.0       0.98      0.82      0.89      6117
         1.0       0.09      0.49      0.15       215

    accuracy                           0.81      6332
   macro avg       0.53      0.65      0.52      6332
weighted avg       0.95      0.81      0.87      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.738874505286449

Best model for label LABEL_Bilirubin_direct is kfold 1. This model has  3.4 % tests
              precision    recall  f1-score   support

         0.0       0.98      0.77      0.86      6117
         1.0       0.09      0.63      0.15       215

    accuracy                           0.76      6332
   macro avg       0.54      0.70      0.51      6332
weighted avg       0.95      0.76      0.84   

0.7558058866867055

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.8766065600561801

Best model for label LABEL_EtCO2 is kfold 0. This model has  6.6 % tests
              precision    recall  f1-score   support

         0.0       0.98      0.86      0.92      5914
         1.0       0.28      0.75      0.41       418

    accuracy                           0.86      6332
   macro avg       0.63      0.81      0.66      6332
weighted avg       0.93      0.86      0.88      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.8795830346610833

Best model for label LABEL_EtCO2 is kfold 1. This model has  6.6 % tests
              precision    recall  f1-score   support

         0.0       0.98      0.88      0.92      5914
         1.0       0.29      0.73      0.42       418

    accuracy                           0.87      6332
   macro avg       0.64      0.80      0.67      6332
weighted avg       0.93      0.87      0.89      6332

The relative

0.8277683123338643

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 0 the roc_score is 0.6827073026984691

Best model for label LABEL_Sepsis is kfold 0. This model has  5.7 % tests
              precision    recall  f1-score   support

         0.0       0.96      0.71      0.82      5969
         1.0       0.10      0.55      0.17       363

    accuracy                           0.70      6332
   macro avg       0.53      0.63      0.50      6332
weighted avg       0.91      0.70      0.78      6332

The relative weight of 1 to 0 is 2.0
[LibSVM]
For fold 1 the roc_score is 0.7019628964526085

Best model for label LABEL_Sepsis is kfold 1. This model has  5.7 % tests
              precision    recall  f1-score   support

         0.0       0.97      0.68      0.80      5969
         1.0       0.11      0.63      0.18       363

    accuracy                           0.68      6332
   macro avg       0.54      0.65      0.49      6332
weighted avg       0.92      0.68      0.76      6332

The relati

0.7413481750144526

All the ROC scores are: [0.8922290470124565, 0.8922290470124565, 0.7884934217612192, 0.7884934217612192, 0.7110224323763639, 0.7110224323763639, 0.7106642665525493, 0.7106642665525493, 0.7068484936115761, 0.7068484936115761, 0.7637093780056912, 0.7637093780056912, 0.8072185209860093, 0.8072185209860093, 0.8017125843898308, 0.8017125843898308, 0.738874505286449, 0.738874505286449, 0.8865618857808236, 0.8865618857808236, 0.7019628964526085, 0.7019628964526085]


## Saving classification models

In [15]:
if compClasModel:
    pickle_out = open(os.path.join(baseDir, 'local','models', f'{unixTime}_{runLabel}_c'),'wb')
    pickle.dump([bestModels_c_seps], pickle_out)
    pickle_out.close()

# Train regression models, TO DO: replace model by ANN?

In [22]:
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

bestModels_r = []
bestKFold_r = []
bestR2_score_r = []

from sklearn.linear_model import LinearRegression

if compRegModel:

  for idx_label in range(D_y_r.shape[1]):

    ######## classification per label

    score_best = float('-inf')
    for idx, (train_index, val_index) in enumerate(ind_splits):

      D_X_train = D_X_stand[train_index,:]
      D_y_train = D_y_r[train_index,idx_label]

      D_X_val = D_X_stand[val_index,:]
      D_y_val = D_y_r[val_index,idx_label]

      #svr prediction
      svc = svm.SVR(kernel='rbf',verbose=True)
      model = svc.fit(D_X_train,D_y_train)
      score_m = model.score(D_X_val, D_y_val)

      y_pred = model.predict(D_X_val)

      # validate
      # y_pred = model.predict(D_X_val)
      # score_m = model.score(D_X_val, D_y_val)

      # model = LinearRegression().fit(D_X_train, D_y_train)
      # score_m = model.score(D_X_train, D_y_train)
      # score_m = model.score(D_X_val, D_y_val)

      # Rsquared score minimize
      r2 = 0.5 + 0.5 * np.maximum(0, metrics.r2_score(D_y_val, y_pred))
      
      if r2 > score_best:
        score_best = r2
        bestModel = model
        print(f'\nBest model for label {idx_label} is index: {idx} with score {score_best}')

    # # print(precision_recall_fscore_support(y_pred, D_y_val , average='weighted'))  

    bestModels_r.append(bestModel)
    bestR2_score_r.append(score_best)
    ######### end of calssification per label

    print(f'The regression scores are: {bestR2_score_r}')

    # All scores
    totalScore = [np.mean(bestROCScore_c_seps[0:-2]), bestROCScore_c_seps[-1], np.mean(bestR2_score_r)]
    print(f'The total scores are: {totalScore} with a mean of: {np.mean(totalScore)}')

[LibSVM]
Best model for label 0 is index: 0 with score 0.6704408491485787
[LibSVM]
Best model for label 0 is index: 1 with score 0.6730675448779815
[LibSVM]
Best model for label 0 is index: 2 with score 0.6915486790885839
The regression scores are: [0.6915486790885839]
The total scores are: [0.7807334535762968, 0.7019628964526085, 0.6915486790885839] with a mean of: 0.7247483430391629
[LibSVM]
Best model for label 1 is index: 0 with score 0.7798101179113828
[LibSVM][LibSVM]The regression scores are: [0.6915486790885839, 0.7798101179113828]
The total scores are: [0.7807334535762968, 0.7019628964526085, 0.7356793984999833] with a mean of: 0.7394585828429628
[LibSVM]
Best model for label 2 is index: 0 with score 0.6431435850712759
[LibSVM]
Best model for label 2 is index: 1 with score 0.6657026863816782
[LibSVM]The regression scores are: [0.6915486790885839, 0.7798101179113828, 0.6657026863816782]
The total scores are: [0.7807334535762968, 0.7019628964526085, 0.7123538277938817] with a me

## Saving regression models

In [17]:
if compRegModel:
  pickle_out = open(os.path.join(baseDir, 'local','models', f'{unixTime}_{runLabel}_r'),"wb")
  pickle.dump(bestModels_r, pickle_out)
  pickle_out.close()

# Prediction - test set

In [18]:
# load models if they weren't computed

if not compClasModel:
    pathC = str(input('Input path of classification model: '))
    
    with open(pathC, 'rb') as file:
        print('File opened')
        temp = pickle.load(file)
        bestModels_c_seps = temp[0]  #[0]
        
if not compRegModel:
    pathR = str(input('Input path of regression model: '))
    with open(pathR,'rb') as file:
        bestModels_r = pickle.load(file)

Input path of classification model: ./models/1619152770_comAllModels_c
File opened
Input path of regression model: ./models/1619152770_comAllModels_r


In [19]:
# PREDICTING CLASSIFICATION RESULTS
result_c = [model.predict_proba(D_test_stand)[:,1] for model in bestModels_c_seps]
result_c = np.transpose(np.array(result_c))

# PREDICTING REGRESSION RESULTS
result_r = [mod.predict(D_test_stand) for mod in bestModels_r]
result_r = np.transpose(np.array(result_r))

## Casting np array back to dataframe

In [20]:
D_test_df.index
D_y_df.columns

df_c = pd.DataFrame(result_c,index=D_test_df.index)
df_r = pd.DataFrame(result_r,index=D_test_df.index)
df_out = pd.concat([df_c, df_r], axis=1, sort=False)
df_out.columns = D_y_df.columns

display(df_out.head())

Unnamed: 0_level_0,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0.685871,0.793185,0.707765,0.709541,0.72267,0.754394,0.50594,0.702798,0.722455,0.692202,0.645916,18.491125,82.143927,97.567726,86.295773
10001,0.134958,0.28045,0.538778,0.592672,0.564861,0.296186,0.774944,0.192766,0.443373,0.234463,0.389322,17.27843,91.054505,94.926313,103.580036
10003,0.127964,0.400883,0.408537,0.408794,0.386382,0.460888,0.23319,0.705451,0.399816,0.290863,0.343492,19.192272,81.841416,97.357667,89.899452
10004,0.08274,0.240862,0.482736,0.395891,0.409841,0.230109,0.510324,0.264597,0.304867,0.208933,0.270404,15.468784,71.576145,95.687146,86.151419
10005,0.071586,0.268252,0.290733,0.28834,0.440928,0.293359,0.551,0.177506,0.263596,0.222153,0.413774,19.053253,74.543199,96.300625,56.609831


## Saving results

In [21]:
name = os.path.join(baseDir,'results',f'{unixTime}_result_{runLabel}.csv')
print(f'File was saved under {name}')
df_out.to_csv(name, index=True, header = True, float_format='%.3f') #, compression='zip')

File was saved under /home/sebas/Documents/ETHZ/IML/IML-projects/Project_2/results/1619152770_result_comAllModels.csv
