# Model training using Ziga's code

In [2]:
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt 
import seaborn as sns
import random

from classification_ziga import classify_leave_one_out_cv
from preprocessing import imputation_scaling, SupervisedSelector

sns.set_theme(style="whitegrid", palette=None, font_scale=1.2)

In [3]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier 

from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel

import warnings
warnings.filterwarnings('ignore')

# Prepare data

In [4]:
# --- Experiment configuration -------------------------------------------
# Which dataset snapshot and which target column are modelled.
target = "disease_severity"
datasetTimepoint = "12_uponAdmission"

# Feature selection: name of the variable list to use.
vars = "boruta+bestSterols"    # boruta+bestSterols

# --- Paths ---------------------------------------------------------------
# Input: cleaned dataset and the text file listing the selected variables.
dataPath = "../../results/preprocessing/cleaned"
dataset = f"{datasetTimepoint}_{target}_cleaned.csv"
varPath = f"../../results/featureSelection/{datasetTimepoint}/{vars}.txt"

# Output: folder for this notebook's results (created if it does not exist).
resultsPath = f"../../results/prediction/featureSelection/{datasetTimepoint}/{vars}/modelComparison/zigaPipeline"
os.makedirs(resultsPath, exist_ok=True)

# Prepare classifiers

In [5]:
# Single seed shared by every stochastic component defined in this cell, so
# that a "Restart & Run All" reproduces the same models and the same grids.
_SEED = 11
# Dedicated generator for the MLP layer sizes: keeps the draw reproducible
# without touching the state of the global `random` module.
_layer_rng = random.Random(_SEED)

# Candidate estimators, keyed by the short name used for result files.
models = {
    'svc': SVC(probability=True, random_state=_SEED),
    'rfc': RandomForestClassifier(random_state=_SEED),
    'gpr': GaussianProcessClassifier(random_state=_SEED),
    # NOTE: scikit-learn >= 1.2 renames `base_estimator` to `estimator`; when
    # upgrading, rename it here AND in the "base_estimator__*" grid keys below.
    'abc': AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(
            random_state=11,
            # "sqrt" is what the deprecated alias "auto" meant for classifiers.
            max_features="sqrt",
            class_weight="balanced",
            max_depth=None,
        ),
        random_state=_SEED,
    ),
    'log': LogisticRegression(random_state=_SEED),
    'knn': KNeighborsClassifier(),
    'mlp': MLPClassifier(random_state=_SEED),
    'gnb': GaussianNB(),
    'qda': QuadraticDiscriminantAnalysis(),
    'mcl': DummyClassifier(strategy="most_frequent"),   # majority-class baseline
}

# Hyper-parameter grids for GridSearchCV; keys must match `models`.
grids = {
    'rfc': {
        'n_estimators': [100, 300, 1000],      ### changed
        'max_depth': [2, 4, 6],
        'max_features': [2, 4, 6],
        'ccp_alpha': list(np.linspace(0, 0.025, 2)),
    },
    'svc': {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf', 'poly', 'linear'],
    },
    'gpr': {
        'kernel': [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()],
    },
    'abc': {
        "base_estimator__criterion": ["gini", "entropy"],
        "base_estimator__splitter": ["best", "random"],
        "n_estimators": [1, 2],
    },
    'log': {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        # The default 'lbfgs' solver rejects penalty='l1', so every l1
        # candidate would fail to fit and be scored NaN. 'liblinear'
        # supports both penalties searched here.
        'solver': ['liblinear'],
    },
    'knn': {
        'n_neighbors': list(range(1, 15)),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
    'mlp': {
        'solver': ['adam'],
        'max_iter': [50, 100, 200],
        'alpha': 10.0 ** -np.arange(0, 5),
        # Five random two-layer architectures: first layer 15-40 units,
        # second layer 5-15 units, drawn from the seeded generator above.
        'hidden_layer_sizes': [
            (_layer_rng.randrange(15, 41), _layer_rng.randrange(5, 16))
            for _ in range(5)
        ],
    },
    'gnb': {'var_smoothing': np.logspace(-9, 9, num=100)},
    'qda': {
        'reg_param': (0.00001, 0.0001, 0.001, 0.01, 0.1),
        'store_covariance': (True, False),
        'tol': (0.0001, 0.001, 0.01, 0.1),
    },
    'mcl': {},   # nothing to tune for the baseline
}

In [6]:
# --- Read data -----------------------------------------------------------
# The first CSV column is used as the row index; the target column is split
# off from the predictors.
data = pd.read_csv(f"{dataPath}/{dataset}", index_col=0)
y = data[target]
X = data.drop(columns=target)

# ###### FOR DEVELOPMENT PURPOSES: smaller dataset
# X = X.iloc[:5,:]
# y = y[:5]

# --- Read in variables ---------------------------------------------------
# Header-less file; the first column is taken as the list of variable names.
sel_variables = pd.read_csv(varPath, header=None).iloc[:, 0].tolist()

# --- Impute & scale X (according to Sonja's pipeline) --------------------
# Only works like this with LOOCV (no MICE possible on test split!).
# NOTE(review): the preprocessor is fitted on the complete X here, i.e.
# before any cross-validation split is made - confirm this is intended.
# Columns are routed by dtype: float64 / int64 / object.
num_columns = X.select_dtypes(include="float64").columns
bin_columns = X.select_dtypes(include="int64").columns
cat_columns = X.select_dtypes(include="object").columns
preprocessor = imputation_scaling(num_columns, bin_columns, cat_columns, X)
X_imputed = preprocessor.fit_transform(X)
# The selector is built only after the preprocessor has been fitted.
selector = SupervisedSelector(preprocessor, sel_variables)
X_imputed = selector.transform(X_imputed)

# --- Run Pipeline --------------------------------------------------------
# Display the preprocessed, feature-selected matrix that goes into the models.
X_imputed

Unnamed: 0,the_main_reason_for_hospital_admission,saturation_measured_based_on…,ferritin_admission_µg/L,ldh_admission_µkat/L,xray_admission_pathological_n/y,xray_admission_thickenings,saturation_%,crp_admission_mg/L,24-dehydrolathosterol,desmosterol,zymostenol,2425-dihydrolanosterol
0,0.2,1.0,0.222126,0.249123,1.0,0.5,92.0,25.0,0.103822,0.141685,0.062561,0.000774
1,0.0,0.0,0.684745,0.413380,1.0,0.5,96.0,141.0,0.205026,0.123957,0.110738,0.005252
2,0.0,0.0,0.052290,0.335965,1.0,0.0,96.0,46.0,0.166169,0.083826,0.076971,0.004725
3,0.6,1.0,0.020959,0.171930,1.0,0.5,98.0,31.0,0.134826,0.177607,0.004563,0.000190
4,1.0,1.0,0.023552,0.000000,0.0,1.0,97.0,5.0,0.047025,0.028622,0.079005,0.000202
...,...,...,...,...,...,...,...,...,...,...,...,...
159,0.6,1.0,0.961106,0.442982,1.0,0.5,94.0,233.0,0.174932,0.124550,0.211313,0.006207
160,0.0,1.0,0.027010,0.271930,1.0,0.5,93.0,109.0,0.429260,0.116214,0.090929,0.004995
161,1.0,1.0,0.114088,0.298246,1.0,0.5,94.0,157.0,0.105499,0.135780,0.083204,0.003731
162,0.6,1.0,0.006266,0.140351,1.0,0.5,93.0,33.0,0.291959,0.123161,0.100560,0.002619


In [7]:
# Run the leave-one-out pipeline for ONE classifier and store its results.
model = "svc"
folder = resultsPath

# Hyper-parameter search wrapped around the chosen estimator; it is handed to
# the leave-one-out routine as the classifier.
clf = GridSearchCV(models[model], grids[model], scoring='balanced_accuracy', verbose=5, cv=2)  ##cv=5
result = classify_leave_one_out_cv(
    clf,
    X_imputed,
    y,
    model=model,
    save_to=folder + f"/{model}",
    select_features=True,
    permutation_repeats=100,
    scale_features=False,
)

# --- Prepare results -----------------------------------------------------
result['model'] = model

# `DataFrame.append` was removed in pandas 2.0, so the frames are built
# directly. `result` is no longer stripped of its DataFrame entries, so cells
# further down can still read result["df_results"] after a fresh run.
_frame_keys = ('df_results', 'importances_df')
df_before = pd.DataFrame(result['df_results']).reset_index(drop=True)
df_importances = pd.DataFrame(result['importances_df']).reset_index(drop=True)
# Everything that is not one of the two frames becomes a single summary row.
df_features = pd.DataFrame(
    [{key: value for key, value in result.items() if key not in _frame_keys}]
)

# --- Save to file --------------------------------------------------------
df_before.to_csv((folder+f"/prediction_cv_test_{model}.csv"), index=False)
df_features.to_csv((folder+f"/features_test_{model}.csv"), index=False)
df_importances.to_csv((folder+f"/importances_test_{model}.csv"), index=False)
                          

Fitting 2 folds for each of 60 candidates, totalling 120 fits
[CV 1/2] END ........C=0.1, gamma=1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/2] END ........C=0.1, gamma=1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 1/2] END .......C=0.1, gamma=1, kernel=poly;, score=0.762 total time=   0.5s
[CV 2/2] END .......C=0.1, gamma=1, kernel=poly;, score=0.821 total time=   1.3s
[CV 1/2] END .....C=0.1, gamma=1, kernel=linear;, score=0.670 total time=   0.0s
[CV 2/2] END .....C=0.1, gamma=1, kernel=linear;, score=0.700 total time=   0.0s
[CV 1/2] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/2] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 1/2] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.762 total time=   0.4s
[CV 2/2] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.821 total time=   1.3s
[CV 1/2] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.670 total time=   0.0s
[CV 2/2] END ...C=0.1, gamma=0.1, kernel=linear

: 

: 

In [15]:
# NOTE: this whole cell is disabled. It repeats the "Prepare results" and
# "Save to file" steps that the model-run cell already performs, so running
# it is not needed. It relies on `DataFrame.append`, which was removed in
# pandas 2.0 - port it to `pd.concat` before re-enabling, or delete the cell.
# ''' Prepare results '''
# result['model'] = model        
# df_before = df_before.append(result['df_results'], ignore_index=True)  
# df_importances = df_importances.append(result['importances_df'], ignore_index=True)       
# del result['df_results']      
# del result['importances_df']   
# df_features = df_features.append(result, ignore_index=True)  

# ''' Save to file '''
# df_before.to_csv((folder+f"/prediction_cv_test_{model}.csv"), index=False)          
# df_features.to_csv((folder+f"/features_test_{model}.csv"), index=False)  
# df_importances.to_csv((folder+f"/importances_test_{model}.csv"), index=False)           

In [17]:
# Show only the importance rows whose "feature" column equals "ldh_max".
df_importances.loc[df_importances["feature"].eq("ldh_max")]

Unnamed: 0,feature,importance_0,importance_1,importance_2,importance_3,importance_4,importance_5,importance_6,importance_7,importance_8,importance_9,model
12,ldh_max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rfc
29,ldh_max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rfc
46,ldh_max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rfc
63,ldh_max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rfc
80,ldh_max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,rfc


In [57]:
# Per-model metric table of the last run. The model-run cell copies
# result["df_results"] into `df_before` and then deletes that key from
# `result`, so indexing `result` here raises KeyError on a fresh
# top-to-bottom run; `df_before` holds the same rows.
df_before

Unnamed: 0,precision,recall,f1,accuracy,model,auc
0,0.75,1.0,0.857143,0.8,rfc,0.833333
