In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
import random
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.inspection import permutation_importance
from math import sqrt
from decimal import Decimal
import shap
import seaborn as sns


In [2]:
def test_data(df, random_int,split):
    #get rid of medical terms and separate features and labels
    all_np=df.to_numpy().transpose()[0:].transpose()

    all_np = np.asarray(all_np).astype('float32')

    all_features=all_np[0:-1].transpose()
    all_labels=all_np[-1].transpose()

    print(all_features.shape)
    print(all_features)

    print(all_labels.shape)
    print(all_labels)
    rskf = RepeatedStratifiedKFold(n_splits=split, n_repeats=200, random_state=random_int)

    avg_auc=0
    avg_auc_interval=0

    avg_accuracy=0
    avg_accuracy_interval=0

    avg_precision=0
    avg_precision_interval=0

    avg_sensitivity=0
    avg_sensitivity_interval=0

    avg_specificity=0
    avg_specificity_interval=0

    k=0


    for train_index, test_index in rskf.split(all_features, all_labels):

        X_train, X_test = all_features[train_index], all_features[test_index]
        y_train, y_test = all_labels[train_index], all_labels[test_index]

        clf=svm.SVC(probability=True)
        y_pred = clf.fit(X_train, y_train).predict(X_test)

        #Calculate necessary metrics
        accuracy=accuracy_score(y_test, y_pred)
        precision=precision_score(y_test, y_pred, average='macro')

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        specificity = tn / (tn+fp)
        sensitivity = tp / (tp+fn)

        auc=roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

        #Calculate Intervals
        accuracy_interval = 1.96 * sqrt((accuracy*(1-accuracy))/ len(y_test))
        precision_interval = 1.96 * sqrt((precision*(1-precision))/ len(y_test))
        specificity_interval = 1.96 * sqrt((specificity*(1-specificity))/ len(y_test))
        sensitivity_interval = 1.96 * sqrt((sensitivity*(1-sensitivity))/ len(y_test))
        auc_interval = 1.96 * sqrt((auc*(1-auc))/ len(y_test))

        #Add to to-be-averaged values
        avg_accuracy=avg_accuracy+accuracy
        avg_accuracy_interval=avg_accuracy_interval+accuracy_interval

        avg_precision=avg_precision+precision
        avg_precision_interval=avg_precision_interval+precision_interval

        avg_specificity=avg_specificity+specificity
        avg_specificity_interval=avg_specificity_interval+specificity_interval

        avg_sensitivity=avg_sensitivity+sensitivity
        avg_sensitivity_interval=avg_sensitivity_interval+sensitivity_interval

        avg_auc=avg_auc+auc
        avg_auc_interval=avg_auc_interval+auc_interval

        k=k+1

    print("TEST:", test_index)
    print("After "+ str(k) + " runs")

    avg_accuracy=round(Decimal(avg_accuracy/k),2)
    avg_accuracy_interval=round(Decimal(avg_accuracy_interval/k),2)
    avg_precision=round(Decimal(avg_precision/k),2)
    avg_precision_interval=round(Decimal(avg_precision_interval/k),2)
    avg_specificity=round(Decimal(avg_specificity/k),2)
    avg_specificity_interval=round(Decimal(avg_specificity_interval/k),2)
    avg_sensitivity=round(Decimal(avg_sensitivity/k),2)
    avg_sensitivity_interval=round(Decimal(avg_sensitivity_interval/k),2)
    avg_auc=round(Decimal(avg_auc/k),2)
    avg_auc_interval=round(Decimal(avg_auc_interval/k),2)

    print("Accuracy: \t" + str(avg_accuracy) +
          " [" + str((avg_accuracy)-(avg_accuracy_interval)) +"-"+
          str((avg_accuracy)+(avg_accuracy_interval)) + "]\n")

    print("Precison: \t" + str(avg_precision) +
          " [" + str((avg_precision)-(avg_precision_interval)) +"-"+
          str((avg_precision)+(avg_precision_interval)) + "]\n")

    print("Specificity: \t" + str(avg_specificity) +
          " [" + str((avg_specificity)-(avg_specificity_interval)) +"-"+
          str((avg_specificity)+(avg_specificity_interval)) + "]\n")

    print("Sensitivity: \t" + str(avg_sensitivity) +
          " [" + str((avg_sensitivity)-(avg_sensitivity_interval)) +"-"+
          str((avg_sensitivity)+(avg_sensitivity_interval)) + "]\n")

    print("AUC: \t\t" + str(avg_auc) +
          " [" + str((avg_auc)-(avg_auc_interval)) +"-"+
          str((avg_auc)+(avg_auc_interval)) + "]")

    print("=====================================")
    return avg_accuracy,avg_auc

In [3]:
# import excel sheet to pandas dataframe
rows_to_keep = np.arange(323)
df=pd.read_excel(open('../Data/SPS-Commorbiditiesver6.xlsx', 'rb'), index_col=0,sheet_name='Reorganized-with-labels (2)',skiprows = lambda x: x not in rows_to_keep, usecols=range(50))
df = df.drop(labels='Total', axis=1)

df

Unnamed: 0_level_0,1,2,3,7,8,9,10,11,12,13,...,46,47,48,49,50,51,52,53,54,55
Past medical history item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(cv) aneurysm,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(cv) aortic stenosis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(cv) aortoiliac occlusive disease,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(cv) atherosclerotic cardiovascular disease,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(cv) chest tightness or pressure,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
(rheum/immuno) inflammatory arthritis (unspecified),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(rheum/immuno) psoriasis,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(rheum/immuno) rheumatoid arthritis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(rheum/immuno) Sjogren's syndrome,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
test_data(df,362,6)

(48, 27)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [1. 0. 1. ... 0. 0. 1.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
(48,)
[1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0.]


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined an

TEST: [ 1  6 25 31 35 37 38 46]
After 1200 runs
Accuracy: 	0.69 [0.39-0.99]

Precison: 	0.71 [0.43-0.99]

Specificity: 	0.72 [0.49-0.95]

Sensitivity: 	0.65 [0.40-0.90]

AUC: 		0.63 [0.36-0.90]


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


(Decimal('0.69'), Decimal('0.63'))

# Feature Reduction

In [7]:
rows_to_keep = np.arange(323)
df=pd.read_excel(open('../Data/SPS-Commorbiditiesver6.xlsx', 'rb'), index_col=0,sheet_name='Reorganized-with-labels (2)',skiprows = lambda x: x not in rows_to_keep, usecols=range(50))

#remove rows with all 0's
df = df.loc[~(df==0).all(axis=1)]

#Remove symptoms that only appear once (feature selection)
df = df[df.Total!= 1]
df = df[df.Total!= 2]
df = df[df.Total!= 3]
# df = df[df.Total!= 4] --> would boost AUC

df = df.drop(labels='Total', axis=1)

df

Unnamed: 0_level_0,1,2,3,7,8,9,10,11,12,13,...,46,47,48,49,50,51,52,53,54,55
Past medical history item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(cv) hypertension (unspecified),0,0,1,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,1,0
(endo) diabetes mellitus (unspecified),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(endo) dyslipidemia,0,0,1,1,0,0,0,0,0,0,...,1,1,0,0,0,1,1,0,1,0
(endo) Hashimoto's disease,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(endo) hypothyroidism (unspecified),0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
(endo) obesity,0,0,1,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
(endo) vitamin D deficiency,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(gi) dysphagia,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(gi) gastroesophageal reflux disease,0,1,1,0,0,1,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
"(id) urinary tract infection, recurrent",0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
#get rid of medical terms and separate features and labels
all_np=df.to_numpy().transpose()[0:].transpose()

all_np = np.asarray(all_np).astype('float32')

all_features=all_np[0:-1].transpose()
all_labels=all_np[-1].transpose()

print(all_features.shape)
print(all_features)

print(all_labels.shape)
print(all_labels)

(48, 27)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [1. 0. 1. ... 0. 0. 1.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
(48,)
[1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0.]


This is the part different from the original notebook. Here, there are more repeats and a higher split

In [21]:
rskf = RepeatedStratifiedKFold(n_splits=6, n_repeats=200, random_state=362)

avg_auc=0
avg_auc_interval=0

avg_accuracy=0
avg_accuracy_interval=0

avg_precision=0
avg_precision_interval=0

avg_sensitivity=0
avg_sensitivity_interval=0

avg_specificity=0
avg_specificity_interval=0

k=0

# imp_np=np.zeros(27)

# avg_shap_np_0=np.zeros((24,27))
# avg_shap_np_1=np.zeros((24,27))


for train_index, test_index in rskf.split(all_features, all_labels):

    X_train, X_test = all_features[train_index], all_features[test_index]
    y_train, y_test = all_labels[train_index], all_labels[test_index]

    clf=svm.SVC(probability=True)
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    #Calculate necessary metrics
    accuracy=accuracy_score(y_test, y_pred)
    precision=precision_score(y_test, y_pred, average='macro')
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp+fn)
    auc=roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    #Calculate Intervals
    accuracy_interval = 1.96 * sqrt((accuracy*(1-accuracy))/ len(y_test))
    precision_interval = 1.96 * sqrt((precision*(1-precision))/ len(y_test))
    specificity_interval = 1.96 * sqrt((specificity*(1-specificity))/ len(y_test))
    sensitivity_interval = 1.96 * sqrt((sensitivity*(1-sensitivity))/ len(y_test))
    auc_interval = 1.96 * sqrt((auc*(1-auc))/ len(y_test))

    #Add to to-be-averaged values
    avg_accuracy=avg_accuracy+accuracy
    avg_accuracy_interval=avg_accuracy_interval+accuracy_interval

    avg_precision=avg_precision+precision
    avg_precision_interval=avg_precision_interval+precision_interval

    avg_specificity=avg_specificity+specificity
    avg_specificity_interval=avg_specificity_interval+specificity_interval

    avg_sensitivity=avg_sensitivity+sensitivity
    avg_sensitivity_interval=avg_sensitivity_interval+sensitivity_interval

    avg_auc=avg_auc+auc
    avg_auc_interval=avg_auc_interval+auc_interval

#     r = permutation_importance(clf, X_test, y_test,n_repeats=30,random_state=0)
#     imp_np=imp_np+r.importances_mean
    
#     explainer = shap.KernelExplainer(clf.predict_proba, X_train, link="logit")
#     shap_values = explainer.shap_values(X_test)
    
#     avg_shap_np_0=avg_shap_np_0+shap_values[0]
#     avg_shap_np_1=avg_shap_np_1+shap_values[1]
    
    k=k+1

print("TEST:", test_index)
print("After "+ str(k) + " runs")

avg_accuracy=round(Decimal(avg_accuracy/k),2)
avg_accuracy_interval=round(Decimal(avg_accuracy_interval/k),2)
avg_precision=round(Decimal(avg_precision/k),2)
avg_precision_interval=round(Decimal(avg_precision_interval/k),2)
avg_specificity=round(Decimal(avg_specificity/k),2)
avg_specificity_interval=round(Decimal(avg_specificity_interval/k),2)
avg_sensitivity=round(Decimal(avg_sensitivity/k),2)
avg_sensitivity_interval=round(Decimal(avg_sensitivity_interval/k),2)
avg_auc=round(Decimal(avg_auc/k),2)
avg_auc_interval=round(Decimal(avg_auc_interval/k),2)

# avg_shap_np_0=avg_shap_np_0/k
# avg_shap_np_1=avg_shap_np_1/k


print("Accuracy: \t" + str(avg_accuracy) +
      " [" + str((avg_accuracy)-(avg_accuracy_interval)) +"-"+
      str((avg_accuracy)+(avg_accuracy_interval)) + "]\n")

print("Precison: \t" + str(avg_precision) +
      " [" + str((avg_precision)-(avg_precision_interval)) +"-"+
      str((avg_precision)+(avg_precision_interval)) + "]\n")

print("Specificity: \t" + str(avg_specificity) +
      " [" + str((avg_specificity)-(avg_specificity_interval)) +"-"+
      str((avg_specificity)+(avg_specificity_interval)) + "]\n")

print("Sensitivity: \t" + str(avg_sensitivity) +
      " [" + str((avg_sensitivity)-(avg_sensitivity_interval)) +"-"+
      str((avg_sensitivity)+(avg_sensitivity_interval)) + "]\n")

print("AUC: \t\t" + str(avg_auc) +
      " [" + str((avg_auc)-(avg_auc_interval)) +"-"+
      str((avg_auc)+(avg_auc_interval)) + "]")

print("=====================================")

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined an

TEST: [ 1  6 25 31 35 37 38 46]
After 1200 runs
Accuracy: 	0.69 [0.39-0.99]

Precison: 	0.71 [0.43-0.99]

Specificity: 	0.72 [0.49-0.95]

Sensitivity: 	0.65 [0.40-0.90]

AUC: 		0.62 [0.35-0.89]


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [19]:
print(k)

1200
