# **Load dataset**

In [1]:

import pandas as pd
from sklearn.datasets import make_classification
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Imports needed by the split / scaling steps of this cell.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the raw table. '---' is the placeholder left in the notebook; replace it
# with the real location of the dataset CSV before running.
df = pd.read_csv('---')  #path to dataset

# Features are the four columns at positions 3..6; the target is "Health_status".
X = df.iloc[:, 3:7]
y = df["Health_status"]

# Encode the class labels as integers. The encoded result is assigned back to
# `y`; the former bare `y.ravel()` call discarded its result and the bare
# `X.head()` in the middle of the cell was never displayed, so both no-op
# statements have been dropped.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# split into train test sets (33% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# scale data: the scaler is fitted on the training features only and then
# applied to both partitions, so no test-set statistics enter the fit.
t = MinMaxScaler()
t.fit(X_train)
X_train = t.transform(X_train)
X_test = t.transform(X_test)

# **Build Classification models**

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score

**K nearest neighbors**

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier, fitted on the scaled training features
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predicted labels for the training and the held-out partition
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

# Accuracy on each partition
knn_train_accuracy = accuracy_score(y_train, y_train_pred)
knn_test_accuracy = accuracy_score(y_test, y_test_pred)

# Matthews correlation coefficient on each partition
knn_train_mcc = matthews_corrcoef(y_train, y_train_pred)
knn_test_mcc = matthews_corrcoef(y_test, y_test_pred)

# Support-weighted F1 on each partition
knn_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
knn_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

# Report: one line per item, training block first, then test block
print(
    'Model performance for Training set',
    '- Accuracy: %s' % knn_train_accuracy,
    '- MCC: %s' % knn_train_mcc,
    '- F1 score: %s' % knn_train_f1,
    sep='\n',
)
print(
    'Model performance for Test set',
    '- Accuracy: %s' % knn_test_accuracy,
    '- MCC: %s' % knn_test_mcc,
    '- F1 score: %s' % knn_test_f1,
    sep='\n',
)

Model performance for Training set
- Accuracy: 0.9848484848484849
- MCC: 0.9308539843557129
- F1 score: 0.9845515625098287
Model performance for Test set
- Accuracy: 0.947075208913649
- MCC: 0.7613055127277564
- F1 score: 0.9438674728262543


**Support vector machine (Radial basis function kernel)**

In [4]:
from sklearn.svm import SVC

# Support vector classifier (SVC's default kernel) with C=1 and gamma=2
svm_rbf = SVC(C=1, gamma=2)
svm_rbf.fit(X_train, y_train)

# Label predictions for both partitions
y_train_pred, y_test_pred = svm_rbf.predict(X_train), svm_rbf.predict(X_test)

# --- Training partition scores ---
svm_rbf_train_accuracy = accuracy_score(y_train, y_train_pred)
svm_rbf_train_mcc = matthews_corrcoef(y_train, y_train_pred)
svm_rbf_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# --- Test partition scores ---
svm_rbf_test_accuracy = accuracy_score(y_test, y_test_pred)
svm_rbf_test_mcc = matthews_corrcoef(y_test, y_test_pred)
svm_rbf_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

# Emit the report as two newline-joined blocks (training, then test)
print('\n'.join([
    'Model performance for Training set',
    '- Accuracy: %s' % svm_rbf_train_accuracy,
    '- MCC: %s' % svm_rbf_train_mcc,
    '- F1 score: %s' % svm_rbf_train_f1,
]))
print('\n'.join([
    'Model performance for Test set',
    '- Accuracy: %s' % svm_rbf_test_accuracy,
    '- MCC: %s' % svm_rbf_test_mcc,
    '- F1 score: %s' % svm_rbf_test_f1,
]))

Model performance for Training set
- Accuracy: 0.8856749311294766
- MCC: 0.2942384613166517
- F1 score: 0.8420470917890267
Model performance for Test set
- Accuracy: 0.8774373259052924
- MCC: 0.27038964251150893
- F1 score: 0.8294501459320993


**Decision tree**

In [5]:
from sklearn.tree import DecisionTreeClassifier

 # Define classifier
# Depth-limited decision tree. random_state is fixed (same seed value as the
# train/test split) because the tree permutes candidate features at every
# split; without a seed, equally good splits can be chosen differently from
# run to run and the scores below would not be repeatable.
dt = DecisionTreeClassifier(max_depth=5, random_state=1)
dt.fit(X_train, y_train)

# Make predictions on both partitions
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

# Training set performance: accuracy, Matthews correlation coefficient and
# support-weighted F1
dt_train_accuracy = accuracy_score(y_train, y_train_pred)
dt_train_mcc = matthews_corrcoef(y_train, y_train_pred)
dt_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Test set performance: same three metrics on the held-out partition
dt_test_accuracy = accuracy_score(y_test, y_test_pred)
dt_test_mcc = matthews_corrcoef(y_test, y_test_pred)
dt_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Model performance for Training set')
print('- Accuracy: %s' % dt_train_accuracy)
print('- MCC: %s' % dt_train_mcc)
print('- F1 score: %s' % dt_train_f1)

print('Model performance for Test set')
print('- Accuracy: %s' % dt_test_accuracy)
print('- MCC: %s' % dt_test_mcc)
print('- F1 score: %s' % dt_test_f1)

Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
Model performance for Test set
- Accuracy: 0.9888579387186629
- MCC: 0.9526246368967016
- F1 score: 0.9879294336118849


**Random forest**

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Define classifier: a 10-tree random forest. random_state is fixed (same seed
# value as the train/test split) because the forest draws bootstrap samples
# and feature subsets at random; unseeded, every run trains a different
# ensemble and the reported scores are not repeatable.
rf = RandomForestClassifier(n_estimators=10, random_state=1)
rf.fit(X_train, y_train)

# Make predictions on both partitions
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Training set performance: accuracy, Matthews correlation coefficient and
# support-weighted F1
rf_train_accuracy = accuracy_score(y_train, y_train_pred)
rf_train_mcc = matthews_corrcoef(y_train, y_train_pred)
rf_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Test set performance: same three metrics on the held-out partition
rf_test_accuracy = accuracy_score(y_test, y_test_pred)
rf_test_mcc = matthews_corrcoef(y_test, y_test_pred)
rf_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Model performance for Training set')
print('- Accuracy: %s' % rf_train_accuracy)
print('- MCC: %s' % rf_train_mcc)
print('- F1 score: %s' % rf_train_f1)

print('Model performance for Test set')
print('- Accuracy: %s' % rf_test_accuracy)
print('- MCC: %s' % rf_test_mcc)
print('- F1 score: %s' % rf_test_f1)

Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
Model performance for Test set
- Accuracy: 0.9888579387186629
- MCC: 0.9521744292994492
- F1 score: 0.9868910003253648


**Neural network**

In [8]:
from sklearn.neural_network import MLPClassifier

# Define classifier: multi-layer perceptron with library-default settings.
# random_state is fixed (same seed value as the train/test split) because the
# network's initial weights and its batch shuffling are drawn at random;
# unseeded, each run converges to a different model and different scores.
mlp = MLPClassifier(random_state=1)
mlp.fit(X_train, y_train)

# Make predictions on both partitions
y_train_pred = mlp.predict(X_train)
y_test_pred = mlp.predict(X_test)

# Training set performance: accuracy, Matthews correlation coefficient and
# support-weighted F1
mlp_train_accuracy = accuracy_score(y_train, y_train_pred)
mlp_train_mcc = matthews_corrcoef(y_train, y_train_pred)
mlp_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# Test set performance: same three metrics on the held-out partition
mlp_test_accuracy = accuracy_score(y_test, y_test_pred)
mlp_test_mcc = matthews_corrcoef(y_test, y_test_pred)
mlp_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Model performance for Training set')
print('- Accuracy: %s' % mlp_train_accuracy)
print('- MCC: %s' % mlp_train_mcc)
print('- F1 score: %s' % mlp_train_f1)

print('Model performance for Test set')
print('- Accuracy: %s' % mlp_test_accuracy)
print('- MCC: %s' % mlp_test_mcc)
print('- F1 score: %s' % mlp_test_f1)

Model performance for Training set
- Accuracy: 0.8939393939393939
- MCC: 0.3814596386916604
- F1 score: 0.8584078363902627
Model performance for Test set
- Accuracy: 0.883008356545961
- MCC: 0.33209512847952
- F1 score: 0.840832661554067




# **Build Stacked model**

In [9]:
# Define estimators
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# (name, estimator) pairs for the five base models defined in the cells above
estimator_list = list(zip(
    ('knn', 'svm_rbf', 'dt', 'rf', 'mlp'),
    (knn, svm_rbf, dt, rf, mlp),
))

# Stacked ensemble: the base models feed a logistic-regression meta-learner
meta_learner = LogisticRegression()
stack_model = StackingClassifier(estimators=estimator_list, final_estimator=meta_learner)

# Fit the whole stack on the training partition
stack_model.fit(X_train, y_train)

# Label predictions for both partitions
y_train_pred, y_test_pred = stack_model.predict(X_train), stack_model.predict(X_test)

# Accuracy
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred)
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred)

# Matthews correlation coefficient
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred)
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred)

# Support-weighted F1
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

# Report: training block first, then test block
print(
    'Model performance for Training set',
    'Accuracy: %s' % stack_model_train_accuracy,
    'MCC: %s' % stack_model_train_mcc,
    'F1 score: %s' % stack_model_train_f1,
    sep='\n',
)
print(
    'Model performance for Test set',
    'Accuracy: %s' % stack_model_test_accuracy,
    'MCC: %s' % stack_model_test_mcc,
    'F1 score: %s' % stack_model_test_f1,
    sep='\n',
)



Model performance for Training set
Accuracy: 1.0
MCC: 1.0
F1 score: 1.0
Model performance for Test set
Accuracy: 0.9888579387186629
MCC: 0.9521744292994492
F1 score: 0.9868910003253648




# **Results**

In [10]:
# Training-set accuracy per model, keyed by the model's short name
acc_train_list = dict(
    knn=knn_train_accuracy,
    svm_rbf=svm_rbf_train_accuracy,
    dt=dt_train_accuracy,
    rf=rf_train_accuracy,
    mlp=mlp_train_accuracy,
    stack=stack_model_train_accuracy,
)

# Training-set Matthews correlation coefficient per model
mcc_train_list = dict(
    knn=knn_train_mcc,
    svm_rbf=svm_rbf_train_mcc,
    dt=dt_train_mcc,
    rf=rf_train_mcc,
    mlp=mlp_train_mcc,
    stack=stack_model_train_mcc,
)

# Training-set weighted F1 per model
f1_train_list = dict(
    knn=knn_train_f1,
    svm_rbf=svm_rbf_train_f1,
    dt=dt_train_f1,
    rf=rf_train_f1,
    mlp=mlp_train_f1,
    stack=stack_model_train_f1,
)

In [11]:
# Show the training-set MCC collected for each model (dict keyed by model name).
mcc_train_list

{'knn': 0.9308539843557129,
 'svm_rbf': 0.2942384613166517,
 'dt': 1.0,
 'rf': 1.0,
 'mlp': 0.3814596386916604,
 'stack': 1.0}

In [12]:
import pandas as pd

def _metric_column(scores, label):
    """Turn a {model name: score} mapping into a one-column frame titled `label`."""
    return pd.DataFrame.from_dict(scores, orient='index', columns=[label])


# One single-column frame per metric, indexed by model name
acc_df = _metric_column(acc_train_list, 'Accuracy')
mcc_df = _metric_column(mcc_train_list, 'MCC')
f1_df = _metric_column(f1_train_list, 'F1')

# Place the three metric columns side by side and display the table
df = pd.concat([acc_df, mcc_df, f1_df], axis=1)
df

Unnamed: 0,Accuracy,MCC,F1
knn,0.984848,0.930854,0.984552
svm_rbf,0.885675,0.294238,0.842047
dt,1.0,1.0,1.0
rf,1.0,1.0,1.0
mlp,0.893939,0.38146,0.858408
stack,1.0,1.0,1.0


In [20]:
# Write `df` to 'results.csv' (relative path; the index is written as well).
# NOTE(review): this cell's execution count (20) is out of sequence with the
# cells above; `df` is presumably the metrics table built in the previous
# cell and not the raw dataset loaded at the top — confirm on a fresh run.
df.to_csv('results.csv')