In [None]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Load the heart-failure clinical records into a DataFrame
csv_path = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
dataset = pd.read_csv(csv_path)
# Target: the final column. Predictors: every column except the final two.
y = dataset.iloc[:, -1]
x = dataset.iloc[:, :-2]

In [None]:
# Feature Ranking
# Rank the predictors by the feature importances of an extra-trees ensemble.
plt.rcParams['figure.figsize']=15,6
sns.set_style("darkgrid")

# A fixed seed keeps the ranking (and the plot) identical across re-runs;
# an unseeded randomized ensemble yields slightly different importances each time.
model = ExtraTreesClassifier(random_state=0)
model.fit(x,y)
print(model.feature_importances_)
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
# Plot at most the 12 highest-ranked features as horizontal bars
feat_importances.nlargest(12).plot(kind='barh')
plt.show()

In [None]:
#Feature Ranking
# Same ranking as the previous cell, but using a random forest for comparison.
plt.rcParams['figure.figsize']=15,6
sns.set_style("darkgrid")

# A fixed seed keeps the ranking (and the plot) identical across re-runs;
# an unseeded forest yields slightly different importances each time.
model = RandomForestClassifier(random_state=0)
model.fit(x,y)
print(model.feature_importances_)
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
# Plot at most the 12 highest-ranked features as horizontal bars
feat_importances.nlargest(12).plot(kind='barh')
plt.show()


In [None]:
#Using different variations of factors
#Some indexes (4-ejection fraction,7-serum_creatinine,0-age,2-creatinine_phosphokinase,6-platelets,8-serum_sodium)
# Positional indexes of the predictor columns used by every model below
selected_columns = [4,7,0]
# Target is the last column; both are converted to plain arrays for index-based splitting
y = dataset.iloc[:,-1].values
x = dataset.iloc[:,selected_columns].values

In [None]:
#Defining function which takes model, x, y and returns accuracy, f1score, matthews_correlation_coefficient
def get_model_results(model, x, y, n_splits=10):
    """Cross-validate a binary classifier and summarise its scores.

    The data are split with an unshuffled ``KFold``. In every fold the
    features are standardised using statistics from the training part only,
    the model is refitted, and the held-out part is scored.

    Parameters
    ----------
    model : estimator exposing ``fit(x, y)`` and ``predict(x)``
        Refitted in place on every fold.
    x : array-like
        Feature matrix, one row per sample.
    y : array-like
        Binary target with exactly two distinct values.
    n_splits : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    list
        ``[mean accuracy, mean F1, MCC]``. Accuracy and F1 are averaged over
        the folds; MCC is computed once from the confusion matrix pooled over
        all folds.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly two classes.
    """
    # Positional indexing below needs arrays, so accept DataFrames/Series/lists too.
    x = np.asarray(x)
    y = np.asarray(y)

    class_labels = np.unique(y)
    if class_labels.size != 2:
        raise ValueError("get_model_results expects a binary target with exactly two classes")

    model_confusion_matrix = np.array([[0,0],[0,0]])
    acclist, f1slist = [],[]

    # Splitting the dataset into training set and test set
    kf = KFold(n_splits=n_splits)
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Feature Scaling (fitted on the training fold only, then applied to the test fold)
        sc = StandardScaler()
        x_train = sc.fit_transform(x_train)
        x_test = sc.transform(x_test)

        #Model training
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        # Passing labels explicitly keeps the matrix 2x2 even when a fold
        # happens to contain (or predict) only one of the classes.
        model_confusion_matrix += confusion_matrix(y_test, y_pred, labels=class_labels)
        acclist.append(accuracy_score(y_test,y_pred))
        f1slist.append(f1_score(y_test,y_pred))

    # scikit-learn lays the binary matrix out as [[tn, fp], [fn, tp]].
    # Python ints avoid overflow when the four marginal sums are multiplied.
    tn, fp, fn, tp = (int(count) for count in model_confusion_matrix.ravel())
    mcc_denominator = np.sqrt(float((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
    # A zero denominator means one row/column of the matrix is empty; report 0
    # in that case, the same convention sklearn.metrics.matthews_corrcoef uses.
    mcc = ((tp*tn)-(fp*fn))/mcc_denominator if mcc_denominator else 0.0
    metriclist = [np.mean(acclist),np.mean(f1slist),mcc]
    return metriclist


In [None]:
# Container for the models to compare. It starts empty: the original cell also
# called modellist.append() with no argument, which raises TypeError and stops
# a Restart & Run All at this cell.
modellist = []

Logistic Regression

In [None]:
# Logistic regression scored over 100 random 80/20 hold-out splits (seeds 0-99).
# The result is lrlist = [mean accuracy, mean F1, MCC of the pooled confusion matrix].
model_confusion_matrix = np.array([[0,0],[0,0]])
acclist, f1slist = [],[]
# Fixed label order so every per-split confusion matrix is 2x2 and lines up when summed
class_labels = np.unique(y)
for i in range(0,100):
    # Splitting the dataset into training set and test set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=i)

    # Feature Scaling (fitted on the training part only)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    #Logistic Regression
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    model_confusion_matrix += confusion_matrix(y_test,y_pred, labels=class_labels)
    # Score the split directly with the sklearn metrics imported at the top;
    # the helper functions previously called here are not defined in this notebook.
    acclist.append(accuracy_score(y_test,y_pred))
    f1slist.append(f1_score(y_test,y_pred))

# scikit-learn lays the binary matrix out as [[tn, fp], [fn, tp]].
# Python ints avoid overflow when the four marginal sums are multiplied.
tn, fp, fn, tp = (int(count) for count in model_confusion_matrix.ravel())
mcc_denominator = np.sqrt(float((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
# Zero denominator (an empty row/column in the matrix) is reported as MCC = 0
mcc = ((tp*tn)-(fp*fn))/mcc_denominator if mcc_denominator else 0.0
lrlist = [np.mean(acclist),np.mean(f1slist),mcc]

In [None]:
#Using 10-fold cross validation
# Logistic regression scored with an unshuffled 10-fold split.
# The result is lrklist = [mean accuracy, mean F1, MCC of the pooled confusion matrix].
model_confusion_matrix = np.array([[0,0],[0,0]])
acclist, f1slist = [],[]
# Fixed label order so every per-fold confusion matrix is 2x2 and lines up when summed
class_labels = np.unique(y)

# Splitting the dataset into training set and test set
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Feature Scaling (fitted on the training fold only)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    #Logistic Regression
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    model_confusion_matrix += confusion_matrix(y_test,y_pred, labels=class_labels)
    acclist.append(accuracy_score(y_test,y_pred))
    f1slist.append(f1_score(y_test,y_pred))

# scikit-learn lays the binary matrix out as [[tn, fp], [fn, tp]].
# Python ints avoid overflow when the four marginal sums are multiplied.
tn, fp, fn, tp = (int(count) for count in model_confusion_matrix.ravel())
mcc_denominator = np.sqrt(float((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
# Zero denominator (an empty row/column in the matrix) is reported as MCC = 0
mcc = ((tp*tn)-(fp*fn))/mcc_denominator if mcc_denominator else 0.0
# np.mean replaces the bare mean(), which is never imported in this notebook
lrklist = [np.mean(acclist),np.mean(f1slist),mcc]


In [None]:
# Show the hold-out and the 10-fold logistic-regression summaries, one per line
print(lrlist, lrklist, sep="\n")

In [None]:
# Score logistic regression through the shared cross-validation helper;
# the returned [accuracy, F1, MCC] list is displayed as the cell output.
lr = LogisticRegression()
get_model_results(model=lr, x=x, y=y)

K Nearest Neighbour

In [None]:
# K-nearest-neighbours scored over 100 random 80/20 hold-out splits (seeds 0-99).
# The result is knnlist = [mean accuracy, mean F1, mean MCC].
acclist, f1slist, mcclist = [],[],[]
for i in range(0,100):
    # Splitting the dataset into training set and test set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=i)

    # Feature Scaling (fitted on the training part only)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    #K Nearest Neighbor
    classifier = KNeighborsClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Score the split directly with the sklearn metrics imported at the top;
    # the helper functions previously called here are not defined in this notebook.
    acclist.append(accuracy_score(y_test,y_pred))
    f1slist.append(f1_score(y_test,y_pred))
    mcclist.append(matthews_corrcoef(y_test,y_pred))
# np.mean replaces the bare mean(), which is never imported in this notebook
knnlist = [np.mean(acclist),np.mean(f1slist),np.mean(mcclist)]

In [None]:
# Display the KNN summary list built in the previous cell (accuracy, F1, MCC means)
knnlist

Support Vector Classifier

In [None]:
# Support vector classifier scored over 100 random 80/20 hold-out splits (seeds 0-99).
# The result is svclist = [mean accuracy, mean F1, mean MCC].
acclist, f1slist, mcclist = [],[],[]
for i in range(0,100):
    # Splitting the dataset into training set and test set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=i)

    # Feature Scaling (fitted on the training part only)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    #Support Vector Classifier
    classifier = SVC()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Score the split directly with the sklearn metrics imported at the top;
    # the helper functions previously called here are not defined in this notebook.
    acclist.append(accuracy_score(y_test,y_pred))
    f1slist.append(f1_score(y_test,y_pred))
    mcclist.append(matthews_corrcoef(y_test,y_pred))

# np.mean replaces the bare mean(), which is never imported in this notebook
svclist = [np.mean(acclist),np.mean(f1slist),np.mean(mcclist)]

In [None]:
# Display the SVC summary list built in the previous cell (accuracy, F1, MCC means)
svclist

Decision Tree Classifier

In [None]:
# Decision tree scored over 100 random 80/20 hold-out splits (seeds 0-99).
# The result is dtclist = [mean accuracy, mean F1, mean MCC].
acclist, f1slist, mcclist = [],[],[]
for i in range(0,100):
    # Splitting the dataset into training set and test set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=i)

    # Feature Scaling (fitted on the training part only)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    #Decision Tree Classifier
    classifier = DecisionTreeClassifier(max_leaf_nodes=10,max_depth=3,criterion='entropy')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Score the split directly with the sklearn metrics imported at the top;
    # the helper functions previously called here are not defined in this notebook.
    acclist.append(accuracy_score(y_test,y_pred))
    f1slist.append(f1_score(y_test,y_pred))
    mcclist.append(matthews_corrcoef(y_test,y_pred))

# np.mean replaces the bare mean(), which is never imported in this notebook
dtclist = [np.mean(acclist),np.mean(f1slist),np.mean(mcclist)]

In [None]:
# Display the decision-tree summary list built in the previous cell (accuracy, F1, MCC means)
dtclist

Random Forest Classification

In [None]:
# Random forest scored over 100 random 80/20 hold-out splits (seeds 0-99).
# The result is rfclist = [mean accuracy, mean F1, mean MCC].
acclist, f1slist, mcclist = [],[],[]
for i in range(0,100):
    # Splitting the dataset into training set and test set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=i)

    # Feature Scaling (fitted on the training part only)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    #Random Forest Classifier
    classifier = RandomForestClassifier(criterion='gini', random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Score the split directly with the sklearn metrics imported at the top;
    # the helper functions previously called here are not defined in this notebook.
    acclist.append(accuracy_score(y_test,y_pred))
    f1slist.append(f1_score(y_test,y_pred))
    mcclist.append(matthews_corrcoef(y_test,y_pred))

# np.mean replaces the bare mean(), which is never imported in this notebook
rfclist = [np.mean(acclist),np.mean(f1slist),np.mean(mcclist)]

In [None]:
# Display the random-forest summary list built in the previous cell (accuracy, F1, MCC means)
rfclist

XGBoost Classifier

In [None]:
# XGBoost scored over 100 random 80/20 hold-out splits (seeds 0-99).
# The result is xgbclist = [mean accuracy, mean F1, mean MCC].
acclist, f1slist, mcclist = [],[],[]
for i in range(0,100):
    # Splitting the dataset into training set and test set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=i)

    # Feature Scaling (fitted on the training part only)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    #XGBoost Classifier
    classifier = XGBClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Score the split directly with the sklearn metrics imported at the top;
    # the helper functions previously called here are not defined in this notebook.
    acclist.append(accuracy_score(y_test,y_pred))
    f1slist.append(f1_score(y_test,y_pred))
    mcclist.append(matthews_corrcoef(y_test,y_pred))

# np.mean replaces the bare mean(), which is never imported in this notebook
xgbclist = [np.mean(acclist),np.mean(f1slist),np.mean(mcclist)]

In [None]:
# Display the XGBoost summary list built in the previous cell (accuracy, F1, MCC means)
xgbclist

CatBoost Classifier

In [None]:
# CatBoost scored over 10 random 80/20 hold-out splits (seeds 0-9).
# NOTE(review): the other models use 100 splits; the smaller count here is
# presumably to limit training time. Confirm before comparing the numbers.
# The result is cbclist = [mean accuracy, mean F1, mean MCC].
acclist, f1slist, mcclist = [],[],[]
for i in range(0,10):
    # Splitting the dataset into training set and test set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=i)

    # Feature Scaling (fitted on the training part only)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    #CatBoost Classifier
    classifier = CatBoostClassifier()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Score the split directly with the sklearn metrics imported at the top;
    # the helper functions previously called here are not defined in this notebook.
    acclist.append(accuracy_score(y_test,y_pred))
    f1slist.append(f1_score(y_test,y_pred))
    mcclist.append(matthews_corrcoef(y_test,y_pred))

# np.mean replaces the bare mean(), which is never imported in this notebook
cbclist = [np.mean(acclist),np.mean(f1slist),np.mean(mcclist)]

In [None]:
# Display the CatBoost summary list built in the previous cell (accuracy, F1, MCC means)
cbclist

In [None]:
# Collect one value per model for each metric, in the same order as the labels in mylist2.
mylist2 = ["Logistic Regression", "KNearestNeighbours","SupportVector","DecisionTree","RandomForest", "XGBOOST","CATBOOST"]
model_results = [lrlist, knnlist, svclist, dtclist, rfclist, xgbclist, cbclist]
# Every per-model list is ordered [accuracy, F1, MCC], so F1 is index 1 and
# MCC is index 2 (the two were previously read from each other's position).
myacclist = [result[0] for result in model_results]
myf1slist = [result[1] for result in model_results]
mymcclist = [result[2] for result in model_results]

In [None]:
# Bar chart of the values in mymcclist, one bar per classifier, with each
# bar's height written just above it.
plt.rcParams['figure.figsize']=15,6
sns.set_style("darkgrid")
ax = sns.barplot(x=mylist2, y=mymcclist, palette = "rocket", saturation =1.5)
plt.xlabel("Classifier Models", fontsize = 20 )
plt.ylabel("MCC", fontsize = 20)
plt.title("MCC of different Classifier Models", fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    # Use dedicated names for the bar origin: unpacking into x/y would overwrite
    # the notebook-level feature matrix and target, so any model cell re-run
    # after this plot would train on two floats instead of the dataset.
    bar_x, bar_y = p.get_xy()
    ax.annotate(f'{height:.2}', (bar_x + width/2, bar_y + height*1.02), ha='center', fontsize = 'x-large')
plt.show()