In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import pylab
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the heart-failure clinical records CSV into a DataFrame; every later
# cell reads from `dataset`.
dataset = pd.read_csv(
    '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)

<h3><b>The ratio of Dead to Survived in the dependent column</b></h3>

In [None]:
# Count how many records fall into each DEATH_EVENT class. sort_index() orders
# the counts by class value, so every label below is matched to its own class
# instead of relying on value_counts()'s most-frequent-first ordering.
death_event_counts = dataset['DEATH_EVENT'].value_counts().sort_index()

# NOTE(review): assumes DEATH_EVENT is coded 0 = survived, 1 = died — confirm
# against the dataset description.
death_event_names = {0: 'Survival', 1: 'Death'}

death_events = death_event_counts.tolist()
death_events_labels = [death_event_names.get(value, str(value))
                       for value in death_event_counts.index]

# Share of each outcome, printed on the wedges as a percentage with two decimals.
plt.pie(x = death_events, labels = death_events_labels, autopct = '%.2f%%')
plt.title('Share of each outcome in DEATH_EVENT')
plt.show()

<h3><b>Correlation matrix</b></h3>

In [None]:
# Pairwise correlation between all columns of the dataset.
correlation = dataset.corr()

# Draw the matrix on an explicit Axes so that the ticks can be labelled with the
# column names and a colorbar can show the value scale; without them the heatmap
# cannot be read.
fig, ax = plt.subplots(figsize=(10, 8))
heatmap = ax.imshow(correlation, aspect='auto')
ax.set_xticks(range(len(correlation.columns)))
ax.set_xticklabels(correlation.columns, rotation=90)
ax.set_yticks(range(len(correlation.index)))
ax.set_yticklabels(correlation.index)
fig.colorbar(heatmap, ax=ax)
ax.set_title('Correlation matrix')

# Last expression of the cell: show the column names as the cell output.
dataset.columns

<h2><b>Apply learning</b></h2>
<h3><b>Split the data into training, test and validation set</b></h3>

In [None]:
# Select the target by name rather than by position, so the label can never
# silently become a neighbouring column. DEATH_EVENT is the dependent column
# examined earlier in the notebook.
# NOTE(review): in the published CSV the second-to-last column is `time`, not
# DEATH_EVENT — confirm with dataset.columns.
TARGET_COLUMN = 'DEATH_EVENT'
X = dataset.drop(columns=[TARGET_COLUMN]).values     # every column except the target
Y = dataset[TARGET_COLUMN].values

from sklearn.model_selection import train_test_split

# First hold out 20% of all rows as the test set. X and Y are rebound to the
# remaining 80%.
X,X_test,Y,Y_test = train_test_split(X, Y, test_size= 0.20, random_state = 4)
# Then hold out 12.5% of that remainder (10% of all rows) as the validation set,
# leaving 70% of all rows for training.
X_train,X_validate,Y_train,Y_validate = train_test_split(X, Y, test_size=0.125, random_state = 2)

model_results = []           #The best result of each model family is appended to this list

<h3><b>Logistic Regression</b></h3>

In [None]:
# Values of the C hyperparameter to try, and one score record per value
# (appended in the same order).
logistic_regression_C_values = [1, 0.1, 0.01, 0.001, 0.0001]
logistic_regression_results = []

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
import statistics

for c_value in logistic_regression_C_values:
    # Fit a fresh classifier on the training split for this C.
    logistic_regression_classifier = LogisticRegression(C=c_value, max_iter = 10000).fit(X_train, Y_train)

    # Predict the test split first, then the validation split, and score both.
    Y_pred_test, Y_pred_validate = (
        logistic_regression_classifier.predict(split) for split in (X_test, X_validate)
    )
    test_accuracy = accuracy_score(Y_test, Y_pred_test)
    validation_accuracy = accuracy_score(Y_validate, Y_pred_validate)

    # One record per C; 'total_accuracy' is the plain mean of the two accuracies.
    scores = {'test_accuracy': test_accuracy, 'validation_accuracy': validation_accuracy}
    scores['total_accuracy'] = statistics.mean([test_accuracy, validation_accuracy])
    logistic_regression_results.append(scores)

In [None]:
# Table of the logistic-regression sweep: one row per C value with the test,
# validation and averaged accuracy shown as percentages.
print('C','\t','Test accuracy','\t','Validation accuracy','\t','Total accuracy')
as_percent = "{:.2f}%".format
for c_value, scores in zip(logistic_regression_C_values, logistic_regression_results):
    print(c_value,'\t',
          as_percent(scores['test_accuracy']*100),'\t',
          as_percent(scores['validation_accuracy']*100),'\t\t',
          as_percent(scores['total_accuracy']*100),'\n')

In [None]:
#Adding the result with best accuracy to algorithm_results
# Record the best logistic-regression run in model_results. The best run is
# looked up by its averaged accuracy instead of a fixed list position, so the
# choice stays correct when the scores change between runs.
best_logistic_regression = max(logistic_regression_results,
                               key=lambda scores: scores['total_accuracy'])
model_results.append({'model_name':'Logistic regression',
                      'accuracy':best_logistic_regression['total_accuracy']
                      })

<h3><b>KNN</b></h3>

In [None]:
# Neighbour counts to try, and one score record per count (same order).
knn_neighbors = [8 , 9, 10, 11, 12, 13]
knn_results = []

from sklearn.neighbors import KNeighborsClassifier

for neighbor_count in knn_neighbors:
    # Fit a fresh classifier on the training split for this neighbour count.
    k_neighbor_classifier = KNeighborsClassifier(n_neighbors = neighbor_count, metric = 'minkowski', p = 1).fit(X_train, Y_train)

    # Predict the test split first, then the validation split, and score both.
    Y_pred_test, Y_pred_validate = (
        k_neighbor_classifier.predict(split) for split in (X_test, X_validate)
    )
    test_accuracy = accuracy_score(Y_test, Y_pred_test)
    validation_accuracy = accuracy_score(Y_validate, Y_pred_validate)

    # One record per neighbour count; 'total_accuracy' is the mean of the two accuracies.
    scores = {'test_accuracy': test_accuracy, 'validation_accuracy': validation_accuracy}
    scores['total_accuracy'] = statistics.mean([test_accuracy, validation_accuracy])
    knn_results.append(scores)

In [None]:
# Table of the KNN sweep: one row per neighbour count with the test, validation
# and averaged accuracy shown as percentages.
print('Neighbors','\t','Test accuracy','\t','Validation accuracy','\t','Total accuracy')
as_percent = "{:.2f}%".format
for neighbor_count, scores in zip(knn_neighbors, knn_results):
    print(neighbor_count,'\t\t',
          as_percent(scores['test_accuracy']*100),'\t\t',
          as_percent(scores['validation_accuracy']*100),'\t',
          as_percent(scores['total_accuracy']*100),'\n')

In [None]:
#Adding the result with best accuracy to algorithm_results
# Record the best KNN run in model_results. The best run is looked up by its
# averaged accuracy instead of a fixed list position, so the choice stays
# correct when the scores change between runs.
best_knn = max(knn_results, key=lambda scores: scores['total_accuracy'])
model_results.append({'model_name':'KNN',
                      'accuracy':best_knn['total_accuracy']
                      })

<h3><b>Linear SVM</b></h3>

In [None]:
# Values of the C hyperparameter to try, and one score record per value (same order).
svm_C_values = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
svm_results = []

from sklearn.svm import SVC
for c_value in svm_C_values:
    # Fit a fresh linear-kernel classifier on the training split for this C.
    svc_classifier = SVC(kernel = 'linear', C = c_value).fit(X_train, Y_train)

    # Predict the test split first, then the validation split, and score both.
    Y_pred_test, Y_pred_validate = (
        svc_classifier.predict(split) for split in (X_test, X_validate)
    )
    test_accuracy = accuracy_score(Y_test, Y_pred_test)
    validation_accuracy = accuracy_score(Y_validate, Y_pred_validate)

    # One record per C; 'total_accuracy' is the plain mean of the two accuracies.
    scores = {'test_accuracy': test_accuracy, 'validation_accuracy': validation_accuracy}
    scores['total_accuracy'] = statistics.mean([test_accuracy, validation_accuracy])
    svm_results.append(scores)

In [None]:
# Table of the linear-SVM sweep: one row per C value with the test, validation
# and averaged accuracy shown as percentages.
print('C','\t','Test accuracy','\t','Validation accuracy','\t','Total accuracy')
as_percent = "{:.2f}%".format
for c_value, scores in zip(svm_C_values, svm_results):
    print(c_value,'\t\t',
          as_percent(scores['test_accuracy']*100),'\t\t',
          as_percent(scores['validation_accuracy']*100),'\t',
          as_percent(scores['total_accuracy']*100),'\n')

In [None]:
#Adding the result with best accuracy to algorithm_results
# Record the best linear-SVM run in model_results. The best run is looked up by
# its averaged accuracy instead of a fixed list position, so the choice stays
# correct when the scores change between runs.
best_svm = max(svm_results, key=lambda scores: scores['total_accuracy'])
model_results.append({'model_name':'Linear SVM',
                      'accuracy':best_svm['total_accuracy']
                      })

<h3><b>Naive Bayes</b></h3>

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes is fitted once with its default settings; there is no
# hyperparameter sweep for this model.
naive_bayes_classifier = GaussianNB().fit(X_train, Y_train)

# Predict the test split first, then the validation split, and score both.
Y_pred_test, Y_pred_validate = (
    naive_bayes_classifier.predict(split) for split in (X_test, X_validate)
)
test_accuracy = accuracy_score(Y_test, Y_pred_test)
validation_accuracy = accuracy_score(Y_validate, Y_pred_validate)

# 'Total' accuracy is the plain mean of the test and validation accuracies.
total_accuracy = statistics.mean((test_accuracy, validation_accuracy))

In [None]:
# Report the Naive Bayes accuracies as percentages with two decimals. The
# validation accuracy is recomputed from the stored predictions.
naive_bayes_report = [
    ('Test set accuracy : ', test_accuracy),
    ('\nValidation set accuracy : ', accuracy_score(Y_validate, Y_pred_validate)),
    ('\nTotal accuracy : ', total_accuracy),
]
for label, score in naive_bayes_report:
    print(label, "{:.2f}%".format(100*score))

In [None]:
#Adding the result with best accuracy to algorithm_results
# Naive Bayes has a single run, so its averaged accuracy goes straight into model_results.
naive_bayes_summary = {'model_name':'Naive Bayes', 'accuracy':total_accuracy}
model_results.append(naive_bayes_summary)

<h3><b>Decision tree</b></h3>

In [None]:
# Maximum tree depths to try, and one score record per depth (same order).
decision_tree_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
decision_tree_results = []

from sklearn.tree import DecisionTreeClassifier
for depth in decision_tree_depth:
    # Fit a fresh tree on the training split, limited to this depth.
    decision_tree_classifier = DecisionTreeClassifier(criterion = 'gini', random_state = 0, max_depth = depth).fit(X_train, Y_train)

    # Predict the test split first, then the validation split, and score both.
    Y_pred_test, Y_pred_validate = (
        decision_tree_classifier.predict(split) for split in (X_test, X_validate)
    )
    test_accuracy = accuracy_score(Y_test, Y_pred_test)
    validation_accuracy = accuracy_score(Y_validate, Y_pred_validate)

    # One record per depth; 'total_accuracy' is the plain mean of the two accuracies.
    scores = {'test_accuracy': test_accuracy, 'validation_accuracy': validation_accuracy}
    scores['total_accuracy'] = statistics.mean([test_accuracy, validation_accuracy])
    decision_tree_results.append(scores)

In [None]:
# Table of the decision-tree sweep: one row per maximum depth with the test,
# validation and averaged accuracy shown as percentages.
print('Max depth','\t','Test accuracy','\t','Validation accuracy','\t','Total accuracy')
as_percent = "{:.2f}%".format
for depth, scores in zip(decision_tree_depth, decision_tree_results):
    print(depth,'\t\t',
          as_percent(scores['test_accuracy']*100),'\t\t',
          as_percent(scores['validation_accuracy']*100),'\t',
          as_percent(scores['total_accuracy']*100),'\n')

In [None]:
#Adding the result with best accuracy to algorithm_results
# Record the best decision-tree run in model_results. The best run is looked up
# by its averaged accuracy instead of a fixed list position, so the choice
# stays correct when the scores change between runs.
best_decision_tree = max(decision_tree_results,
                         key=lambda scores: scores['total_accuracy'])
model_results.append({'model_name':'Decision tree',
                      'accuracy':best_decision_tree['total_accuracy']
                      })

<h3><b>Random forest</b></h3>

In [None]:
# Numbers of trees to try, and one score record per setting (same order).
random_forest_estimators = [5, 7, 10, 12, 15, 18, 20, 22]
random_forest_results = []

from sklearn.ensemble import RandomForestClassifier

for estimator_count in random_forest_estimators:
    # Fit a fresh forest on the training split with this many trees.
    random_forest_classifier = RandomForestClassifier(n_estimators = estimator_count, criterion = 'entropy', random_state = 5, max_depth = 4).fit(X_train, Y_train)

    # Predict the test split first, then the validation split, and score both.
    Y_pred_test, Y_pred_validate = (
        random_forest_classifier.predict(split) for split in (X_test, X_validate)
    )
    test_accuracy = accuracy_score(Y_test, Y_pred_test)
    validation_accuracy = accuracy_score(Y_validate, Y_pred_validate)

    # One record per tree count; 'total_accuracy' is the plain mean of the two accuracies.
    scores = {'test_accuracy': test_accuracy, 'validation_accuracy': validation_accuracy}
    scores['total_accuracy'] = statistics.mean([test_accuracy, validation_accuracy])
    random_forest_results.append(scores)

In [None]:
# Table of the random-forest sweep: one row per number of estimators with the
# test, validation and averaged accuracy. All three columns are formatted as
# percentages with a trailing '%', matching the other result tables.
print('Estimators','\t','Test accuracy','\t','Validation accuracy','\t','Total accuracy')
for i in range(len(random_forest_estimators)) :
    print(random_forest_estimators[i],'\t\t',
          "{:.2f}%".format(random_forest_results[i]['test_accuracy']*100),'\t\t',
          "{:.2f}%".format(random_forest_results[i]['validation_accuracy']*100),'\t\t',
          "{:.2f}%".format(random_forest_results[i]['total_accuracy']*100),'\n')

In [None]:
#Adding the result with best accuracy to algorithm_results
# Record the best random-forest run in model_results. The best run is looked up
# by its averaged accuracy instead of a fixed list position, so the choice
# stays correct when the scores change between runs.
best_random_forest = max(random_forest_results,
                         key=lambda scores: scores['total_accuracy'])
model_results.append({'model_name':'Random forest',
                      'accuracy':best_random_forest['total_accuracy']
                      })

<h3><b>Accuracy of each model</b></h3>

In [None]:
# Final summary: one row per model, with the name left-aligned in a
# 25-character column followed by its recorded accuracy as a percentage.
name_column = "{:<25}".format
print(name_column('Model'),'Accuracy')
for entry in model_results:
    print(name_column(entry['model_name']),
          "{:.2f}%".format(entry['accuracy']*100))