In [1]:
import numpy as np
import pandas
import sklearn
import sklearn.preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# import data from csv (columns 0-13: 13 feature columns followed by the target)
dataframe = pandas.read_csv('heart.csv',usecols = range(0,14), engine='python')
dataset = dataframe.values

# shuffle dataset rows in place.
# Seed the global NumPy RNG first so the row order -- and therefore the
# contents of every cross-validation fold -- is identical on each
# "Restart & Run All". 2017 matches the random_state used for train_test_split.
np.random.seed(2017)
np.random.shuffle(dataset)

# divide data (X) and targets (Y): column 13 is the target, the rest are features
X = np.delete(dataset, obj=13, axis=1)
Y = dataset[:,13]

# normalize data (X): rescale each column (axis=0) using the 'max' norm
# NOTE(review): the scaling is computed over the full dataset, i.e. before any
# train/test split is made by the validation helpers -- confirm this is intended.
X = sklearn.preprocessing.normalize(X, norm='max', axis=0)


In [2]:
# method to calculate accuracy
def calcAccuracy(actual, predicted):
    """Return the fraction of positions at which `predicted` equals `actual`.

    Args:
        actual: sequence of true labels.
        predicted: sequence of predicted labels, same length as `actual`.

    Returns:
        float: number of matching positions divided by the number of labels.

    Raises:
        ValueError: if the two sequences differ in length, or are empty
            (accuracy is undefined for zero samples).
    """
    total = len(actual)
    if total != len(predicted):
        # A silent truncation here would report a misleading score.
        raise ValueError(
            "actual and predicted must have the same length "
            "({0} != {1})".format(total, len(predicted)))
    if total == 0:
        raise ValueError("cannot compute accuracy of zero samples")

    correct = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    # float() keeps true division under Python 2 as well.
    return float(correct)/total


# method to test train/test split by percentage
def splitByPercentageTest(train_size=0.7):
    """Score a classifier on a single hold-out split of the module-level X / Y.

    Args:
        train_size: passed straight to train_test_split as `train_size`.

    Returns:
        The accuracy (as computed by calcAccuracy) on the held-out portion.
    """
    # fixed random_state: the same partition is produced on every call
    parts = train_test_split(X, Y, train_size = train_size, random_state=2017)
    train_X, holdout_X, train_Y, holdout_Y = parts

    # fit on the training portion, score on the held-out portion
    model = buildClf(train_X, train_Y)
    return calcAccuracy(holdout_Y, model.predict(holdout_X))


# method for k-fold cross validation
def kFoldValidation(n_splits=10):
    """Return the mean accuracy over a KFold split of the module-level X / Y.

    Args:
        n_splits: number of folds handed to KFold.

    Returns:
        Mean of the per-fold accuracies.
    """
    return _crossValidate(KFold(n_splits=n_splits))


# method for leave one out cross validation
def looValidation():
    """Return the mean accuracy over a LeaveOneOut split of the module-level X / Y.

    Returns:
        Mean of the per-split accuracies.
    """
    return _crossValidate(sklearn.model_selection.LeaveOneOut())


# shared evaluation loop used by kFoldValidation and looValidation
def _crossValidate(splitter):
    """Fit and score a fresh classifier on every split produced by `splitter`.

    Args:
        splitter: object whose `split(X)` yields (train_index, test_index)
            pairs, e.g. a KFold or LeaveOneOut instance.

    Returns:
        Mean of the accuracies obtained on each test partition.
    """
    results = []
    for train_index, test_index in splitter.split(X):
        # build classifier for each set, using only that split's training rows
        clf = buildClf(X[train_index], Y[train_index])
        predicted = clf.predict(X[test_index])
        results.append(calcAccuracy(Y[test_index], predicted))

    return np.mean(results)
    

In [3]:
# method to build classifier
def buildClf(train_data, train_target, clf_class=None):
    """Create a classifier and fit it on the given training data.

    Args:
        train_data: training samples, passed to the classifier's `fit`.
        train_target: training labels, passed to the classifier's `fit`.
        clf_class: optional zero-argument callable returning an unfitted
            classifier (e.g. DecisionTreeClassifier, GaussianNB,
            LogisticRegression, SVC, RandomForestClassifier). When omitted,
            KNeighborsClassifier is used, so existing two-argument calls
            behave as before.

    Returns:
        The fitted classifier instance.
    """
    if clf_class is None:
        # resolved at call time so the default keeps matching earlier behavior
        clf_class = KNeighborsClassifier
    clf = clf_class()
    clf.fit(train_data, train_target)
    return clf

In [4]:
# Report the accuracy of each evaluation strategy as a percentage.
# The parenthesized single-argument form runs under both Python 2 and
# Python 3; the bare print statement is a SyntaxError on Python 3.
print("Split by % Accuracy: {}%".format(splitByPercentageTest(0.7)*100))
print("K-Fold Accuracy: {}%".format(kFoldValidation(10)*100))
print("Leave-One-Out Accuracy: {}%".format(looValidation()*100))

Split by % Accuracy: 66.6666666667%
K-Fold Accuracy: 72.5925925926%
Leave-One-Out Accuracy: 73.7037037037%


In [None]:
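# visualize the decision tree
# (the snippet below needs `from sklearn import tree` and a fitted
#  DecisionTreeClassifier bound to a notebook-level `clf`; neither exists as written)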
# from: http://scikit-learn.org/stable/modules/tree.html

# import pydotplus 
# dot_data = tree.export_graphviz(clf, out_file=None) 
# graph = pydotplus.graph_from_dot_data(dot_data) 
# graph.write_pdf("decision-tree.pdf") 
