In [40]:
import numpy as np
import pandas
import sklearn
import sklearn.linear_model
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.model_selection import train_test_split, KFold

# import data from csv: only the first 14 columns (indices 0-13) are read
dataframe = pandas.read_csv('heart.csv',usecols = range(0,14), engine='python')

# shuffle the rows with a fixed seed so the accuracies printed below can be
# reproduced after Restart & Run All (the KFold split further down does not
# shuffle by itself, so its folds depend on this row order).
# permutation() returns a shuffled copy along the first axis, which keeps this
# cell idempotent and never writes into the array backing `dataframe`.
SHUFFLE_SEED = 2017
dataset = np.random.RandomState(SHUFFLE_SEED).permutation(dataframe.values)

# divide data (X) and targets (Y): column 13 is the target, the rest are features
X = np.delete(dataset, obj=13, axis=1)
Y = dataset[:,13]

# normalize data (X): max-norm scaling applied per column (axis=0)
# NOTE(review): the scaling factors are computed on the full dataset, i.e.
# before any train/test split, so the test rows influence the preprocessing.
# Consider fitting the scaling on the training rows only - TODO confirm.
X = sklearn.preprocessing.normalize(X, norm='max', axis=0)


In [41]:
# method to calculate accuracy
def calcAccuracy(actual, predicted):
    """Return the fraction of positions at which `predicted` equals `actual`.

    Parameters:
        actual: sized sequence of true labels.
        predicted: sized sequence of predicted labels, same length as `actual`.

    Returns:
        float in [0.0, 1.0]: number of matching positions / len(actual).

    Raises:
        ValueError: if the two sequences have different lengths (a mismatch
            would otherwise silently ignore trailing predictions).
        ZeroDivisionError: if `actual` is empty.
    """
    total = len(actual)
    if total != len(predicted):
        raise ValueError(
            "actual and predicted must have the same length "
            "({0} != {1})".format(total, len(predicted)))

    # zip pairs elements by position, so this also works for containers whose
    # [] operator is label-based rather than positional
    correct = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)

    # float() keeps true division on Python 2
    return float(correct)/total

# build classifier
def buildClf(train_data, train_target, estimator=None):
    """Fit a classifier on the given training data and return it.

    Parameters:
        train_data: training samples, passed as the first argument to fit().
        train_target: training labels, passed as the second argument to fit().
        estimator: optional unfitted object exposing fit(data, target).
            When omitted, a new sklearn.linear_model.LogisticRegression()
            is created, which matches the previous hard-coded behaviour.
            Alternatives can be passed instead of editing this function,
            e.g. sklearn.tree.DecisionTreeClassifier() or
            sklearn.naive_bayes.GaussianNB() (their submodules must be
            imported by the caller).

    Returns:
        The same estimator object after fit() has been called on it.
    """
    if estimator is None:
        estimator = sklearn.linear_model.LogisticRegression()
    # fit() is called for its side effect; the estimator object itself is
    # returned regardless of what fit() returns
    estimator.fit(train_data, train_target)
    return estimator

In [42]:
# split into train and test - 70/30 (fixed random_state so the split itself is repeatable)
X_train, X_test, Y_train, Y_test = \
    train_test_split(X, Y, train_size = 0.7, random_state=2017)

# fit on the training portion, then score the predictions for the held-out portion
clf = buildClf(X_train, Y_train)
predicted = clf.predict(X_test)
accuracy = calcAccuracy(Y_test, predicted)

# parenthesised single-argument print: same output on Python 2, valid syntax on Python 3
print("Accuracy: {}%".format(accuracy*100))

Accuracy: 90.1234567901%


In [43]:
# k-fold cross validation: 10 folds taken in the current row order of X
kFold = KFold(n_splits=10)

# train on the training folds and score on the remaining fold, once per split
results = []
for train_index, test_index in kFold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # build classifier for each set
    clf = buildClf(X_train, Y_train)
    predicted = clf.predict(X_test)
    accuracy = calcAccuracy(Y_test, predicted)
    results.append(accuracy)

# report the mean of the per-fold accuracies
# (parenthesised print: same output on Python 2, valid syntax on Python 3)
print("Accuracy: {}%".format(np.mean(results)*100))

Accuracy: 85.5555555556%


In [44]:
# leave one out cross validation: each split holds out a single sample for testing
loo = sklearn.model_selection.LeaveOneOut()

# train on all other samples and score the held-out one, once per split
results = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # build classifier for each set
    clf = buildClf(X_train, Y_train)
    predicted = clf.predict(X_test)
    # with one test sample per split this is either 0.0 or 1.0
    accuracy = calcAccuracy(Y_test, predicted)
    results.append(accuracy)

# report the mean of the per-split accuracies
# (parenthesised print: same output on Python 2, valid syntax on Python 3)
print("Accuracy: {}%".format(np.mean(results)*100))

Accuracy: 85.1851851852%


In [228]:
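# visualize the decision tree
# (only applies when buildClf returns a sklearn.tree.DecisionTreeClassifier;
#  the snippet below also needs `from sklearn import tree` for `tree.export_graphviz`)
# from: http://scikit-learn.org/stable/modules/tree.html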

# import pydotplus 
# dot_data = tree.export_graphviz(clf, out_file=None) 
# graph = pydotplus.graph_from_dot_data(dot_data) 
# graph.write_pdf("decision-tree.pdf") 
