### Random Forest and Logistic Regression Prediction Accuracies Comparison
We are going to predict diagnostics of cardiac Single Proton Emission Computed Tomography (SPECT) images by splitting the dataset into traingin and test set's. Each of the patients is classified into two categories: normal(specified as 0 in the dataset's first column) and abnormal(specified as 1). Properties are described here https://archive.ics.uci.edu/ml/datasets/SPECTF+Heart

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import random
from decision_tree import DecisionTree
from random_forest import RandomForest
from logistic_regression import gradient_descent
from logistic_regression import sigmoid

#### loading the dataset

In [3]:
filename = 'SPECTF.dat'
data = np.loadtxt(filename, delimiter=',')

#### printing a sample from the dataset (first and last rows)

In [8]:
print(data[[1, -1]])
print(data.shape)

[[  1.  72.  62.  69.  67.  78.  82.  74.  65.  69.  63.  70.  70.  72.
   74.  70.  71.  72.  75.  66.  65.  73.  78.  74.  79.  74.  69.  69.
   70.  71.  69.  72.  70.  62.  65.  65.  71.  63.  60.  69.  73.  67.
   71.  56.  58.]
 [  0.  64.  66.  68.  71.  62.  64.  74.  73.  63.  67.  66.  74.  70.
   74.  59.  64.  75.  73.  70.  66.  79.  81.  79.  78.  61.  62.  76.
   72.  67.  67.  71.  75.  65.  62.  70.  69.  68.  65.  75.  72.  62.
   64.  57.  54.]]
(267, 45)


#### the below function computes the accuracy of evaluation

In [5]:
def accuracy_score(Y_true, Y_predict):
    accuracy = 0
    for i in range(len(Y_true)):
        if Y_true[i] == Y_predict[i]:
            accuracy = accuracy + 1
    return accuracy / len(Y_true) * 100.0

#### the main function to evaluate performance

In [7]:
def evaluate_performance():
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n, d = X.shape
    all_accuracies_forest = []
    all_accuracies_log_reg = []
    NUMBER_OF_TREES = 10
    MAX_TREE_DEPTH_RF = 100
    MAX_TREE_DEPTH_TR = 100
    RATIO_PER_TREE = 0.75

    for trial in range(50):
        print('trial', trial + 1)
        idx = np.arange(n)
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # divide data into train-test
        folds, i = 10, 3
        Xtest = X[i::folds]  # i::folds = i-th element, (i+folds)th element, ...
        ytest = y[i::folds]
        Xtrain = np.array([X[j] for j in range(len(X)) if (j % folds) != i])
        ytrain = np.array([y[j] for j in range(len(y)) if (j % folds) != i])
        train_data = np.column_stack((Xtrain, ytrain))

        # train the random forest
        classifier_forest = RandomForest(NUMBER_OF_TREES, MAX_TREE_DEPTH_RF, RATIO_PER_TREE)
        classifier_forest.fit(train_data)
        y_pred_forest = classifier_forest.predict(Xtest)
        accuracy_forest = accuracy_score(ytest, np.array(y_pred_forest))
        all_accuracies_forest.append(accuracy_forest)

        # train by logistic regression
        beta = gradient_descent(np.column_stack((np.ones(len(Xtrain)),Xtrain)), ytrain, max_steps=5)
        y_pred_log_reg = []
        for i in range(Xtest.shape[0]):
            sigm = sigmoid((np.column_stack((np.ones(len(Xtest)), Xtest))).dot(beta))
            if sigm[i] >= 0.5:
                y_pred_log_reg.append(1)
            else:
                y_pred_log_reg.append(0)
        accuracy_log_reg = accuracy_score(ytest, np.array(y_pred_log_reg))
        all_accuracies_log_reg.append(accuracy_log_reg)

    meanRandomForestAccuracy = np.mean(all_accuracies_forest)
    stddevRandomForestAccuracy = np.std(all_accuracies_forest)

    meanLogisticRegressionAccuracy = np.mean(all_accuracies_log_reg)
    stddevLogisticRegressionAccuracy = np.std(all_accuracies_log_reg)

    stats = np.zeros(2)
    stats[0] = meanRandomForestAccuracy
    stats[1] = meanLogisticRegressionAccuracy
    return stats

if __name__ == "__main__":
    stats = evaluate_performance()
    print("Random Forest Accuracy = ", stats[0])
    print("Logistic Reg. Accuracy = ", stats[1])

trial 1
trial 2
trial 3
trial 4
trial 5
trial 6
trial 7
trial 8
trial 9
trial 10
trial 11
trial 12
trial 13
trial 14
trial 15
trial 16
trial 17
trial 18
trial 19
trial 20
trial 21
trial 22
trial 23
trial 24
trial 25
trial 26
trial 27
trial 28
trial 29
trial 30
trial 31
trial 32
trial 33
trial 34
trial 35
trial 36
trial 37
trial 38
trial 39
trial 40
trial 41
trial 42
trial 43
trial 44
trial 45
trial 46
trial 47
trial 48
trial 49
trial 50
Random Forest Accuracy =  36.4444444444
Logistic Reg. Accuracy =  79.2592592593
