## Creating a Test Harness

In [1]:
# 1. Function to load CSV
from csv import reader

def load_csv(filename):
    """Read a CSV file into a list of rows, skipping blank records.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty record. Each entry is the list
        of string fields produced by ``csv.reader``; no header handling or
        type conversion is performed.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires. Without it, "\r\n" sequences embedded in
    # quoted fields are rewritten by universal-newline translation.
    with open(filename, 'r', newline='') as file:
        # csv.reader yields [] for a blank line; drop those records.
        return [row for row in reader(file) if row]

In [2]:
# 2. Convert string columns to float
def convert_to_float(dataset, column):
    """Cast one column of every row to ``float``, modifying the rows in place.

    Args:
        dataset: Iterable of mutable rows (e.g. lists).
        column: Index of the field to convert in each row.

    Returns:
        None; the rows of ``dataset`` are updated directly.
    """
    for record in dataset:
        raw_value = record[column]
        record[column] = float(raw_value)

In [3]:
# 3. Split into train and test
from random import randrange

def train_test_split(dataset, split):
    """Randomly partition ``dataset`` into a training list and a test list.

    Rows are drawn without replacement with ``random.randrange``, so the
    outcome depends on the state of the ``random`` module's generator. The
    input sequence is not modified; the returned lists share its row objects.

    Args:
        dataset: Sequence of rows.
        split: Fraction of rows to place in the training set, between 0 and 1
            inclusive.

    Returns:
        A ``(train_set, test_set)`` tuple. The training set receives the
        smallest whole number of rows that is >= ``split * len(dataset)``;
        the test set holds the remaining rows in their original relative
        order.

    Raises:
        ValueError: If ``split`` is not between 0 and 1 inclusive.
    """
    # Reject out-of-range fractions up front. Previously a value above 1
    # emptied the pool and then failed inside randrange with an unrelated
    # "empty range" message, and a negative value silently produced an
    # empty training set.
    if not 0 <= split <= 1:
        raise ValueError('split must be between 0 and 1, got %r' % (split,))

    train_set = list()
    train_size = split * len(dataset)
    # Work on a shallow copy so the caller's list keeps all of its rows.
    test_set = list(dataset)

    while len(train_set) < train_size:
        rand_index = randrange(len(test_set))
        # Moving the row out of the pool guarantees it is never drawn twice.
        train_set.append(test_set.pop(rand_index))

    return train_set, test_set

In [16]:
# 4. Accuracy metric
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: Sequence of true values; its length sets how many positions
            are compared.
        predicted: Sequence of predicted values, indexed at the same
            positions as ``actual``.

    Returns:
        A float between 0.0 and 100.0.
    """
    total = len(actual)
    # Compare by index so only the first ``total`` predictions are examined.
    hits = sum(1 for idx in range(total) if actual[idx] == predicted[idx])
    return hits / float(total) * 100.0

In [14]:
# 5. Baseline prediction algorithm
def zero_rule_algo_classification(train, test):
    """Predict the most frequent training label for every test row.

    Args:
        train: Rows whose last element is the class label. Labels must be
            hashable.
        test: Rows to predict for; only the number of rows is used.

    Returns:
        A list of ``len(test)`` entries, each the most common label in
        ``train``. When several labels are equally common, the one that
        appears first in ``train`` is chosen.

    Raises:
        ValueError: If ``train`` is empty.
    """
    # Tally labels in one pass. A dict keeps insertion order, so ties are
    # resolved by first appearance instead of by set iteration order, which
    # can differ between runs for string labels.
    label_counts = {}
    for row in train:
        label = row[-1]
        label_counts[label] = label_counts.get(label, 0) + 1

    # max() returns the first key with the highest count and raises
    # ValueError when there are no labels at all.
    prediction = max(label_counts, key=label_counts.get)

    return [prediction] * len(test)

In [28]:
# 6. Evaluation function
def evaluate_model(dataset, algorithm, split, *args):
    """Score ``algorithm`` on a single random train/test split.

    Args:
        dataset: Rows whose last element is the value to predict.
        algorithm: Callable invoked as ``algorithm(train, test, *args)`` that
            returns one prediction per test row.
        split: Passed through to ``train_test_split``.
        *args: Extra positional arguments passed through to ``algorithm``.

    Returns:
        The value ``accuracy_metric`` computes from the held-out labels and
        the algorithm's predictions.
    """
    train, test = train_test_split(dataset, split)
    expected = [row[-1] for row in test]

    # Hand the algorithm copies of the test rows with the last field blanked
    # out, so it cannot read the answers and the originals stay untouched.
    masked_test = [list(row)[:-1] + [None] for row in test]

    predictions = algorithm(train, masked_test, *args)
    return accuracy_metric(expected, predictions)

In [35]:
from random import seed

# Fix the random module's state so the random train/test split is repeatable.
seed(1)

# Relative path: resolved against the notebook's working directory.
filename = 'data/pima-indians-diabetes.data.csv'
dataset = load_csv(filename)

# Preview the first five rows; load_csv returns every field as a string.
dataset[:5]

[['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1'],
 ['1', '85', '66', '29', '0', '26.6', '0.351', '31', '0'],
 ['8', '183', '64', '0', '0', '23.3', '0.672', '32', '1'],
 ['1', '89', '66', '23', '94', '28.1', '0.167', '21', '0'],
 ['0', '137', '40', '35', '168', '43.1', '2.288', '33', '1']]

In [36]:
# Convert every column to float in place, including the last column that the
# evaluation code reads as the label. The column count comes from the first row.
for col in range(len(dataset[0])):
    convert_to_float(dataset, col)

In [37]:
# Preview the first five rows again to confirm the values are now floats.
dataset[:5]

[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0],
 [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0],
 [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0],
 [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0],
 [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0]]

In [38]:
# Fraction of rows used for training; the remaining rows are held out for testing.
split = 0.7

# Evaluate the zero-rule baseline, which predicts the most common training label.
accuracy = evaluate_model(dataset, zero_rule_algo_classification, split)

print('Accuracy: %.3f%%' % accuracy)

Accuracy: 68.696%
