In [None]:
import random
import numpy as np
from numpy import *
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import *

In [None]:
# Load the raw table from the CSV file.
data = np.genfromtxt('heart.csv', delimiter=',')

# Columns 5 and 13 hold the two labels (heartDz diagnosis, kidneyDz diagnosis);
# every other column is treated as a feature.
label_columns_indices = [col for col in range(data.shape[1]) if col in (5, 13)]
feature_columns_indices = [col for col in range(data.shape[1]) if col not in (5, 13)]

# Row 0 is skipped in both selections (presumably the CSV header row,
# which genfromtxt cannot parse as numbers -- confirm against the file).
feature_columns_data = data[1:, feature_columns_indices]
label_columns_data = data[1:, label_columns_indices]

# Collapse each (heartDz, kidneyDz) label pair into a single scalar class id
# (the format needed for prediction):
#   (1, 1) -> 1,  (1, 0) -> 2,  (0, 1) -> 3,  any other pair -> 4.
pair_to_class = {(1, 1): 1, (1, 0): 2, (0, 1): 3}
scalar_label_data = [
    pair_to_class.get((label_pair[0], label_pair[1]), 4)
    for label_pair in label_columns_data
]

        
# Set aside four specific rows and then remove them from the dataset.
# Per the original note these are one example of each class, kept for later
# use with k-fold (the row -> class mapping cannot be verified from this cell).
# NOTE(review): only the feature rows are kept; nothing visible in this
# notebook uses them afterwards -- confirm whether they are still needed.
held_out_rows = (0, 169, 1, 165)

(class_one_example_features,
 class_two_example_features,
 class_three_example_features,
 class_four_example_features) = (feature_columns_data[row] for row in held_out_rows)

# Drop the same rows from both the features and the labels so they stay aligned.
feature_columns_data = np.delete(feature_columns_data, held_out_rows, axis=0)
scalar_label_data = np.delete(scalar_label_data, held_out_rows, axis=0)

In [None]:
# Set up the model with 7-fold cross-validation.
random.seed(1)
num_splits = 7
k_fold = KFold(n_splits=num_splits)

# random_state pins the classifier's internal shuffling so runs are repeatable;
# random.seed() above only seeds Python's `random` module, which the
# estimator does not draw from.
model = SGDClassifier(loss="modified_huber", penalty="l1", max_iter=100000,
                      tol=1e-3, random_state=1)

# Running SUMS of the per-fold metrics; they are divided by num_splits
# only when printed at the end.
averageEqualWeightedScore = 0
averagePrecision = 0
averageRecall = 0
averageFOne = 0

# Train and validate the model once per fold.
for train, test in k_fold.split(feature_columns_data, scalar_label_data):

    # `train` and `test` are arrays of row indices. Index with them directly:
    # slicing train[0]:train[-1] would pull the held-out fold into the
    # training set for interior folds and always drop the last row.
    train_x = feature_columns_data[train]
    train_y = np.asarray(scalar_label_data)[train]
    test_x = feature_columns_data[test]
    test_y = np.asarray(scalar_label_data)[test]

    # Fit on this fold's training rows and predict its held-out rows.
    model.fit(train_x, train_y)
    y_pred = model.predict(test_x)

    # Per-fold metrics: mean accuracy plus macro-averaged precision/recall/F1
    # (macro = unweighted mean over the classes).
    equalWeightedScore = model.score(test_x, test_y)
    precision = precision_score(test_y, y_pred, average='macro')
    recall = recall_score(test_y, y_pred, average='macro')
    f_one = f1_score(test_y, y_pred, average='macro')

    # Accumulate for the across-fold averages.
    averageEqualWeightedScore += equalWeightedScore
    averagePrecision += precision
    averageRecall += recall
    averageFOne += f_one

# Display the metrics averaged over all folds.
print("averageEqualWeightedScore: ", averageEqualWeightedScore/num_splits)
print("averagePrecision: ",averagePrecision/num_splits)
print("averageRecall: " , averageRecall/num_splits)
print("averageFOne: " , averageFOne/num_splits)