In [1]:
#import libraries
import sys
from os.path import dirname, join as pjoin
import random
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
import scipy.linalg as la
from statistics import mean 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
np.set_printoptions(threshold=1000)

In [2]:
#load heart data into arr
heart_data_dir= "heart.mat"
heart_data = sio.loadmat(heart_data_dir)
arr = heart_data['dat']

In [3]:
#load the labels describing presence or absence of disease
labels = heart_data['label']

In [4]:
def featureSubsetScore(arr, labels):   
    # instantiate the model
    logreg = LogisticRegression(solver="liblinear", C=0.303)
    trainScores = [0]*10
    testScores = [0]*10
    nTrain = 200
    nTest = 70
    #repeating 10 times inside for loop
    for i in range(0,10):
        #Resetting the default random number generator with a new random seed 
        random.seed(i+1)
        
        #initialize randomMatrix from original matrix
        randomArr = arr.copy()
        randomLabels = labels.copy();

        #randomize by swapping rows in random Matrix's randomArr and randomLabels
        randomIndices = list(range(0,270))
        random.shuffle(randomIndices)
        for j in range(0,270):            
            temp = randomArr[j]
            randomArr[j] = randomArr[randomIndices[j]]
            randomArr[randomIndices[j]] = temp
            
            temp = randomLabels[j]
            randomLabels[j] = randomLabels[randomIndices[j]]
            randomLabels[randomIndices[j]] = temp
        #First 200 for training    
        trainingArr = randomArr[range(0,nTrain),:]     
        trainingLabels = randomLabels[range(0,nTrain)]
        
        #remaining 70 for testing
        testingArr = randomArr[range(nTrain, nTrain+nTest),:]    
        testingLabels = randomLabels[range(nTrain, nTrain+nTest)].ravel()
        
        #Training using logistic regression
        logreg.fit(trainingArr,trainingLabels.ravel())
        
        #prediction on the training set
        y_pred_train=logreg.predict(trainingArr)
        #prediction on the test set
        y_pred_test=logreg.predict(testingArr)
        
        nTrainCorrect = 0        
        for j in range(0,nTrain):
            #if predicted value is equal to the actual value increase the count
            if y_pred_train[j] == trainingLabels[j]:
                nTrainCorrect= nTrainCorrect + 1
        #storing the prediction accuracy         
        trainScores[i] = nTrainCorrect/nTrain
        
        nTestCorrect = 0        
        for j in range(0,nTest):
            #if predicted value is equal to the actual value increase the count
            if y_pred_test[j] == testingLabels[j]:
                nTestCorrect = nTestCorrect +1
        #storing the prediction accuracy
        testScores[i] = nTestCorrect/nTest
    
    #finding mean of training scores for 10 passes
    trainScoresMean = mean(trainScores)
    #finding mean of testing scores for 10 passes
    testScoresMean = mean(testScores)
    return [trainScoresMean, testScoresMean]       

In [10]:
#no of selected features
bestK = 0
bestKScore = 0 
for k in range(1,8):
    scoreBest = 0
    selectedFeatures = []
    #list of feature indices
    allIndices = list(range(0,13))
    #1000 trials
    for i in range(0,1000):
        random.seed(i+1)
        #selecting 7 random feature indices 
        selectedIndices = random.sample(allIndices, k)
        #taking the features corresponding to selected feature indices
        trialFeatures = arr[:,selectedIndices]
        #calling the featureSubsetScore function with the selected features and getting the mean train and test score
        trainScore, testScore = featureSubsetScore(trialFeatures, labels)
        #compare the current mean test score with the best score and assign current mean test score to best score if greater 
        if testScore >scoreBest:
            scoreBest = testScore
            selectedFeatures = selectedIndices
    if bestK==0 or scoreBest > bestKScore:
        bestKScore = scoreBest
        bestK = k
    print('At k=', k, ' The best score is', scoreBest, ' The selected features are', selectedFeatures) 
    
print('\nThe best score is', bestKScore, ' at k=', bestK)
    
    
    

At k= 1  The best score is  0.76  The selected features are  [2]
At k= 2  The best score is  0.7771428571428571  The selected features are  [12, 8]
At k= 3  The best score is  0.8285714285714286  The selected features are  [12, 8, 11]
At k= 4  The best score is  0.8328571428571429  The selected features are  [12, 8, 11, 5]
At k= 5  The best score is  0.8314285714285714  The selected features are  [12, 0, 8, 5, 11]
At k= 6  The best score is  0.8514285714285714  The selected features are  [8, 1, 7, 2, 9, 11]
At k= 7  The best score is  0.8442857142857143  The selected features are  [8, 2, 9, 1, 10, 11, 7]
The best score is  0.8514285714285714  at k= 6
