# Week 5: K-fold Cross-validation and the Bayes Classifier

## Lecture 13: K-fold Cross-validation

For the `scikit-learn` implementation details, see https://scikit-learn.org/stable/modules/cross_validation.html. We will frequently use the method in the future.

## Lecture 14: The Bayes Classifier

In [192]:
from keras.datasets import mnist
from sklearn import datasets
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

In [225]:
class BayesClassifier:
    def fit(self, X, Y):     
        # find the unique labels
        uniqueY = np.unique(Y)
        
        # find the dimensions
        n = X.shape[0]
        self.d = X.shape[1]
        self.k = uniqueY.shape[0]
        
        # initialize the outputs
        self.prior = np.zeros([self.k, 1])
        self.mu = np.zeros([self.k, self.d])
        self.Sigma = np.zeros([self.k, self.d, self.d])
        
        # compute class prior probabilities, sample means, and sample covariances
        for i, y in enumerate(uniqueY):
            print('Training for class', y)
            # split the X into its classes
            Xi = X[Y == y]
            
            # compute the size of each class
            ni = Xi.shape[0]
            
            # compute the priors
            self.prior[i] = ni / n
            
            # compute the sample mean
            self.mu[i] = np.mean(Xi, axis = 0)
            
            # compute the centered data
            XiBar = Xi - self.mu[i]
            
            # compute the sample covariance
            self.Sigma[i] = (1/ni) * XiBar.T @ XiBar
            
    def predict(self, X):
        n = X.shape[0]

        # compute Gaussian density
        coefficients = 1/(np.sqrt(2 * np.pi) ** self.d * np.sqrt(np.linalg.det(self.Sigma)))
        
        expTerm = np.zeros([n, self.k])
        
        print('Computing the exponential terms')
        for i in range(n):
            print('for i =', i)
            for j in range(self.k):
                expTerm[i, j] = np.exp(-(X[i] - self.mu[j]).T @ np.linalg.inv(self.Sigma[j]) @ (X[i] - self.mu[j])/2)
            
        # find values proportional to posterior probabilities
        posterior = coefficients * expTerm * self.prior.T
        
        # find the label for each datapoint
        predictions = np.argmax(posterior, axis = 1)
        
        return predictions

In [220]:
#(trainX, trainY), (testX, testY) = mnist.load_data()
iris = datasets.load_iris()

#print(iris.target_names)

# find the data and labels
X = iris.data
Y = iris.target

# split the data into train and test sets
trainX, testX, trainY, testY = train_test_split(X, Y, test_size = 0.25)

# build the Bayes classifier
model = BayesClassifier()

# fit the Bayes classifier to the training data
model.fit(trainX, trainY)

# predict the labels of the test set
predictedY = model.predict(testX)

print(predictedY)
print(testY)

Training for class 0
Training for class 1
Training for class 2
[0 0 0 0 1 2 0 2 0 1 2 0 1 0 0 1 0 1 1 2 1 2 2 0 1 2 1 0 2 2 0 0 2 0 2 0 2
 2]
[0 0 0 0 1 2 0 2 0 1 2 0 1 0 0 1 0 1 1 2 1 2 2 0 1 2 1 0 2 2 0 0 2 0 2 0 2
 2]


In [221]:
#(trainX, trainY), (testX, testY) = mnist.load_data()
breastcancer = datasets.load_breast_cancer()

#print(iris.target_names)

# find the data and labels
X = breastcancer.data
Y = breastcancer.target

# split the data into train and test sets
trainX, testX, trainY, testY = train_test_split(X, Y, test_size = 0.25)

# build the Bayes classifier
model = BayesClassifier()

# fit the Bayes classifier to the training data
model.fit(trainX, trainY)

# predict the labels of the test set
predictedY = model.predict(testX)

print(classification_report(testY, predictedY))

Training for class 0
Training for class 1
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        47
           1       0.98      0.98      0.98        96

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143



In [None]:
(trainX, trainY), (testX, testY) = mnist.load_data()

trainX = trainX.reshape(trainX.shape[0], trainX.shape[1] * trainX.shape[2]).astype('float')
testX = testX.reshape(testX.shape[0], testX.shape[1] * testX.shape[2]).astype('float')

trainNoise = np.random.uniform(-1, 1, size = trainX.shape)
#testNoise = np.random.uniform(-1, 1, size = testX.shape)

trainX += trainNoise
#testX += testNoise

# build the Bayes classifier
model = BayesClassifier()

# fit the Bayes classifier to the training data
model.fit(trainX, trainY)

# predict the labels of the test set
predictedY = model.predict(testX)

print(classification_report(testY, predictedY))

Training for class 0
Training for class 1
Training for class 2
Training for class 3
Training for class 4
Training for class 5
Training for class 6
Training for class 7
Training for class 8
Training for class 9




Computing the exponential terms
for i = 0
for i = 1
for i = 2
for i = 3
for i = 4
for i = 5
for i = 6
for i = 7
for i = 8
for i = 9
for i = 10
for i = 11
for i = 12
for i = 13
for i = 14
for i = 15
for i = 16
for i = 17
for i = 18
for i = 19
for i = 20
for i = 21
for i = 22
for i = 23
for i = 24
for i = 25
for i = 26
for i = 27
for i = 28
for i = 29
for i = 30
for i = 31
for i = 32
for i = 33
for i = 34
for i = 35
for i = 36
for i = 37
for i = 38
for i = 39
for i = 40
for i = 41
for i = 42
for i = 43
for i = 44
for i = 45
for i = 46
for i = 47
for i = 48
for i = 49
for i = 50
for i = 51
for i = 52
for i = 53
for i = 54
for i = 55
for i = 56
for i = 57
for i = 58
for i = 59
for i = 60
for i = 61
for i = 62
for i = 63
for i = 64
for i = 65
for i = 66
for i = 67
for i = 68
for i = 69
for i = 70
for i = 71
for i = 72
for i = 73
for i = 74
for i = 75
for i = 76
for i = 77
for i = 78
for i = 79
for i = 80
for i = 81
for i = 82
for i = 83
for i = 84
for i = 85
for i = 86
for i = 87
for i = 88