In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Bernoulli naive Bayes

Run the cell below to define the following variables:

`X` = Data matrix of shape $(n, d)$. All the features are binary taking values $0$ or $1$.

`y` = label vector. Labels are $0$ and $1$.

In [6]:
# Seeded generator, so the dataset below is the same on every run.
rng = np.random.default_rng(seed=1)

# Each feature is a column of 100 Bernoulli draws (binomial with n = 1): the
# first 50 rows use one success probability and the last 50 rows another.
# All draws consume the same random stream, so the statement order matters.
X1 = np.concatenate([rng.binomial(1, 0.7, 50), rng.binomial(1, 0.2, 50)])[:, np.newaxis]
X2 = np.concatenate([rng.binomial(1, 0.6, 50), rng.binomial(1, 0.1, 50)])[:, np.newaxis]
X3 = np.concatenate([rng.binomial(1, 0.6, 50), rng.binomial(1, 0.2, 50)])[:, np.newaxis]
X4 = np.concatenate([rng.binomial(1, 0.8, 50), rng.binomial(1, 0.1, 50)])[:, np.newaxis]

# Data matrix of shape (100, 4): one feature per column.
X = np.hstack([X1, X2, X3, X4])

# Label column of shape (100, 1): 50 zeros followed by 50 ones, lining up with
# the two halves of every feature column above.
y = np.repeat(np.array([0, 1]), 50)[:, np.newaxis]

# Shuffle the rows; X and y share one permutation so each example keeps its label.
permute = rng.permuted(range(100))
X, y = X[permute], y[permute]


## Question 1
If we train the naive Bayes model on the dataset, what will be the value of $\hat{p}$, the estimate for $P(Y=1)$?



In [3]:
import numpy as np

# Use the label vector `y` created by the data-generation cell. Assigning a
# made-up array to `y` here would replace the real 100-example labels, and the
# later cells that index `X` with a mask built from `y` would no longer line up.
labels = np.ravel(y)  # works for both an (n,) vector and an (n, 1) column

# P(Y=1) is estimated as the proportion of class 1 examples in the dataset
p_hat = np.mean(labels == 1)
print(f"P(Y=1) estimate: {p_hat}")
print(f"Answer: {p_hat:.2f}")

P(Y=1) estimate: 0.5
Answer: 0.50


## Question 2
What will be the value of $\hat{p}_0^0$, the estimate of $P(f_0=1|y=0)$?  Write your answer correct to two decimal places.



In [7]:
import numpy as np

# Relies on X (feature matrix) and y (label vector) defined in an earlier cell.

# Report the array dimensions before indexing into them
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Estimate of P(f_0=1|y=0): among the rows labelled 0, the fraction whose
# feature 0 equals 1.
class_0_mask = y.ravel() == 0
p_0_0 = (X[class_0_mask, 0] == 1).mean()
print(f"P(f_0=1|y=0) estimate: {p_0_0}")
print(f"Answer: {p_0_0:.2f}")

X shape: (100, 4)
y shape: (100, 1)
P(f_0=1|y=0) estimate: 0.68
Answer: 0.68


## Question 3
What will be the value of $\hat{p}_0^1$, the estimate of $P(f_0=1|y=1)$?  Write your answer correct to two decimal places.



In [8]:
import numpy as np

# Relies on X (feature matrix) and y (label vector) defined in an earlier cell.

# Estimate of P(f_0=1|y=1): among the rows labelled 1, the fraction whose
# feature 0 equals 1.
class_1_mask = y.ravel() == 1
p_0_1 = (X[class_1_mask, 0] == 1).mean()
print(f"P(f_0=1|y=1) estimate: {p_0_1}")
print(f"Answer: {p_0_1:.2f}")

P(f_0=1|y=1) estimate: 0.26
Answer: 0.26


## Question 4
What will be the value of $\hat{p}_3^1$, the estimate of $P(f_3=1|y=1)$?  Write your answer correct to two decimal places.




In [9]:
import numpy as np

# Relies on X (feature matrix) and y (label vector) defined in an earlier cell.

# Estimate of P(f_3=1|y=1): among the rows labelled 1, the fraction whose
# feature 3 (the last column) equals 1.
class_1_mask = y.ravel() == 1
p_3_1 = (X[class_1_mask, 3] == 1).mean()
print(f"P(f_3=1|y=1) estimate: {p_3_1}")
print(f"Answer: {p_3_1:.2f}")

P(f_3=1|y=1) estimate: 0.12
Answer: 0.12


## Question 5

What will be the predicted label for the point $[1, 0, 1, 0]$? 



In [10]:
import numpy as np

# Relies on X (binary feature matrix) and y (label vector) defined in an earlier cell.

# Row selectors for the two classes
class_0_mask = y.ravel() == 0
class_1_mask = y.ravel() == 1

# Priors: fraction of examples carrying each label
p_y_0 = class_0_mask.mean()
p_y_1 = class_1_mask.mean()

# Table of per-feature Bernoulli estimates, indexed [feature, class, value]:
# entry [j, c, v] holds the estimate of P(f_j = v | y = c).
n_features = X.shape[1]
p_feature_given_class = np.zeros((n_features, 2, 2))
for class_val, mask in enumerate((class_0_mask, class_1_mask)):
    # Column-wise fraction of ones among this class's rows (all features at once)
    p_one = (X[mask] == 1).mean(axis=0)
    p_feature_given_class[:, class_val, 1] = p_one
    p_feature_given_class[:, class_val, 0] = 1 - p_one

# Test point [1, 0, 1, 0]
test_point = [1, 0, 1, 0]

# Log of P(X|Y=0) * P(Y=0): start from the log prior and add the per-feature
# log likelihoods one at a time, in feature order.
log_prob_0 = sum(
    (np.log(p_feature_given_class[i, 0, feature_val])
     for i, feature_val in enumerate(test_point)),
    np.log(p_y_0),
)

# Log of P(X|Y=1) * P(Y=1), accumulated the same way
log_prob_1 = sum(
    (np.log(p_feature_given_class[i, 1, feature_val])
     for i, feature_val in enumerate(test_point)),
    np.log(p_y_1),
)

print(f"Log P(X|Y=0) * P(Y=0): {log_prob_0}")
print(f"Log P(X|Y=1) * P(Y=1): {log_prob_1}")

# Class 1 is predicted only when its score is strictly larger; a tie gives 0.
predicted_class = int(log_prob_1 > log_prob_0)
print(f"Predicted label for [1, 0, 1, 0]: {predicted_class}")

Log P(X|Y=0) * P(Y=0): -4.443867996418212
Log P(X|Y=1) * P(Y=1): -4.0336755178629495
Predicted label for [1, 0, 1, 0]: 1


## Question 6

What will be the predicted label for the point $[1, 0, 1, 1]$? 



In [11]:
import numpy as np

# Reuses p_y_0, p_y_1 and p_feature_given_class ([feature, class, value])
# computed in the previous question's cell.

# Test point [1, 0, 1, 1]
test_point = [1, 0, 1, 1]

# For each class, start from the log prior and add the log likelihood of every
# feature value of the test point, in feature order.
log_prob_0, log_prob_1 = [
    sum(
        (np.log(p_feature_given_class[i, class_val, feature_val])
         for i, feature_val in enumerate(test_point)),
        log_prior,
    )
    for class_val, log_prior in enumerate((np.log(p_y_0), np.log(p_y_1)))
]

print(f"Log P(X|Y=0) * P(Y=0): {log_prob_0}")
print(f"Log P(X|Y=1) * P(Y=1): {log_prob_1}")

# Class 1 is predicted only when its score is strictly larger; a tie gives 0.
predicted_class = int(log_prob_1 > log_prob_0)
print(f"Predicted label for [1, 0, 1, 1]: {predicted_class}")

Log P(X|Y=0) * P(Y=0): -2.7856399198146797
Log P(X|Y=1) * P(Y=1): -6.026105682553156
Predicted label for [1, 0, 1, 1]: 0


# Gaussian naive Bayes

Run the cell below to define the following variables:

`X_train` = Training dataset of shape $(n, d)$. All the examples are drawn from a multivariate Gaussian distribution.

`y_train` = Label vector for the corresponding training examples. Labels are $0$ and $1$.

`X_test` = Test dataset of shape $(m, d)$, where $m$ is the number of examples in the test dataset. All the examples are drawn from a multivariate Gaussian distribution.

`y_test` = Label vector for the corresponding test examples. Labels are $0$ and $1$.



In [2]:
from sklearn.datasets import make_classification, make_blobs
from sklearn.model_selection import train_test_split

# Generate 100 artificial 2-D points around two cluster centres, (5, 5) and
# (10, 10), with a standard deviation of 1.5; the seed is fixed so the data
# is reproducible.
blob_centers = [[5, 5], [10, 10]]
X, y = make_blobs(
    n_samples=100,
    n_features=2,
    centers=blob_centers,
    cluster_std=1.5,
    random_state=2,
)

# Hold out 20% of the examples as a test set, again with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

## Question 7

How many examples are there in the training dataset?



In [3]:
# The number of training examples is the number of rows of X_train.
num_train_examples = len(X_train)
print(f"Number of examples in training dataset: {num_train_examples}")

Number of examples in training dataset: 80


## Question 8
How many features are there in the dataset?



In [4]:
# X_train is a 2-D (examples, features) matrix, so its last axis counts the features.
num_features = X_train.shape[-1]
print(f"Number of features in the dataset: {num_features}")

Number of features in the dataset: 2


## Question 9

If we train the Gaussian naive Bayes model on the training dataset, what will be the value of $\hat{p}$, the estimate for $P(Y=1)$? Write your answer correct to two decimal places.





In [5]:
import numpy as np

# Relies on y_train from the train/test split cell.

# Estimate of P(Y=1): the fraction of training labels equal to 1.
p_y_1 = (y_train == 1).mean()
print(f"P(Y=1) estimate: {p_y_1}")
print(f"Answer: {p_y_1:.2f}")

P(Y=1) estimate: 0.4875
Answer: 0.49


## Question 10

If $\hat{\mu}_0 = [\mu_1, \mu_2, ..., \mu_d]$ is the estimate for $\mu_0$, the mean of the examples labeled $0$, what will be the value of $\mu_1+\mu_2+...+\mu_d$? Write your answer correct to two decimal places.



In [6]:
import numpy as np

# Relies on X_train and y_train from the train/test split cell.

# Per-feature mean over the training rows labelled 0, then the sum of its components
class_0_indices = y_train == 0
mu_0 = X_train[class_0_indices].mean(axis=0)
sum_mu_0 = mu_0.sum()

print(f"Mean for class 0: {mu_0}")
print(f"Sum of mean components (μ₁ + μ₂ + ... + μₐ): {sum_mu_0}")
print(f"Answer: {sum_mu_0:.2f}")

Mean for class 0: [4.55853975 5.01739665]
Sum of mean components (μ₁ + μ₂ + ... + μₐ): 9.575936394688135
Answer: 9.58


We will use a different covariance matrix for each class. The estimate for $\Sigma_k$ will be the diagonal matrix

$$\hat{\Sigma}_k = \text{diag}(\sigma_1, \sigma_2, \ldots, \sigma_d)$$ where $\sigma_i$ is the variance of the $i^{th}$ feature over the examples labeled $k$.



## Question 11
What will be the value of $\text{trace}({\hat{\Sigma}}_0)$?  Write your answer correct to two decimal places.







In [7]:
import numpy as np

# Relies on X_train (features) and y_train (labels) from the train/test split cell.

# Training rows that belong to class 0
class_0_indices = y_train == 0
X_class_0 = X_train[class_0_indices]

# Per-feature population variance (ddof defaults to 0). These are the diagonal
# entries of the class-0 covariance estimate, so their sum is its trace.
sigma_0 = X_class_0.var(axis=0)
trace_sigma_0 = sigma_0.sum()

print(f"Variances for class 0: {sigma_0}")
print(f"Trace of Σ̂₀: {trace_sigma_0}")
print(f"Answer: {trace_sigma_0:.2f}")

Variances for class 0: [2.13298417 2.30222002]
Trace of Σ̂₀: 4.435204194501572
Answer: 4.44


## Question 12

Once we have estimated all the parameters for Gaussian naive Bayes, assuming a different covariance matrix for each class, we predict the labels for the training examples. What will be the training accuracy?

Accuracy is defined as the proportion of correctly classified examples.  Write your answer correct to two decimal places.




In [8]:
# Enter your solution here
from scipy.stats import multivariate_normal

# Fit Gaussian naive Bayes with a separate diagonal covariance per class, then
# measure its accuracy on the training set.
# Relies on X_train and y_train from the train/test split cell.

# Boolean row selectors for each class
class_0_indices = (y_train == 0)
class_1_indices = (y_train == 1)

# Prior probabilities: fraction of training examples with each label
p_y_0 = np.mean(class_0_indices)
p_y_1 = np.mean(class_1_indices)

# Per-class feature means
mu_0 = np.mean(X_train[class_0_indices], axis=0)
mu_1 = np.mean(X_train[class_1_indices], axis=0)

# Per-class, per-feature population variances (ddof=0)
sigma_0 = np.var(X_train[class_0_indices], axis=0, ddof=0)
sigma_1 = np.var(X_train[class_1_indices], axis=0, ddof=0)

# Diagonal covariance matrices: features are treated as independent within a class
cov_0 = np.diag(sigma_0)
cov_1 = np.diag(sigma_1)

# Log joint score of every training example under each class. logpdf treats the
# last axis of its input as the feature axis, so one call per class scores the
# whole matrix; no Python-level loop over rows is needed.
log_prob_0 = np.log(p_y_0) + multivariate_normal.logpdf(X_train, mu_0, cov_0)
log_prob_1 = np.log(p_y_1) + multivariate_normal.logpdf(X_train, mu_1, cov_1)

# Predict class 1 only when its score is strictly larger (ties go to class 0).
# atleast_1d keeps a one-element array if only a single row was scored.
predictions = np.atleast_1d(np.where(log_prob_1 > log_prob_0, 1, 0))

# Flatten the labels before comparing: with an (n, 1) label column, `==` would
# broadcast to an (n, n) matrix and the mean would no longer be the accuracy.
training_accuracy = np.mean(predictions == np.ravel(y_train))

print(f"Training accuracy: {training_accuracy}")
print(f"Answer: {training_accuracy:.2f}")

Training accuracy: 0.9875
Answer: 0.99


## Question 13

What will be the test accuracy?

Accuracy is defined as the proportion of correctly classified examples.  




In [9]:
import numpy as np
from scipy.stats import multivariate_normal

# Uses the parameters estimated on the training set in an earlier cell:
# mu_0, mu_1, cov_0, cov_1, p_y_0, p_y_1. Nothing is re-fitted on the test data.

# Log joint score of every test example under each class. logpdf treats the
# last axis of its input as the feature axis, so one call per class scores the
# whole matrix; no Python-level loop over rows is needed.
log_prob_0 = np.log(p_y_0) + multivariate_normal.logpdf(X_test, mu_0, cov_0)
log_prob_1 = np.log(p_y_1) + multivariate_normal.logpdf(X_test, mu_1, cov_1)

# Predict class 1 only when its score is strictly larger (ties go to class 0).
# atleast_1d keeps a one-element array if only a single row was scored.
test_predictions = np.atleast_1d(np.where(log_prob_1 > log_prob_0, 1, 0))

# Flatten the labels before comparing: with an (m, 1) label column, `==` would
# broadcast to an (m, m) matrix and the mean would no longer be the accuracy.
test_accuracy = np.mean(test_predictions == np.ravel(y_test))

print(f"Test accuracy: {test_accuracy}")
print(f"Answer: {test_accuracy:.2f}")

Test accuracy: 1.0
Answer: 1.00
