1. Read the Bernoulli Mixture Model Derivation.
2. Read about Stochastic Expectation-Maximization (EM) Algorithm: https://www.sciencedirect.com/science/article/pii/S0167947320302504.
3. From the given code, modify the EM algorithm to become a Stochastic EM Algorithm.
4. Use the data from the paper: https://www.sciencedirect.com/science/article/abs/pii/S0031320322001753
5. Perform categorical clustering using the Bernoulli Mixture Model with Stochastic EM Algorithm.
6. Compare its performance with K-Modes Algorithm using Folkes-Mallows Index, Adjusted Rand Index, and Normalized Mutual Information Score.
7. Compare and contrast the performances, and explain what is happening (i.e. why is FMI always higher than ARI and NMI? Why is ARI and NMI low compared to FMI? etc.)
8. Write the report in Latex, push to your github with the codes.

In [2]:
from ucimlrepo import fetch_ucirepo 
soybean = fetch_ucirepo(id=91) 
zoo = fetch_ucirepo(id=111) 
heart_disease = fetch_ucirepo(id=45) 
dermatology = fetch_ucirepo(id=33)
breast_cancer = fetch_ucirepo(id=15)
mushroom = fetch_ucirepo(id=73) 

In [3]:
import pandas as pd
X = soybean.data.features
y = soybean.data.targets 
soybean_df = pd.merge(X, y, left_index=True, right_index=True)

X = zoo.data.features
y = zoo.data.targets 
zoo_df = pd.merge(X, y, left_index=True, right_index=True)

X = heart_disease.data.features
y = heart_disease.data.targets 
heart_disease_df = pd.merge(X, y, left_index=True, right_index=True)

X = dermatology.data.features
y = dermatology.data.targets 
dermatology_df = pd.merge(X, y, left_index=True, right_index=True)

X = breast_cancer.data.features
y = breast_cancer.data.targets 
breast_cancer_df = pd.merge(X, y, left_index=True, right_index=True)

X = mushroom.data.features
y = mushroom.data.targets 
mushroom_df = pd.merge(X, y, left_index=True, right_index=True)


soybean_df = soybean_df.dropna()
zoo_df = zoo_df.dropna()
heart_disease_df = heart_disease_df.dropna()
dermatology_df = dermatology_df.dropna()
breast_cancer_df = breast_cancer_df.dropna()
mushroom_df = mushroom_df.dropna()

In [4]:
import numpy as np

class BernoulliModel:
    def __init__(self, n_clusters, max_iter=100, tol=1e-4):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol

    def _initialize_parameters(self, X):
        n_samples, n_features = X.shape
        self.cluster_probs = np.ones(self.n_clusters) / self.n_clusters
        self.cluster_centers = np.random.rand(self.n_clusters, n_features)

    def _e_step(self, X):
        log_likelihood = np.zeros((len(X), self.n_clusters))

        for k in range(self.n_clusters):
            log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)

        log_likelihood += np.log(self.cluster_probs)

        # Normalize to prevent underflow
        log_likelihood -= np.max(log_likelihood, axis=1)[:, np.newaxis]
        likelihood = np.exp(log_likelihood)
        self.responsibilities = likelihood / np.sum(likelihood, axis=1)[:, np.newaxis]

    def _m_step(self, X):
        for k in range(self.n_clusters):
            self.cluster_centers[k] = np.sum(X * self.responsibilities[:, k, np.newaxis], axis=0) / np.sum(self.responsibilities[:, k])

        self.cluster_probs = np.mean(self.responsibilities, axis=0)

    def fit(self, X):
        self._initialize_parameters(X)

        for _ in range(self.max_iter):
            old_centers = np.copy(self.cluster_centers)

            self._e_step(X)
            self._m_step(X)

            # Check for convergence
            if np.linalg.norm(self.cluster_centers - old_centers) < self.tol:
                break

    def predict(self, X):
        log_likelihood = np.zeros((len(X), self.n_clusters))

        for k in range(self.n_clusters):
            log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)

        log_likelihood += np.log(self.cluster_probs)

        return np.argmax(log_likelihood, axis=1)


In [9]:
# Example usage
# Assuming you have your data stored in X, where each row represents a sample and each column represents a binary feature.

# Initialize the model
n_clusters = 3  # Number of clusters
model = BernoulliModel(n_clusters)

# Fit the model to the data
model.fit(zoo_df)

# Predict cluster labels for the data
labels = model.predict(zoo_df)

  log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)
  log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)
  log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)
  log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)
  log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)
  log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)
  log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)
  log_likelihood[:, k] = np.sum(X * np.log(self.cluster_centers[k]) + (1 - X) * np.log(1 - self.cluster_centers[k]), axis=1)
