This notebook is divided into two parts:
1. Model fitting dry run
2. Testing model on random data
3. Loading Real Data
4. Making the model on the real data
5. Testing the model on the real data
6. Compresssing the model to be used in the app

In [3]:
import numpy as np

class GMM:
    def __init__(self, n_components, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, X):
        self._initialize_parameters(X)

        for iteration in range(self.max_iter):
            prev_log_likelihood = self._compute_log_likelihood(X)
            
            # E-step: compute responsibilities
            self.responsibilities = self._e_step(X)
            
            # M-step: update parameters
            self._m_step(X)

            # Check for convergence
            log_likelihood = self._compute_log_likelihood(X)
            if np.abs(log_likelihood - prev_log_likelihood) < self.tol:
                break

    def predict(self, X):
        probabilities = np.zeros((X.shape[0], self.n_components))
        for k in range(self.n_components):
            probabilities[:, k] = self.weights[k] * self._gaussian(X, self.means[k], self.covariances[k])
        cluster_labels = np.argmax(probabilities, axis=1)

        # Create dictionary of clusters
        clusters = {label: X[cluster_labels == label] for label in range(self.n_components)}
        return clusters

    def _initialize_parameters(self, X):
        n_samples, n_features = X.shape
        self.means = X[np.random.choice(n_samples, self.n_components, replace=False)]
        self.covariances = np.array([np.eye(n_features) for _ in range(self.n_components)])
        self.weights = np.full(self.n_components, 1 / self.n_components)

    def _e_step(self, X):
        responsibilities = np.zeros((X.shape[0], self.n_components))
        for k in range(self.n_components):
            responsibilities[:, k] = self.weights[k] * self._gaussian(X, self.means[k], self.covariances[k])
        responsibilities /= responsibilities.sum(axis=1, keepdims=True)
        return responsibilities

    def _m_step(self, X):
        n_samples = X.shape[0]
        for k in range(self.n_components):
            responsibility = self.responsibilities[:, k]
            total_responsibility = responsibility.sum()
            self.means[k] = (responsibility[:, np.newaxis] * X).sum(axis=0) / total_responsibility
            self.covariances[k] = (
                (responsibility[:, np.newaxis, np.newaxis] * (X - self.means[k])[:, :, np.newaxis] * (X - self.means[k])[:, np.newaxis, :]).sum(axis=0)
                / total_responsibility
            )
            self.weights[k] = total_responsibility / n_samples

    def _compute_log_likelihood(self, X):
        likelihood = np.zeros(X.shape[0])
        for k in range(self.n_components):
            likelihood += self.weights[k] * self._gaussian(X, self.means[k], self.covariances[k])
        return np.sum(np.log(likelihood))

    @staticmethod
    def _gaussian(X, mean, covariance):
        n_features = X.shape[1]
        determinant = np.linalg.det(covariance)
        inverse = np.linalg.inv(covariance)
        norm_factor = (2 * np.pi) ** (-n_features / 2) * determinant ** -0.5
        diff = X - mean
        return norm_factor * np.exp(-0.5 * np.sum(diff @ inverse * diff, axis=1))



In [17]:
X = np.array([
    [1.0, 2.0],
    [1.5, 1.8],
    [5.0, 8.0],
    [6.0, 8.5],
    [9.0, 11.0],
    [8.5, 10.5],
    [2.0, 2.5],  # Adding points with more variance in both dimensions
    [7.5, 7.0],  # to avoid covariance matrix becoming singular
    [3.5, 5.5],
    [4.5, 3.0]
])

model = GMM(n_components=2)
model.fit(X)
clusters = model.predict(X)

print(clusters)

{0: array([[1. , 2. ],
       [1.5, 1.8],
       [5. , 8. ],
       [6. , 8.5],
       [2. , 2.5],
       [3.5, 5.5],
       [4.5, 3. ]]), 1: array([[ 9. , 11. ],
       [ 8.5, 10.5],
       [ 7.5,  7. ]])}


Now Begins the text preprocessing

In [28]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # Choose a suitable model
text_paragraphs = [
    "Why do we need softmax?",
    "Explain the Gaussian Mixture Model algorithm.",
    "What are supervised learning techniques?",
    "How does rule-based AI work?",
    "what are netwrok sockets?",
    "what is a web server?",
    "How do we know if an index for a database is useful?",
    "What is the difference between a primary key and a foreign key?",
    "What is a surrogate key?",
    "Who discovered vector spaces?",
]
embeddings = model.encode(text_paragraphs)  # Shape: (n_samples, embedding_dim)




In [29]:
embeddings[0].shape

(384,)

In [33]:
import numpy as np

class NumPyArrayHashTable:
    def __init__(self, size):
        self.size = size
        self.table = [None] * size

    def _hash(self, key):
        # Implement a suitable hash function for NumPy arrays
        # This example uses a simple hash based on the array's data
        return hash(key.tobytes()) % self.size

    def insert(self, key, value):
        index = self._hash(key)
        if self.table[index] is None:
            self.table[index] = [(key, value)]
        else:
            self.table[index].append((key, value))

    def get(self, key):
        index = self._hash(key)
        if self.table[index]:
            for k, v in self.table[index]:
                if np.array_equal(k, key):
                    return v
        return None
    
    def __repr__(self):
        return f"NumPyArrayHashTable(size={self.size}, table={self.table})"

# Example usage
ht = NumPyArrayHashTable(10)
key1 = np.array([1, 2, 3])
key2 = np.array([4, 5, 6])
ht.insert(key1, "value1")
ht.insert(key2, "value2")

print(ht.get(key1))  # Output: "value1"
print(ht.get(key2))  # Output: "value2"

value1
value2


In [34]:
# Create a dictionary mapping embeddings to their original sentences
# Initialize hash table with size equal to number of embeddings
hash_table = NumPyArrayHashTable(len(embeddings))

# Insert each embedding-text pair into the hash table
for embedding, text in zip(embeddings, text_paragraphs):
    hash_table.insert(embedding, text)

embedding_to_sentence = hash_table



embedding_to_sentence

NumPyArrayHashTable(size=10, table=[[(array([-1.04882866e-01,  1.65973436e-02, -3.94565426e-02,  1.47038568e-02,
       -1.28905056e-02,  1.31034888e-02,  2.11889930e-02,  5.67062609e-02,
        2.62233578e-02,  1.52511559e-02,  4.15830947e-02,  2.83600837e-02,
       -4.79813367e-02, -3.00951265e-02, -8.31015483e-02, -6.87654037e-03,
       -1.29936319e-02, -7.47617632e-02,  3.92186344e-02, -8.05413052e-02,
       -2.71138903e-02,  1.51843973e-03, -9.19998810e-03, -7.78320879e-02,
        2.23824680e-02, -4.49786196e-04,  6.82400614e-02,  6.38134331e-02,
        4.75796917e-03,  3.89225371e-02, -3.73880081e-02, -5.99806644e-02,
       -8.08196813e-02,  7.16025755e-02, -5.24824448e-02,  3.11312154e-02,
       -2.75163376e-03,  4.15021777e-02, -6.42911643e-02, -2.24029906e-02,
        2.39436328e-02, -8.62185732e-02,  1.77829489e-02,  2.94017345e-02,
       -1.28814913e-02,  2.96536814e-02, -3.72966900e-02, -5.38355522e-02,
       -3.91659141e-02, -8.06197599e-02,  6.97432235e-02, -4.5

In [35]:
text_model = GMM(n_components=6)
text_model.fit(embeddings)
text_clusters = text_model.predict(embeddings)

text_clusters

  norm_factor = (2 * np.pi) ** (-n_features / 2) * determinant ** -0.5
  responsibilities /= responsibilities.sum(axis=1, keepdims=True)
  r = _umath_linalg.det(a, signature=signature)


{0: array([[-0.07961965, -0.09150109,  0.00063998, ...,  0.0865569 ,
         -0.01579864, -0.03520425],
        [-0.01215575, -0.04346662,  0.06489774, ...,  0.11518159,
          0.0030451 , -0.03289063],
        [ 0.00749722, -0.02590082, -0.04639964, ...,  0.06773334,
          0.06688504,  0.0022567 ],
        ...,
        [-0.03467013,  0.01918791, -0.06138425, ...,  0.044944  ,
          0.03716943,  0.07534683],
        [-0.07423893, -0.04729349, -0.11563499, ...,  0.04615815,
         -0.02637787,  0.02019044],
        [-0.09001797, -0.00802459, -0.02158847, ..., -0.02148671,
          0.00860375, -0.00173726]], shape=(10, 384), dtype=float32),
 1: array([], shape=(0, 384), dtype=float32),
 2: array([], shape=(0, 384), dtype=float32),
 3: array([], shape=(0, 384), dtype=float32),
 4: array([], shape=(0, 384), dtype=float32),
 5: array([], shape=(0, 384), dtype=float32)}