**This notebook is divided into two parts:**
1. Model fitting dry run
2. Testing model on random data
3. Loading Real Data
4. Making the model on the real data
5. Testing the model on the real data
6. Compresssing the model to be used in the 

In [1]:
import numpy as np

class GMM:
    def __init__(self, n_components, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, X):
        self._initialize_parameters(X)

        for iteration in range(self.max_iter):
            prev_log_likelihood = self._compute_log_likelihood(X)
            
            # E-step: compute responsibilities
            self.responsibilities = self._e_step(X)
            
            # M-step: update parameters
            self._m_step(X)

            # Check for convergence
            log_likelihood = self._compute_log_likelihood(X)
            if np.abs(log_likelihood - prev_log_likelihood) < self.tol:
                break

    def predict(self, X):
        probabilities = np.zeros((X.shape[0], self.n_components))
        for k in range(self.n_components):
            probabilities[:, k] = self.weights[k] * self._gaussian(X, self.means[k], self.covariances[k])
        cluster_labels = np.argmax(probabilities, axis=1)

        # Create dictionary of clusters
        clusters = {label: X[cluster_labels == label] for label in range(self.n_components)}
        return clusters

    def _initialize_parameters(self, X):
        n_samples, n_features = X.shape
        self.means = X[np.random.choice(n_samples, self.n_components, replace=False)]
        self.covariances = np.array([np.eye(n_features) for _ in range(self.n_components)])
        self.weights = np.full(self.n_components, 1 / self.n_components)

    def _e_step(self, X):
        responsibilities = np.zeros((X.shape[0], self.n_components))
        for k in range(self.n_components):
            responsibilities[:, k] = self.weights[k] * self._gaussian(X, self.means[k], self.covariances[k])
        responsibilities /= responsibilities.sum(axis=1, keepdims=True)
        return responsibilities

    def _m_step(self, X):
        n_samples = X.shape[0]
        for k in range(self.n_components):
            responsibility = self.responsibilities[:, k]
            total_responsibility = responsibility.sum()
            self.means[k] = (responsibility[:, np.newaxis] * X).sum(axis=0) / total_responsibility
            self.covariances[k] = (
                (responsibility[:, np.newaxis, np.newaxis] * (X - self.means[k])[:, :, np.newaxis] * (X - self.means[k])[:, np.newaxis, :]).sum(axis=0)
                / total_responsibility
            )
            self.weights[k] = total_responsibility / n_samples

    def _compute_log_likelihood(self, X):
        likelihood = np.zeros(X.shape[0])
        for k in range(self.n_components):
            likelihood += self.weights[k] * self._gaussian(X, self.means[k], self.covariances[k])
        return np.sum(np.log(likelihood))

    @staticmethod
    def _gaussian(X, mean, covariance):
        n_features = X.shape[1]
        determinant = np.linalg.det(covariance)
        inverse = np.linalg.inv(covariance)
        norm_factor = (2 * np.pi) ** (-n_features / 2) * determinant ** -0.5
        diff = X - mean
        return norm_factor * np.exp(-0.5 * np.sum(diff @ inverse * diff, axis=1))



In [2]:
X = np.array([
    [1.0, 2.0],
    [1.5, 1.8],
    [5.0, 8.0],
    [6.0, 8.5],
    [9.0, 11.0],
    [8.5, 10.5],
    [2.0, 2.5],  # Adding points with more variance in both dimensions
    [7.5, 7.0],  # to avoid covariance matrix becoming singular
    [3.5, 5.5],
    [4.5, 3.0]
])

model = GMM(n_components=2)
model.fit(X)
clusters = model.predict(X)

print(clusters)

{0: array([[ 5. ,  8. ],
       [ 6. ,  8.5],
       [ 9. , 11. ],
       [ 8.5, 10.5],
       [ 7.5,  7. ],
       [ 3.5,  5.5]]), 1: array([[1. , 2. ],
       [1.5, 1.8],
       [2. , 2.5],
       [4.5, 3. ]])}


Now Begins the text preprocessing

In [8]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # Choose a suitable model
text_paragraphs = [
    "Why do we need softmax?",
    "Explain the Gaussian Mixture Model algorithm.",
    "What are supervised learning techniques?",
    "How does rule-based AI work?",
    "what are netwrok sockets?",
    "what is a web server?",
    "How do we know if an index for a database is useful?",
    "What is the difference between a primary key and a foreign key?",
    "What is a surrogate key?",
    "Who discovered vector spaces?",
]
embeddings = model.encode(text_paragraphs)  # Shape: (n_samples, embedding_dim)




In [10]:
embeddings.shape

(10, 384)

In [13]:
import numpy as np

class Autoencoder:
    def __init__(self, input_dim=384, hidden_dim=100, learning_rate=0.01, epochs=1000):
        self.input_dim = input_dim      # 384 features
        self.hidden_dim = hidden_dim    # 100 features
        self.learning_rate = learning_rate
        self.epochs = epochs
        
        # Initialize weights and biases for encoder and decoder
        # Use He initialization for better training
        self.W1 = np.random.randn(input_dim, hidden_dim) * np.sqrt(2./input_dim)   # (384, 100)
        self.b1 = np.zeros((1, hidden_dim))                                         # (1, 100)
        self.W2 = np.random.randn(hidden_dim, input_dim) * np.sqrt(2./hidden_dim)  # (100, 384)
        self.b2 = np.zeros((1, input_dim))                                         # (1, 384)

    def sigmoid(self, x):
        # Clip values to avoid overflow
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
    
    def sigmoid_derivative(self, x):
        return x * (1 - x)
    
    def train(self, X):
        if X.shape[1] != self.input_dim:
            raise ValueError(f"Input dimension mismatch. Expected {self.input_dim}, got {X.shape[1]}")
            
        losses = []
        m = X.shape[0]  # number of training examples
        
        for epoch in range(self.epochs):
            # Forward pass
            # Encoder: (m, 384) @ (384, 100) -> (m, 100)
            encoded = self.sigmoid(np.dot(X, self.W1) + self.b1)
            
            # Decoder: (m, 100) @ (100, 384) -> (m, 384)
            decoded = self.sigmoid(np.dot(encoded, self.W2) + self.b2)
            
            # Compute reconstruction loss (mean squared error)
            loss = np.mean((X - decoded) ** 2)
            losses.append(loss)
            
            # Backpropagation
            # Output layer error
            error = X - decoded
            d_decoded = error * self.sigmoid_derivative(decoded)
            
            # Decoder gradients
            # (100, m) @ (m, 384) -> (100, 384)
            dW2 = np.dot(encoded.T, d_decoded) / m
            db2 = np.sum(d_decoded, axis=0, keepdims=True) / m
            
            # Hidden layer error
            # (m, 384) @ (384, 100) -> (m, 100)
            d_encoded = np.dot(d_decoded, self.W2.T) * self.sigmoid_derivative(encoded)
            
            # Encoder gradients
            # (384, m) @ (m, 100) -> (384, 100)
            dW1 = np.dot(X.T, d_encoded) / m
            db1 = np.sum(d_encoded, axis=0, keepdims=True) / m
            
            # Update weights and biases with gradient descent
            self.W1 += self.learning_rate * dW1
            self.b1 += self.learning_rate * db1
            self.W2 += self.learning_rate * dW2
            self.b2 += self.learning_rate * db2
            
            # Print progress
            if (epoch + 1) % 100 == 0:
                print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {loss:.6f}")
                
        return losses

    def encode(self, X):
        """Convert input data from 384 dimensions to 100 dimensions"""
        if X.shape[1] != self.input_dim:
            raise ValueError(f"Input dimension mismatch. Expected {self.input_dim}, got {X.shape[1]}")
        return self.sigmoid(np.dot(X, self.W1) + self.b1)
    
    def decode(self, encoded):
        """Convert encoded 100-dimensional data back to 384 dimensions"""
        if encoded.shape[1] != self.hidden_dim:
            raise ValueError(f"Encoded dimension mismatch. Expected {self.hidden_dim}, got {encoded.shape[1]}")
        return self.sigmoid(np.dot(encoded, self.W2) + self.b2)

In [14]:
# Create example data (10 samples, 384 features)
X = np.random.randn(10, 384)

# Initialize and train autoencoder
autoencoder = Autoencoder(input_dim=384, hidden_dim=100, learning_rate=0.01, epochs=1000)
losses = autoencoder.train(X)

# Encode data to reduced dimension
encoded_data = autoencoder.encode(X)  # Shape: (10, 100)

# Decode back to original dimension
decoded_data = autoencoder.decode(encoded_data)  # Shape: (10, 384)

print("Original shape:", X.shape)
print("Encoded shape:", encoded_data.shape)
print("Decoded shape:", decoded_data.shape)

Epoch 100/1000, Loss: 0.892509
Epoch 200/1000, Loss: 0.822409
Epoch 300/1000, Loss: 0.771816
Epoch 400/1000, Loss: 0.736100
Epoch 500/1000, Loss: 0.710294
Epoch 600/1000, Loss: 0.690995
Epoch 700/1000, Loss: 0.676085
Epoch 800/1000, Loss: 0.664524
Epoch 900/1000, Loss: 0.655470
Epoch 1000/1000, Loss: 0.648216
Original shape: (10, 384)
Encoded shape: (10, 100)
Decoded shape: (10, 384)


In [25]:
def normalize_dataset(dataset):
    """
    Normalizes the dataset by scaling each feature to a range of 0 to 1 
    using min-max normalization.
    
    Parameters:
    dataset (numpy.ndarray): A 2D array of shape (n_samples, n_features) where n_features = 100.

    Returns:
    numpy.ndarray: A normalized dataset with feature values scaled to the range [0, 1].
    """
    if dataset.shape[1] != 100:
        raise ValueError("The dataset must have exactly 100 features.")
    
    # Calculate the min and max for each feature (axis=0)
    feature_mins = np.min(dataset, axis=0)
    feature_maxs = np.max(dataset, axis=0)
    
    # Avoid division by zero for features with constant values
    feature_ranges = feature_maxs - feature_mins
    feature_ranges[feature_ranges == 0] = 1  # Prevent division by zero
    
    # Normalize the dataset
    normalized_data = (dataset - feature_mins) / feature_ranges
    return normalized_data

In [31]:
stan_data = normalize_dataset(encoded_data)

In [32]:
import numpy as np

class NumPyArrayHashTable:
    def __init__(self, size):
        self.size = size
        self.table = [None] * size

    def _hash(self, key):
        # Implement a suitable hash function for NumPy arrays
        # This example uses a simple hash based on the array's data
        return hash(key.tobytes()) % self.size

    def insert(self, key, value):
        index = self._hash(key)
        if self.table[index] is None:
            self.table[index] = [(key, value)]
        else:
            self.table[index].append((key, value))

    def get(self, key):
        index = self._hash(key)
        if self.table[index]:
            for k, v in self.table[index]:
                if np.array_equal(k, key):
                    return v
        return None
    
    def __repr__(self):
        return f"NumPyArrayHashTable(size={self.size}, table={self.table})"

# Example usage
ht = NumPyArrayHashTable(10)
key1 = np.array([1, 2, 3])
key2 = np.array([4, 5, 6])
ht.insert(key1, "value1")
ht.insert(key2, "value2")

print(ht.get(key1))  # Output: "value1"
print(ht.get(key2))  # Output: "value2"

value1
value2


In [33]:
# Create a dictionary mapping embeddings to their original sentences
# Initialize hash table with size equal to number of embeddings
hash_table = NumPyArrayHashTable(len(stan_data))

# Insert each embedding-text pair into the hash table
for embedding, text in zip(stan_data, text_paragraphs):
    hash_table.insert(embedding, text)

embedding_to_sentence = hash_table

In [34]:
text_model = GMM(n_components=6)
text_model.fit(stan_data)
text_clusters = text_model.predict(stan_data)

  norm_factor = (2 * np.pi) ** (-n_features / 2) * determinant ** -0.5
  return norm_factor * np.exp(-0.5 * np.sum(diff @ inverse * diff, axis=1))
  return norm_factor * np.exp(-0.5 * np.sum(diff @ inverse * diff, axis=1))
  responsibilities /= responsibilities.sum(axis=1, keepdims=True)


In [35]:
text_clusters

{0: array([[1.39823518e-02, 1.32726269e-02, 9.54514062e-01, 2.36157203e-04,
         0.00000000e+00, 9.92457501e-01, 9.96435118e-01, 9.91835862e-01,
         1.28901974e-04, 1.00000000e+00, 5.18177690e-04, 2.78358437e-04,
         5.13162036e-03, 9.96507949e-01, 1.00000000e+00, 0.00000000e+00,
         9.98824165e-01, 1.18611350e-02, 9.92050379e-01, 9.99919676e-01,
         9.96200551e-01, 0.00000000e+00, 9.90265037e-01, 9.90532489e-01,
         9.99279014e-01, 9.80329242e-01, 1.00000000e+00, 1.09615794e-02,
         1.00000000e+00, 9.96478266e-01, 9.91759718e-01, 2.28734797e-03,
         9.94092925e-01, 2.42789146e-03, 9.90798784e-01, 0.00000000e+00,
         2.05655091e-03, 0.00000000e+00, 9.95638964e-01, 2.16577958e-03,
         3.60858053e-03, 2.55812553e-03, 9.78764406e-01, 1.00000000e+00,
         9.88085640e-01, 9.99286629e-01, 9.97537032e-01, 1.00000000e+00,
         9.96242973e-01, 2.44637670e-03, 8.10998650e-04, 1.14474362e-02,
         5.93951850e-04, 9.96089282e-01, 0.00000