<a href="https://colab.research.google.com/github/shivamsri07/vectors_and_llms/blob/main/product_quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import numpy as np
from sklearn.cluster import KMeans

# Example 4D vectors
vectors = np.random.rand(10, 4) # 10 vectors, 4 dimensions each

# Step 1: Create codebook of centroids
# We need to split the 4D vector into subvectors for Product Quantization.
# Let's split it into 2 subvectors of 2 dimensions each.
vectors_sub1 = vectors[:, :2]
vectors_sub2 = vectors[:, 2:]

k = 2 # Number of centroids for each subvector space

# Perform K-Means on each subvector space
kmeans_sub1 = KMeans(n_clusters=k, random_state=0, n_init=10)
kmeans_sub1.fit(vectors_sub1)
codebook_sub1 = kmeans_sub1.cluster_centers_

kmeans_sub2 = KMeans(n_clusters=k, random_state=0, n_init=10)
kmeans_sub2.fit(vectors_sub2)
codebook_sub2 = kmeans_sub2.cluster_centers_

# The codebook is a combination of codebook_sub1 and codebook_sub2

# Step 2: Take a new 4D vector and decode it (approximate it)
new_vector = np.random.rand(4)

# Split the new vector into subvectors
new_vector_sub1 = new_vector[:2].reshape(1, -1)
new_vector_sub2 = new_vector[2:].reshape(1, -1)

# Find the closest centroid in each codebook for the corresponding subvector
closest_centroid_idx_sub1 = kmeans_sub1.predict(new_vector_sub1)[0]
closest_centroid_sub1 = codebook_sub1[closest_centroid_idx_sub1]

closest_centroid_idx_sub2 = kmeans_sub2.predict(new_vector_sub2)[0]
closest_centroid_sub2 = codebook_sub2[closest_centroid_idx_sub2]

# Decode the vector by concatenating the closest centroids
decoded_vector = np.concatenate((closest_centroid_sub1, closest_centroid_sub2))

print("Original Vector:", new_vector)
print("Decoded Vector (Approximation):", decoded_vector)
print("Product Code of input vector:", (closest_centroid_idx_sub1, closest_centroid_idx_sub2))
print("Codebook Subvector 1:", codebook_sub1)
print("Codebook Subvector 2:", codebook_sub2)


Original Vector: [0.2855435  0.00769788 0.14704744 0.24921835]
Decoded Vector (Approximation): [0.69388635 0.25340995 0.24877843 0.43063217]
Product Code of input vector: (np.int32(1), np.int32(0))
Codebook Subvector 1: [[0.42202695 0.76837174]
 [0.69388635 0.25340995]]
Codebook Subvector 2: [[0.24877843 0.43063217]
 [0.5251123  0.87305405]]


In [19]:
# Generate product codes for all original vectors
product_codes = []
for vector in vectors:
  vector_sub1 = vector[:2].reshape(1, -1)
  vector_sub2 = vector[2:].reshape(1, -1)

  closest_centroid_idx_sub1 = kmeans_sub1.predict(vector_sub1)[0]
  closest_centroid_idx_sub2 = kmeans_sub2.predict(vector_sub2)[0]

  product_codes.append((closest_centroid_idx_sub1, closest_centroid_idx_sub2))

print("\nProduct Codes of Original Vectors:")
for i, code in enumerate(product_codes):
  print(f"Vector {i}: {code}")





Product Codes of Original Vectors:
Vector 0: (np.int32(1), np.int32(0))
Vector 1: (np.int32(1), np.int32(0))
Vector 2: (np.int32(1), np.int32(0))
Vector 3: (np.int32(1), np.int32(0))
Vector 4: (np.int32(0), np.int32(1))
Vector 5: (np.int32(1), np.int32(1))
Vector 6: (np.int32(0), np.int32(0))
Vector 7: (np.int32(1), np.int32(1))
Vector 8: (np.int32(0), np.int32(1))
Vector 9: (np.int32(0), np.int32(1))
