In [1]:
import numpy as np

    Scalar Quantization
    In scalar quantization, each dimension of the dataset is treated independently. The maximum and minimum values are calculated for each dimension across the dataset. The range between the maximum and minimum values in each dimension is then divided into equal-sized bins. Each value in the dataset is mapped to one of these bins, effectively quantizing the data.

In [2]:
dataset = np.random.normal(size=(2000, 256))
dataset

array([[ 0.01417943, -1.07419574,  0.73582553, ..., -2.23617757,
        -0.94531851, -0.26708104],
       [-0.36355731,  1.52229872,  0.51995347, ...,  1.45036581,
        -2.45654967, -0.48267998],
       [ 0.89915134,  0.41277587,  0.73744063, ...,  0.73220523,
         0.71921351,  2.02464959],
       ...,
       [-0.86690423,  0.96841778,  0.90647801, ..., -1.57835547,
         0.87775433,  1.09781529],
       [ 0.0194662 , -0.2007731 , -1.72313796, ...,  1.2409941 ,
        -0.93675584,  0.23991717],
       [ 0.59816484,  0.18695781, -0.95219869, ...,  1.38140028,
         0.1313539 , -0.90429461]])

In [15]:
# Calculate and store minimum and maximum across each dimension
ranges = np.vstack((np.min(dataset, axis=0), np.max(dataset, axis=0)))
ranges.shape

(2, 256)

In [19]:
# calculate each dimension's start value and step size. 
# The start value is the minimum value, and the step size is determined by the number of discrete bins in the integer type being used. 
# This example uses 8-bit unsigned integers (uint8), providing 256 bins.
starts = ranges[0,:]
steps = (ranges[1,:] - ranges[0,:]) / 255
steps.shape

(256,)

In [20]:
# The quantized dataset is then calculated as follows:
scalar_quantized_dataset = np.uint8((dataset - starts) / steps)

In [21]:
# The overall scalar quantization process can be encapsulated in a function:
def scalar_quantisation(dataset):
    # Calculate and store minimum and maximum across each dimension
    ranges = np.vstack((np.min(dataset, axis=0), np.max(dataset, axis=0)))
    starts = ranges[0,:]
    steps = (ranges[1,:] - starts) / 255
    return np.uint8((dataset - starts) / steps)

    Product Quantization
    In scalar quantization, the data distribution in each dimension should ideally be considered to avoid loss of information. Product quantization can preserve more information by dividing each vector into sub-vectors and quantizing each sub-vector independently.

In [22]:
array = [ [ 8.2, 10.3, 290.1, 278.1, 310.3, 299.9, 308.7, 289.7, 300.1],
				[ 0.1, 7.3, 8.9, 9.7, 6.9, 9.55, 8.1, 8.5, 8.99] ]

    product quantization involves the following steps:

    1. Divide each vector in the dataset into m disjoint sub-vectors.
    2. For each sub-vector, cluster the data into k centroids (using k-means, for example).
    3. Replace each sub-vector with the index of the nearest centroid in the corresponding codebook.

In [24]:
from sklearn.cluster import KMeans
import numpy as np

# Given array
array = np.array([
    [8.2, 10.3, 290.1, 278.1, 310.3, 299.9, 308.7, 289.7, 300.1],
    [0.1, 7.3, 8.9, 9.7, 6.9, 9.55, 8.1, 8.5, 8.99]
])

# Number of subvectors and centroids
m, k = 3, 2

# Divide each vector into m disjoint sub-vectors
subvectors = array.reshape(-1, m)

# Perform k-means on each sub-vector independently
kmeans = KMeans(n_clusters=k, random_state=0).fit(subvectors)

# Replace each sub-vector with the index of the nearest centroid
labels = kmeans.labels_

# Reshape labels to match the shape of the original array
quantized_array = labels.reshape(array.shape[0], -1)

# Output the quantized array
quantized_array

  super()._check_params_vs_input(X, default_n_init=10)


array([[0, 1, 1],
       [0, 0, 0]], dtype=int32)