In [1]:
# Install the third-party packages this notebook imports (hilbertcurve, numpy, scipy, sentence-transformers).
# NOTE(review): no versions are pinned here, so a fresh run may resolve different releases than the
# ones shown in the recorded output below — consider pinning for reproducibility.
!pip install hilbertcurve numpy scipy sentence-transformers

Collecting hilbertcurve
  Downloading hilbertcurve-2.0.5-py3-none-any.whl.metadata (11 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-

In [2]:
from hilbertcurve.hilbertcurve import HilbertCurve
import numpy as np
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer once at notebook level;
# create_vector_from_sentence (defined just below) reads this global on every call.
model = SentenceTransformer('all-MiniLM-L6-v2')

def create_vector_from_sentence(sentence):
    """Return the embedding of ``sentence`` produced by the notebook-level ``model``.

    Args:
        sentence: Input forwarded unchanged to ``model.encode``.

    Returns:
        The value returned by ``model.encode(sentence)``.
    """
    return model.encode(sentence)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
def floats_to_hilbert(floats, precision=16, dimensions=2):
    """Quantize a sequence of floats and map it to Hilbert-curve distances.

    The values are min-max scaled to integers in ``[0, 2**precision - 1]``,
    grouped into consecutive points of ``dimensions`` coordinates (the last
    point is zero-padded if needed), and each point is converted to its
    distance along a ``HilbertCurve(precision, dimensions)``.

    Note that the minimum and range used for scaling are not returned, so the
    result alone is not sufficient to recover the original values.

    Args:
        floats: Iterable of numeric values (e.g. a 1-D embedding vector).
        precision: Number of bits per coordinate; also the curve's iteration
            count passed to ``HilbertCurve``.
        dimensions: Number of coordinates per point.

    Returns:
        A list with one Hilbert distance per point, as returned by
        ``HilbertCurve.distances_from_points``. An empty input yields ``[]``.
    """
    values = list(floats)
    if not values:
        # min()/max() would raise on an empty sequence; there is nothing to encode.
        return []

    min_val, max_val = min(values), max(values)
    range_val = max_val - min_val
    max_level = 2**precision - 1

    if range_val == 0:
        # Every value is identical: scaling would divide by zero, so map all
        # of them to the lowest quantization level instead of crashing.
        quantized = [0] * len(values)
    else:
        quantized = [int(((f - min_val) / range_val) * max_level) for f in values]

    # Pad with zeros so the quantized values split evenly into points.
    remainder = len(quantized) % dimensions
    if remainder:
        quantized.extend([0] * (dimensions - remainder))

    hilbert_curve = HilbertCurve(precision, dimensions)
    points = [quantized[i:i + dimensions] for i in range(0, len(quantized), dimensions)]
    return hilbert_curve.distances_from_points(points)

def cosine_similarity(a, b, tolerance=1e-5):
    """Compute the cosine similarity of two vectors, clamped to ``[0.0, 1.0]``.

    Both inputs are converted to ``float64`` arrays before the computation.
    Negative similarities are reported as ``0.0`` (this function deliberately
    does not distinguish "opposite" from "unrelated"), and results that exceed
    1.0 because of floating-point rounding are reported as ``1.0``.

    Args:
        a: First vector (any array-like of numbers).
        b: Second vector, same length as ``a``.
        tolerance: If either vector's Euclidean norm is below this value the
            vectors are treated as zero vectors and ``0.0`` is returned.

    Returns:
        A Python ``float`` in ``[0.0, 1.0]``.
    """
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)

    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a < tolerance or norm_b < tolerance:
        # Avoid dividing by a (near-)zero norm.
        return 0.0

    cos_sim = np.dot(a, b) / (norm_a * norm_b)
    # Always return a plain float: the previous ``max(0, cos_sim)`` yielded the
    # int 0 for negative similarities and a NumPy scalar otherwise.
    return float(min(1.0, max(0.0, cos_sim)))

In [5]:
# Demo inputs: each sentence is embedded and then encoded as Hilbert positions.
sentences = ["Hello, World!", "Goodbye, World!", "AI is awesome!", "I love coding."]

# Parallel lists, indexed like `sentences`.
all_compressed = []  # Hilbert positions returned by floats_to_hilbert
all_originals = []   # raw embeddings returned by create_vector_from_sentence

for sentence in sentences:
    floats = create_vector_from_sentence(sentence)
    all_originals.append(floats)
    compressed = floats_to_hilbert(floats)  # default precision=16, dimensions=2
    all_compressed.append(compressed)
    # NOTE(review): these print the full embedding and full position list for every sentence.
    print(f"Original embedding for '{sentence}': {floats}")
    print(f"Compressed Hilbert positions for '{sentence}': {compressed}")

# Similarity is computed on the ORIGINAL embeddings of the first two sentences,
# not on the Hilbert-compressed representations.
similarity = cosine_similarity(all_originals[0], all_originals[1])
print(f"\nCosine similarity between '{sentences[0]}' and '{sentences[1]}': {similarity}")

Original embedding for 'Hello, World!': [-3.81771959e-02  3.29110362e-02 -5.45946229e-03  1.43699553e-02
 -4.02910151e-02 -1.16532475e-01  3.16876546e-02  1.91177137e-03
 -4.26223427e-02  2.91681383e-02  4.24266756e-02  3.20417918e-02
  2.98447125e-02  1.09803323e-02 -5.39396890e-02 -5.02772518e-02
 -2.35079043e-02  1.07793650e-02 -1.37707964e-01  4.11502458e-03
  2.93330830e-02  6.68411553e-02 -1.53894722e-02  4.84376252e-02
 -8.81497115e-02 -1.27268210e-02  4.14090455e-02  4.08315435e-02
 -5.01559563e-02 -5.81250452e-02  4.88015264e-02  6.88900948e-02
  5.87469079e-02  8.73098429e-03 -1.59182549e-02  8.51420015e-02
 -7.81474113e-02 -7.75167570e-02  2.07238048e-02  1.61942393e-02
  3.25105675e-02 -5.34889065e-02 -6.22288063e-02 -2.43146010e-02
  7.41272978e-03  2.39777584e-02  6.36094296e-03  5.11450991e-02
  7.27667063e-02  3.46496850e-02 -5.47711141e-02 -5.93285114e-02
 -7.16693187e-03  2.01377235e-02  3.58463973e-02  5.59089752e-03
  1.07735032e-02 -5.27637787e-02  1.01473574e-02 -