In [12]:
import redis
import requests
import json

Configure the Redis connection settings from environment variables

In [13]:
import os
# Connection settings are read from the environment so no credentials are hardcoded.
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
# os.getenv returns a str whenever the variable is set, so normalise the port to int;
# `or 6379` also covers a variable that is set but empty.
REDIS_PORT = int(os.getenv("REDIS_PORT") or 6379)
# None when REDIS_PASS is not set.
REDIS_PASS = os.getenv("REDIS_PASS")

In [14]:
# Build the Redis client used by every later cell from the REDIS_* settings.
client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASS)

<!-- We will now create a demo dataset whose records have the following structure: -->
{
  "model": "Jigger",
  "brand": "Velorim",
  "price": 270,
  "type": "Kids bikes",
  "specs": {
    "material": "aluminium",
    "weight": "10"
  },
  "description": "Small and powerful, the Jigger is the best ride for the smallest of tikes! ..."
}

In [15]:
# Location of the demo dataset; the parsed payload is indexed and iterated as a list of records.
URL = ("https://raw.githubusercontent.com/bsbodden/redis_vss_getting_started"
       "/main/data/bikes.json"
       )
response = requests.get(URL, timeout=10)
# Fail here on an HTTP error status instead of trying to parse an error page as JSON.
response.raise_for_status()
bikes = response.json()

In [16]:
# Serialize the first record with 2-space indentation to inspect its structure.
json.dumps(bikes[0], indent=2)

'{\n  "model": "Jigger",\n  "brand": "Velorim",\n  "price": 270,\n  "type": "Kids bikes",\n  "specs": {\n    "material": "aluminium",\n    "weight": "10"\n  },\n  "description": "Small and powerful, the Jigger is the best ride for the smallest of tikes! This is the tiniest kids\\u2019 pedal bike on the market available without a coaster brake, the Jigger is the vehicle of choice for the rare tenacious little rider raring to go. We say rare because this smokin\\u2019 little bike is not ideal for a nervous first-time rider, but it\\u2019s a true giddy up for a true speedster. The Jigger is a 12 inch lightweight kids bicycle and it will meet your little one\\u2019s need for speed. It\\u2019s a single speed bike that makes learning to pump pedals simple and intuitive. It even has  a handle in the bottom of the saddle so you can easily help your child during training!  The Jigger is among the most lightweight children\\u2019s bikes on the planet. It is designed so that 2-3 year-olds fit com

Storing the data in Redis

In [17]:
# Queue a json().set for every bike on one pipeline object, then call execute() once.
pipeline = client.pipeline()
for i, bike in enumerate(bikes, 1):
    # Key = "bikes:" + 1-based position, zero-padded to three digits.
    redis_key = f"bikes:{i:03}"
    print(f"key={redis_key} value={bike}")
    # "$" is the path argument: the whole record is written at the document root.
    pipeline.json().set(redis_key, "$", bike)
res = pipeline.execute()

key=bikes:001 value={'model': 'Jigger', 'brand': 'Velorim', 'price': 270, 'type': 'Kids bikes', 'specs': {'material': 'aluminium', 'weight': '10'}, 'description': 'Small and powerful, the Jigger is the best ride for the smallest of tikes! This is the tiniest kids’ pedal bike on the market available without a coaster brake, the Jigger is the vehicle of choice for the rare tenacious little rider raring to go. We say rare because this smokin’ little bike is not ideal for a nervous first-time rider, but it’s a true giddy up for a true speedster. The Jigger is a 12 inch lightweight kids bicycle and it will meet your little one’s need for speed. It’s a single speed bike that makes learning to pump pedals simple and intuitive. It even has  a handle in the bottom of the saddle so you can easily help your child during training!  The Jigger is among the most lightweight children’s bikes on the planet. It is designed so that 2-3 year-olds fit comfortably in a molded ride position that allows for 

In [18]:
# Spot-check the upload: read only the "$.model" path of the document stored under "bikes:010".
res = client.json().get("bikes:010", "$.model")
print(res)

['Summit']


text embedding model

In [19]:
# Collect every key matching "bikes:*" and put them in ascending order.
keys = list(client.keys("bikes:*"))
keys.sort()

In [20]:
# Fetch "$.description" for all keys in one call; the result is treated as a list of per-key lists.
json_descriptions = client.json().mget(keys, "$.description")
# Flatten the per-key lists into a single flat list of description values.
descriptions = []
for matches in json_descriptions:
    descriptions.extend(matches)

In [21]:
from sentence_transformers import SentenceTransformer

# Instantiate the SentenceTransformer identified by 'msmarco-distilbert-base-v4';
# later cells call embedder.encode() on it for both descriptions and queries.
embedder = SentenceTransformer('msmarco-distilbert-base-v4')

  from tqdm.autonotebook import tqdm, trange


In [22]:

import numpy as np


# Encode all descriptions and cast the result to float32 ...
description_vectors = embedder.encode(descriptions).astype(np.float32)
# ... then convert it to plain nested Python lists.
embeddings = description_vectors.tolist()
# Length of one embedding; reused as the DIM of the vector field in the index schema.
VECTOR_DIMENSION = len(embeddings[0])
print(VECTOR_DIMENSION)

768


In [23]:
# zip() silently stops at the shorter input, which would leave some documents without
# an embedding; fail loudly if the two lists do not line up.
if len(keys) != len(embeddings):
    raise ValueError(
        f"expected one embedding per key, got {len(keys)} keys and {len(embeddings)} embeddings"
    )

# Queue one json().set per document that writes the vector at "$.description_embeddings",
# then run the whole batch; the list returned by execute() is the cell's output.
pipeline = client.pipeline()
for key, embedding in zip(keys, embeddings):
    pipeline.json().set(key, "$.description_embeddings", embedding)
pipeline.execute()

[True, True, True, True, True, True, True, True, True, True, True]

In [24]:
# Read the full document stored under "bikes:010" and pretty-print it with sorted keys
# to confirm that the "description_embeddings" field was added.
res = client.json().get("bikes:010")
print(json.dumps(res, sort_keys=True, indent=4))

{
    "brand": "nHill",
    "description": "This budget mountain bike from nHill performs well both on bike paths and on the trail. The fork with 100mm of travel absorbs rough terrain. Fat Kenda Booster tires give you grip in corners and on wet trails. The Shimano Tourney drivetrain offered enough gears for finding a comfortable pace to ride uphill, and the Tektro hydraulic disc brakes break smoothly. Whether you want an affordable bike that you can take to work, but also take trail riding on the weekends or you\u2019re just after a stable, comfortable ride for the bike path, the Summit gives a good value for money.",
    "description_embeddings": [
        -0.5381144285202026,
        -0.4946592152118683,
        -0.025176504626870155,
        0.654035210609436,
        -0.06241398677229881,
        -0.6898809671401978,
        -0.5430217385292053,
        -0.5903494358062744,
        0.506132185459137,
        0.2008497267961502,
        0.8015638589859009,
        1.0688027143478394

Create an index with a vector field
The Python code in the next cell corresponds to the following redis-cli command:

```sh
FT.CREATE idx:bikes_vss ON JSON
  PREFIX 1 bikes: SCORE 1.0
  SCHEMA
    $.model TEXT WEIGHT 1.0 NOSTEM
    $.brand TEXT WEIGHT 1.0 NOSTEM
    $.price NUMERIC
    $.type TAG SEPARATOR ","
    $.description AS description TEXT WEIGHT 1.0
    $.description_embeddings AS vector VECTOR FLAT 6 TYPE FLOAT32 DIM 768 DISTANCE_METRIC COSINE
```

In [25]:
from redis import ResponseError
from redis.commands.search.field import TextField, NumericField, TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType

# Index schema: the JSONPath of each indexed field plus the alias (as_name) used in queries.
schema = (
    TextField("$.model", no_stem=True, as_name="model"),
    TextField("$.brand", no_stem=True, as_name="brand"),
    NumericField("$.price", as_name="price"),
    TagField("$.type", as_name="type"),
    TextField("$.description", as_name="description"),
    VectorField(
        "$.description_embeddings",
        "FLAT",
        {
            "TYPE": "FLOAT32",
            "DIM": VECTOR_DIMENSION,
            "DISTANCE_METRIC": "COSINE",
        },
        as_name="vector",
    ),
)
# The index covers JSON documents whose key starts with "bikes:".
definition = IndexDefinition(prefix=["bikes:"], index_type=IndexType.JSON)
try:
    res = client.ft("idx:bikes_vss").create_index(fields=schema, definition=definition)
except ResponseError as e:
    # An exception instance never compares equal to a str, so inspect its message instead.
    if "Index already exists" not in str(e):
        # Anything other than a pre-existing index is a real failure; do not swallow it.
        raise
    # The index is left over from an earlier run: drop it and build it again so that
    # it matches the schema defined in this cell.
    drop_index_response = client.ft("idx:bikes_vss").dropindex()
    print("index drop response: ", drop_index_response)
    res = client.ft("idx:bikes_vss").create_index(fields=schema, definition=definition)


In [26]:
# Fetch the metadata of the "idx:bikes_vss" index and print it in full.
info = client.ft("idx:bikes_vss").info()
print("info=", info)
# Pull out the "num_docs" entry of the index info.
num_docs = info["num_docs"]
print("num_docs=", num_docs)
# Pull out the "hash_indexing_failures" entry of the index info.
indexing_failures = info["hash_indexing_failures"]
print("indexing_failures=", indexing_failures)

info= {'index_name': 'idx:bikes_vss', 'index_options': [], 'index_definition': [b'key_type', b'JSON', b'prefixes', [b'bikes:'], b'default_score', b'1'], 'attributes': [[b'identifier', b'$.model', b'attribute', b'model', b'type', b'TEXT', b'WEIGHT', b'1', b'NOSTEM'], [b'identifier', b'$.brand', b'attribute', b'brand', b'type', b'TEXT', b'WEIGHT', b'1', b'NOSTEM'], [b'identifier', b'$.price', b'attribute', b'price', b'type', b'NUMERIC'], [b'identifier', b'$.type', b'attribute', b'type', b'type', b'TAG', b'SEPARATOR', b','], [b'identifier', b'$.description', b'attribute', b'description', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'$.description_embeddings', b'attribute', b'vector', b'type', b'VECTOR', b'algorithm', b'FLAT', b'data_type', b'FLOAT32', b'dim', 768, b'distance_metric', b'COSINE']], 'num_docs': '11', 'max_doc_id': '104', 'num_terms': '778', 'num_records': '10403', 'inverted_sz_mb': '0.06006336212158203', 'vector_index_sz_mb': '2.99884033203125', 'total_inverted_index

In [27]:
# Natural-language search phrases; they are embedded with embedder.encode() and each
# resulting vector is sent as the "query_vector" parameter of a KNN search.
queries = [
    "Bike for small kids",
    "Best Mountain bikes for kids",
    "Cheap Mountain bike for kids",
    "Female specific mountain bike",
    "Road bike for beginners",
    "Commuter bike for people over 60",
    "Comfortable commuter bike",
    "Good bike for college students",
    "Mountain bike for beginners",
    "Vintage bike",
    "Comfortable city bike",
]

In [28]:
# Embed the query strings with the same embedder instance used for the descriptions.
encoded_queries = embedder.encode(queries)
# Display how many encoded queries came back.
len(encoded_queries)

11

In [29]:
from redis.commands.search.query import Query

# KNN query: the 3 nearest neighbours on @vector for $query_vector, with the distance
# exposed as "vector_score".
query = Query('(*)=>[KNN 3 @vector $query_vector AS vector_score]')
# Order the hits by that score.
query = query.sort_by('vector_score')
# Only these fields are returned for each hit.
query = query.return_fields('vector_score', 'id', 'brand', 'model', 'description')
query = query.dialect(2)

In [30]:
# Run the KNN query once per encoded query and keep each hit list under the query's position.
responses = {}
for key, encoded_query in enumerate(encoded_queries):
    # The vector is sent as raw float32 bytes in the "query_vector" parameter.
    query_params = {'query_vector': np.array(encoded_query, dtype=np.float32).tobytes()}
    response = client.ft('idx:bikes_vss').search(query, query_params).docs
    responses[key] = response
    print(f"result count on key {key} is {len(response)}")

result count on key 0 is 3
result count on key 1 is 3
result count on key 2 is 3
result count on key 3 is 3
result count on key 4 is 3
result count on key 5 is 3
result count on key 6 is 3
result count on key 7 is 3
result count on key 8 is 3
result count on key 9 is 3
result count on key 10 is 3


KNN Search example

In [31]:
# KNN query: the 3 nearest neighbours on @vector for $query_vector, distance exposed as
# "vector_score", hits ordered by that score, limited to the listed return fields.
query = (
    Query('(*)=>[KNN 3 @vector $query_vector AS vector_score]')
     .sort_by('vector_score')
     .return_fields('vector_score', 'id', 'brand', 'model', 'description')
     .dialect(2)
)

In [32]:
# Use the last encoded query explicitly instead of relying on the `encoded_query`
# variable left behind by an earlier for-loop (hidden kernel state).
example_query_vector = np.array(encoded_queries[-1], dtype=np.float32).tobytes()
client.ft('idx:bikes_vss').search(
    query,
    {
      'query_vector': example_query_vector
    }
).docs

[Document {'id': 'bikes:007', 'payload': None, 'vector_score': '0.549334764481', 'brand': 'ScramBikes', 'model': 'WattBike', 'description': 'The WattBike is the best e-bike for people who still feel young at heart. It has a  Bafang 500 watt geared hub motor that can reach 20 miles per hour on both steep inclines and city streets. The lithium-ion battery, which gets nearly 40 miles per charge, has a lightweight form factor, making it easier for seniors to use. It comes fully assembled (no convoluted instructions!) and includes a sturdy helmet at no cost. The Plush saddle softens over time with use. The included Seatpost, however, is easily adjustable and adds to this bike’s fantastic rating for seniors, as do the hydraulic disc brakes from Tektro.  '},
 Document {'id': 'bikes:006', 'payload': None, 'vector_score': '0.558552265167', 'brand': 'Breakout', 'model': 'XBN 2.1 Alloy', 'description': 'The XBN 2.1 Alloy is our entry-level road bike – but that’s not to say that it’s a basic machi

Creating a query table
Let's import pandas

In [33]:
import pandas as pd

In [34]:
def create_query_table(query, queries, encoded_queries, extra_params=None,
                       index_name="idx:bikes_vss"):
    """
    Run ``query`` once per encoded query and render all hits as a Markdown table.

    Args:
        query: search query object passed unchanged to ``search``. Each returned
            document must expose ``vector_score``, ``id``, ``brand``, ``model``
            and ``description``.
        queries: query texts; ``queries[i]`` labels the hits of ``encoded_queries[i]``.
        encoded_queries: iterable of query embeddings, one per entry of ``queries``.
        extra_params: optional mapping of additional query parameters; on a key
            clash its entries take precedence over ``query_vector``.
        index_name: name of the search index to query.

    Returns:
        str: Markdown table with the columns query, score, id, brand, model and
        description. ``score`` is ``1 - vector_score`` rounded to two decimals.
        Rows are sorted by query text, then by descending score; the query text
        is shown only on the first row of each group, and descriptions longer
        than 500 characters are cut to 497 characters plus "...". When there are
        no hits at all, a header-only table is returned.
    """
    columns = ["query", "score", "id", "brand", "model", "description"]
    results_list = []
    for i, encoded_query in enumerate(encoded_queries):
        # The vector travels as raw float32 bytes; extra_params may add or override entries.
        query_params = {
            "query_vector": np.array(encoded_query, dtype=np.float32).tobytes(),
            **(extra_params or {}),
        }
        result_docs = client.ft(index_name).search(query, query_params).docs
        for doc in result_docs:
            vector_score = round(1 - float(doc.vector_score), 2)
            results_list.append(
                {
                    "query": queries[i],
                    "score": vector_score,
                    "id": doc.id,
                    "brand": doc.brand,
                    "model": doc.model,
                    "description": doc.description,
                }
            )

    if not results_list:
        # A frame built from an empty list has no columns, so sorting by "query"
        # would raise KeyError; return a header-only table instead.
        return pd.DataFrame(columns=columns).to_markdown(index=False)

    # Optional: convert the table to Markdown using Pandas
    queries_table = pd.DataFrame(results_list, columns=columns).sort_values(
        by=["query", "score"], ascending=[True, False]
    )
    # Blank out the repeated query text so each group shows it on its first row only.
    queries_table["query"] = queries_table.groupby("query")["query"].transform(
        lambda x: [x.iloc[0]] + [""] * (len(x) - 1)
    )
    queries_table["description"] = queries_table["description"].apply(
        lambda x: (x[:497] + "...") if len(x) > 500 else x
    )
    return queries_table.to_markdown(index=False)

In [35]:
# KNN query: the 3 nearest neighbours on @vector for $query_vector, with the distance
# exposed as "vector_score".
query = Query("(*)=>[KNN 3 @vector $query_vector AS vector_score]")
query = query.sort_by("vector_score")
query = query.return_fields("vector_score", "id", "brand", "model", "description")
query = query.dialect(2)

# Render the hits for every demo query as one Markdown table and print it.
table = create_query_table(query, queries, encoded_queries)
print(table)

| query                            |   score | id        | brand      | model                | description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
|:---------------------------------|--------:|:----------|:-----------|:---------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------