# Redis as a Vector Database for Semantic Search

Reference: https://redis.io/docs/get-started/vector-database/


In [1]:
import json
import time

import numpy as np
import pandas as pd
import redis
import requests
from redis.commands.search.field import (
    NumericField,
    TagField,
    TextField,
    VectorField,
)
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## Connect Redis

Instantiate the Redis client. By default, Redis returns binary responses. To decode them, you pass the decode_responses parameter set to True.


In [2]:
# Connect to the local Redis server. decode_responses=True makes the client
# return decoded strings instead of the raw binary replies Redis sends by default.
client = redis.Redis(host="localhost", port=6379, decode_responses=True)
client

Redis<ConnectionPool<Connection<host=localhost,port=6379,db=0>>>

## Demo Data

Download a demo dataset of bikes.

In [15]:
url = "https://raw.githubusercontent.com/bsbodden/redis_vss_getting_started/main/data/bikes.json"
# Bound the wait so an unreachable host cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to parse an error page as JSON.
response = requests.get(url, timeout=30)
response.raise_for_status()
bikes = response.json()
# Show only the first two records to keep the output short.
bikes[:2]

[{'model': 'Jigger',
  'brand': 'Velorim',
  'price': 270,
  'type': 'Kids bikes',
  'specs': {'material': 'aluminium', 'weight': '10'},
  'description': 'Small and powerful, the Jigger is the best ride for the smallest of tikes! This is the tiniest kids’ pedal bike on the market available without a coaster brake, the Jigger is the vehicle of choice for the rare tenacious little rider raring to go. We say rare because this smokin’ little bike is not ideal for a nervous first-time rider, but it’s a true giddy up for a true speedster. The Jigger is a 12 inch lightweight kids bicycle and it will meet your little one’s need for speed. It’s a single speed bike that makes learning to pump pedals simple and intuitive. It even has  a handle in the bottom of the saddle so you can easily help your child during training!  The Jigger is among the most lightweight children’s bikes on the planet. It is designed so that 2-3 year-olds fit comfortably in a molded ride position that allows for efficient

## Store Data

Store each bike as a JSON document in Redis.

In [10]:
# Queue one JSON write per bike under a zero-padded key (bikes:001, bikes:002, ...)
# and send them all together when the pipeline is executed.
pipeline = client.pipeline()
for position, bike_doc in enumerate(bikes, start=1):
    pipeline.json().set(f"bikes:{position:03}", "$", bike_doc)
res = pipeline.execute()
res

[True, True, True, True, True, True, True, True, True, True, True]

## Retrieve Data

Example of retrieving data from Redis.

In [12]:
# Fetch the whole JSON document stored under the key of the tenth bike.
res = client.json().get("bikes:010")
res

{'model': 'Summit',
 'brand': 'nHill',
 'price': 1200,
 'type': 'Mountain Bike',
 'specs': {'material': 'alloy', 'weight': '11.3'},
 'description': 'This budget mountain bike from nHill performs well both on bike paths and on the trail. The fork with 100mm of travel absorbs rough terrain. Fat Kenda Booster tires give you grip in corners and on wet trails. The Shimano Tourney drivetrain offered enough gears for finding a comfortable pace to ride uphill, and the Tektro hydraulic disc brakes break smoothly. Whether you want an affordable bike that you can take to work, but also take trail riding on the weekends or you’re just after a stable, comfortable ride for the bike path, the Summit gives a good value for money.'}

In [14]:
# Passing a JSONPath fetches only the matching part of the document;
# the reply is a list of matches.
res = client.json().get("bikes:010", "$.model")
res

['Summit']

## Embeddings

Use an embedder model to get embeddings for each data entry.

Pre-trained MS MARCO models are widely used in search engines, chatbots, and other AI applications.

> Pre-trained weights are stored in `~/.cache/torch/sentence_transformers`

In [17]:
# Load the pre-trained MS MARCO sentence-embedding model by name.
embedder = SentenceTransformer('msmarco-distilbert-base-v4')
embedder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [20]:
# Walk the keyspace incrementally with SCAN rather than one blocking KEYS call.
# SCAN may report a key more than once, so de-duplicate before sorting; the
# sorted order is what later cells rely on to pair keys with embeddings.
keys = sorted(set(client.scan_iter(match="bikes:*")))
keys

['bikes:001',
 'bikes:002',
 'bikes:003',
 'bikes:004',
 'bikes:005',
 'bikes:006',
 'bikes:007',
 'bikes:008',
 'bikes:009',
 'bikes:010',
 'bikes:011']

In [24]:
# The multi-get reply holds one list of JSONPath matches per key; flatten it
# into a single list of description strings, in the same order as `keys`.
matches_per_key = client.json().mget(keys, "$.description")
descriptions = [text for matches in matches_per_key for text in matches]
descriptions

['Small and powerful, the Jigger is the best ride for the smallest of tikes! This is the tiniest kids’ pedal bike on the market available without a coaster brake, the Jigger is the vehicle of choice for the rare tenacious little rider raring to go. We say rare because this smokin’ little bike is not ideal for a nervous first-time rider, but it’s a true giddy up for a true speedster. The Jigger is a 12 inch lightweight kids bicycle and it will meet your little one’s need for speed. It’s a single speed bike that makes learning to pump pedals simple and intuitive. It even has  a handle in the bottom of the saddle so you can easily help your child during training!  The Jigger is among the most lightweight children’s bikes on the planet. It is designed so that 2-3 year-olds fit comfortably in a molded ride position that allows for efficient riding, balanced handling and agility. The Jigger’s frame design and gears work together so your buddingbiker can stand up out of the seat, stop rapidly

In [27]:
# Encode every description into a vector; cast to float32, the element type
# the vector index is declared with further down (TYPE FLOAT32).
embeddings = embedder.encode(descriptions).astype(np.float32)
embeddings.shape

(11, 768)

In [28]:
# Convert the array to nested Python lists so the vectors can be written into
# the JSON documents. Going through np.asarray keeps this cell re-runnable:
# on a second run `embeddings` is already a list (which has no .tolist()), and
# the float32 round trip reproduces the same values.
embeddings = np.asarray(embeddings, dtype=np.float32).tolist()
# Length of one embedding; used as the DIM of the vector index.
VECTOR_DIMENSION = len(embeddings[0])

## Store Embeddings

Store embeddings as a vector in Redis.

In [29]:
# Attach each embedding to its bike document under $.description_embeddings,
# batching all the writes in one pipeline.
pipeline = client.pipeline()
for bike_key, vector in zip(keys, embeddings):
    pipeline.json().set(bike_key, "$.description_embeddings", vector)
pipeline.execute()

[True, True, True, True, True, True, True, True, True, True, True]

In [30]:
# Re-read the document to confirm it now carries a description_embeddings field.
res = client.json().get("bikes:010")
res

{'model': 'Summit',
 'brand': 'nHill',
 'price': 1200,
 'type': 'Mountain Bike',
 'specs': {'material': 'alloy', 'weight': '11.3'},
 'description': 'This budget mountain bike from nHill performs well both on bike paths and on the trail. The fork with 100mm of travel absorbs rough terrain. Fat Kenda Booster tires give you grip in corners and on wet trails. The Shimano Tourney drivetrain offered enough gears for finding a comfortable pace to ride uphill, and the Tektro hydraulic disc brakes break smoothly. Whether you want an affordable bike that you can take to work, but also take trail riding on the weekends or you’re just after a stable, comfortable ride for the bike path, the Summit gives a good value for money.',
 'description_embeddings': [-0.5381145477294922,
  -0.4946596026420593,
  -0.025176959112286568,
  0.6540350914001465,
  -0.06241418793797493,
  -0.6898813247680664,
  -0.5430221557617188,
  -0.5903492569923401,
  0.5061327219009399,
  0.2008499801158905,
  0.80156409740448

## Indexing

Create an index over the data.

In Redis, the indexing method and the distance metric are defined in this step.

In [31]:
# Vector field over the stored embeddings. "FLAT" is the indexing method (the
# alternative is a hierarchical navigable small world graph, "HNSW"); the
# attributes give the element type, the dimensionality, and the distance metric.
vector_field = VectorField(
    "$.description_embeddings",
    "FLAT",
    {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIMENSION,
        "DISTANCE_METRIC": "COSINE",
    },
    as_name="vector",
)

# Each field maps a JSONPath in the bike document to a queryable alias.
schema = (
    TextField("$.model", no_stem=True, as_name="model"),
    TextField("$.brand", no_stem=True, as_name="brand"),
    NumericField("$.price", as_name="price"),
    TagField("$.type", as_name="type"),
    TextField("$.description", as_name="description"),
    vector_field,
)

# Index every JSON document whose key starts with "bikes:".
definition = IndexDefinition(prefix=["bikes:"], index_type=IndexType.JSON)
bikes_index = client.ft("idx:bikes_vss")
res = bikes_index.create_index(fields=schema, definition=definition)
res

'OK'

In [34]:
# Read the index statistics and report how many documents were indexed and
# how many failed, as a sanity check that all bikes were picked up.
info = client.ft("idx:bikes_vss").info()
num_docs = info["num_docs"]
indexing_failures = info["hash_indexing_failures"]
print(f"{num_docs} documents indexed with {indexing_failures} failures")

11 documents indexed with 0 failures


## Search and Query

Create a Redis query for similarity search.

In [47]:
def create_query_table(
    query,
    queries,
    encoded_queries,
    extra_params=None,
    search_client=None,
    index_name="idx:bikes_vss",
):
    """Run ``query`` once per encoded query and tabulate the hits.

    Parameters
    ----------
    query
        Search query passed to the index's ``search``; it is expected to
        reference a ``$query_vector`` parameter.
    queries
        Human-readable query texts, indexed in parallel with
        ``encoded_queries``.
    encoded_queries
        Iterable of query vectors; each is converted to float32 bytes and
        bound to ``query_vector``.
    extra_params
        Optional dict of additional query parameters, merged over the
        ``query_vector`` entry. Defaults to no extra parameters.
    search_client
        Client providing ``ft(index_name).search(...)``. Defaults to the
        notebook-level ``client``.
    index_name
        Name of the search index to query.

    Returns
    -------
    pandas.DataFrame
        Columns ``query, score, id, brand, model, description``, sorted by
        query text and then by descending score. The query text is shown only
        on the first row of each group, and descriptions longer than 500
        characters are cut to 497 characters plus ``"..."``. An empty frame
        with the same columns is returned when nothing matches.
    """
    columns = ["query", "score", "id", "brand", "model", "description"]
    # Copy so the caller's dict is never shared or mutated between calls.
    shared_params = dict(extra_params) if extra_params else {}
    # Resolve the index handle once instead of on every iteration.
    index = (client if search_client is None else search_client).ft(index_name)

    rows = []
    for i, encoded_query in enumerate(encoded_queries):
        params = {
            "query_vector": np.array(encoded_query, dtype=np.float32).tobytes()
        } | shared_params
        for doc in index.search(query, params).docs:
            rows.append(
                {
                    "query": queries[i],
                    # vector_score is a distance; report 1 - distance so
                    # that larger means more similar.
                    "score": round(1 - float(doc.vector_score), 2),
                    "id": doc.id,
                    "brand": doc.brand,
                    "model": doc.model,
                    "description": doc.description,
                }
            )

    # Explicit columns keep the sort below valid when there are no hits.
    table = pd.DataFrame(rows, columns=columns)
    table = table.sort_values(by=["query", "score"], ascending=[True, False])
    # Rows of one query are contiguous after sorting; blank the repeated labels.
    table["query"] = table["query"].mask(table["query"].duplicated(), "")
    table["description"] = table["description"].apply(
        lambda text: (text[:497] + "...") if len(text) > 500 else text
    )
    return table

In [43]:
# Natural-language search phrases to run against the bikes index.
queries = [
    "Bike for small kids",
    "Best Mountain bikes for kids",
    "Cheap Mountain bike for kids",
    "Female specific mountain bike",
    "Road bike for beginners",
    "Commuter bike for people over 60",
    "Comfortable commuter bike",
    "Good bike for college students",
    "Mountain bike for beginners",
    "Vintage bike",
    "Comfortable city bike",
]

In [44]:
# Embed the queries with the same model that encoded the bike descriptions,
# so query and document vectors are comparable.
encoded_queries = embedder.encode(queries)
encoded_queries.shape

(11, 768)

In [42]:
"""
The KNN algorithm calculates the distance between the query vector (embedding) and each vector (embedding)
in the database, using the chosen distance function.
It then returns the K items with the smallest distances to the query vector.
"""
# To use a vector query with the FT.SEARCH command, DIALECT 2 or greater must
# be specified.
knn_clause = "(*)=>[KNN 3 @vector $query_vector AS vector_score]"
result_fields = ("vector_score", "id", "brand", "model", "description")
query = (
    Query(knn_clause)
    .sort_by("vector_score")
    .return_fields(*result_fields)
    .dialect(2)
)

In [48]:
# Run the KNN query for every phrase and display the combined results table.
create_query_table(query, queries, encoded_queries)

Unnamed: 0,query,score,id,brand,model,description
3,Best Mountain bikes for kids,0.54,bikes:003,Nord,Chook air 5,The Chook Air 5 gives kids aged six years and...
4,,0.51,bikes:010,nHill,Summit,This budget mountain bike from nHill performs ...
5,,0.46,bikes:001,Velorim,Jigger,"Small and powerful, the Jigger is the best rid..."
0,Bike for small kids,0.52,bikes:001,Velorim,Jigger,"Small and powerful, the Jigger is the best rid..."
1,,0.45,bikes:007,ScramBikes,WattBike,The WattBike is the best e-bike for people who...
2,,0.41,bikes:003,Nord,Chook air 5,The Chook Air 5 gives kids aged six years and...
6,Cheap Mountain bike for kids,0.49,bikes:003,Nord,Chook air 5,The Chook Air 5 gives kids aged six years and...
7,,0.45,bikes:010,nHill,Summit,This budget mountain bike from nHill performs ...
8,,0.39,bikes:001,Velorim,Jigger,"Small and powerful, the Jigger is the best rid..."
30,Comfortable city bike,0.45,bikes:007,ScramBikes,WattBike,The WattBike is the best e-bike for people who...
