# Infinispan VectorStore: Similarity search demo 2

This demo shows how to run a similarity search on a set of random sentences. The content
is stored in a separate cache and referenced from the vector cache via its key id.

In [None]:
# Start Infinispan in a container.
# The current directory is mounted at /user-config so the server can read
# infinispan.yaml from it, and port 11222 is published on the host.

!docker run -d --name infinispanvs-demo -v $(pwd):/user-config  -p 11222:11222 infinispan/server:15.0.0.Dev09 -c /user-config/infinispan.yaml 

In [None]:
# Ensure that everything we need is installed.
# You may want to skip this cell if the packages are already available
# in the kernel's environment.
%pip install sentence-transformers
%pip install langchain
%pip install langchain_core
%pip install langchain_community

In [None]:
# Import the HuggingFace embedding wrapper and instantiate the language model

from langchain_core.embeddings import Embeddings
# NOTE(review): newer LangChain releases appear to expose HuggingFaceEmbeddings
# from langchain_community.embeddings instead - confirm against the installed version.
from langchain.embeddings import HuggingFaceEmbeddings

# Identifier of the sentence-transformers model passed to HuggingFaceEmbeddings.
model_name = "sentence-transformers/all-MiniLM-L12-v2"

# Embedding object given to the vector store when it is created.
hf = HuggingFaceEmbeddings(model_name=model_name)

In [None]:
# Create an empty langchain_core VectorStore backed by Infinispan.
# InfinispanVS is instantiated with custom options:
# `textfield` : which cache field must be reported as Document.page_content
# `vectorfield` : which cache field contains the embedded vector
# `cache_name` : name of the cache for vectors
# `entity_name` : name of the protobuf message containing vectors
# `texts` is an empty list: nothing is embedded here, the store is filled
# later through add_texts.

from infinispan_vector import InfinispanVS
ispnvs = InfinispanVS.from_texts(
    texts=[],
    embedding=hf,
    textfield="_key",
    vectorfield="floatVector",
    cache_name="sentence_demo_cache",
    entity_name="sentence_demo_vec",
)

In [None]:
# Create the protobuf schema for the vector entity.
# Note the annotations inside the proto schema comments:
# `@Indexed` : a search index will be built for this entity
# `@Vector` : the next field is the vector field

import json
schema_vector = '''
/**
 * @Indexed
 */
message sentence_demo_vec {
/**
 * @Vector(dimension=384)
 */
repeated float floatVector = 1;
optional int32 _key = 2;
}
'''
# Delete first (result not checked), presumably to drop a schema left over
# from an earlier run, then register the new definition.
ispnvs.schema_delete()
output = ispnvs.schema_create(schema_vector)
print(output.text)
assert output.status_code == 200
# The response body is JSON; its "error" field must be null on success.
assert json.loads(output.text)["error"] is None

In [None]:
# Create the protobuf schema for the content. Since this is not the cache
# containing the VectorStore db, we need to use the inner Infinispan helper
# object (`ispnvs.ispn`) to do the job.
schema = '''
message sentence {
optional string title = 1;
optional string description = 2;
}
'''
# Delete first (result not checked), presumably to drop a schema left over
# from an earlier run, then upload the new definition.
ispnvs.ispn.schema_delete("sentence.proto")
output = ispnvs.ispn.schema_post("sentence.proto",schema)
print(output.text)
assert output.status_code == 200
# The response body is JSON; its "error" field must be null on success.
assert json.loads(output.text)["error"] is None

In [None]:
# Creating an Infinispan cache to store vectors

# Create the cache, then clear it and trigger a reindex.
# NOTE(review): the clear/reindex presumably make this cell safe to re-run
# against an already populated cache - confirm against InfinispanVS.
ispnvs.cache_create()
ispnvs.cache_clear()
ispnvs.cache_index_reindex()

In [None]:
# Creating an Infinispan cache to store the content.
# Again the inner Infinispan helper (`ispnvs.ispn`) is used, because this
# cache is not the one holding the vectors.
# The cache configuration is sent as JSON: a distributed cache whose
# entries are encoded as application/x-protostream.
cache_def = '''
{
  "distributed-cache": {
    "owners": "2",
    "mode": "SYNC",
    "statistics": true,
    "encoding": {
      "media-type": "application/x-protostream"
    }
  }
}
'''
# Create the `sentence` cache, then clear it.
ispnvs.ispn.cache_post("sentence",cache_def)
ispnvs.ispn.cache_clear("sentence")

In [None]:
# Adding some data from rnd_sentences.txt.gz
# The vector and _key (content id) are stored in the vector db,
# the actual content is stored in the `sentence` cache.
import csv, time, gzip

# Change this to change the number of sentences you want to load
MAX_SENTENCES = 5000

texts = []   # raw sentences, embedded later by add_texts
metas = []   # one metadata dict per sentence, holding the content key
with gzip.open('rnd_sentences.txt.gz', 'rt', newline='') as f:
    for i, line in enumerate(f):
        # Stop once exactly MAX_SENTENCES lines have been loaded
        if i >= MAX_SENTENCES:
            break
        # Store the content under key str(i) in the `sentence` cache.
        # `line` is kept as read from the file (line terminator included).
        doc = {"_type": "sentence", "title": str(i), "description": line}
        ispnvs.ispn.put(str(i), json.dumps(doc), cache_name="sentence")
        texts.append(line)
        # Metadata linking the vector entry back to its content key
        metas.append({"_key": str(i)})

In [None]:
# Add the texts and fill the vector db: `texts` holds the sentences and
# `metas` the matching metadata dicts (each with the `_key` of the content).
# The value returned by add_texts is kept in `keys`.
keys = ispnvs.add_texts(texts, metas)

In [None]:
# Some demo queries.
# similarity_search returns a list of Document; each page_content holds the
# key under which the actual content lives in the `sentence` cache.
query_res = ispnvs.similarity_search("I want to have fun this night",2)
for hit in query_res:
    # Resolve the key against the content cache and show the stored entry
    stored = ispnvs.ispn.get(hit.page_content, "sentence")
    print(stored.text)

In [None]:
# Five nearest sentences for this query; print the stored content of each hit
query_res = ispnvs.similarity_search("Leafs are falling from the trees in the park",5)
for hit in query_res:
    stored = ispnvs.ispn.get(hit.page_content, "sentence")
    print(stored.text)

In [None]:
# Two nearest sentences for this query; print the stored content of each hit
query_res = ispnvs.similarity_search("I'm getting hungry",2)
for hit in query_res:
    stored = ispnvs.ispn.get(hit.page_content, "sentence")
    print(stored.text)

In [None]:
# Two nearest sentences for this query; print the stored content of each hit
query_res = ispnvs.similarity_search("I feel like a bird in a cage",2)
for hit in query_res:
    stored = ispnvs.ispn.get(hit.page_content, "sentence")
    print(stored.text)

In [None]:
# Two nearest sentences for this query; print the stored content of each hit
query_res = ispnvs.similarity_search("People are strange, when you are stranger",2)
for hit in query_res:
    stored = ispnvs.ispn.get(hit.page_content, "sentence")
    print(stored.text)

In [None]:
# Two nearest sentences for this query; print the stored content of each hit
query_res = ispnvs.similarity_search("As we know, time is relative",2)
for hit in query_res:
    stored = ispnvs.ispn.get(hit.page_content, "sentence")
    print(stored.text)

In [None]:
# Clean up: delete the vector cache and the vector schema
ispnvs.cache_delete()
ispnvs.schema_delete()

In [None]:
# Clean up: delete the `sentence` content cache and its sentence.proto schema
ispnvs.ispn.cache_delete("sentence")
ispnvs.ispn.schema_delete("sentence.proto")

In [None]:
# Force-remove the infinispanvs-demo container started at the top of the notebook
!docker rm --force infinispanvs-demo