### Setup Redis Connection

In [1]:
import os
import redis

REDIS_URL = os.getenv("REDIS_URL", "redis://127.0.0.1:6379")
print(f"Connecting to Redis at: {REDIS_URL}")

redis_client = redis.from_url(REDIS_URL)
redis_client.ping()

Connecting to Redis at: redis://127.0.0.1:6379


True

In [2]:
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {
    'normalize_embeddings': True,
    'batch_size':32,
}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)







In [3]:
sentences = [
    "my name is shah",
    "im from malaysia",
    "live in kuala lumpur"
]

from langchain.docstore.document import Document

# Convert sentences into LangChain Document objects
documents = [Document(page_content=sentence.encode('utf-8').decode('utf-8'), metadata={"source": "user_data"}) for sentence in sentences]

texts = [doc.page_content for doc in documents]
embeddings = hf.embed_documents(texts)
embeddings


[[0.07546498626470566,
  0.06085890904068947,
  -0.018996577709913254,
  -0.007547146175056696,
  0.022318650037050247,
  -0.031852301210165024,
  0.0069755855947732925,
  0.026578102260828018,
  0.058707624673843384,
  0.051354069262742996,
  0.005810820963233709,
  -0.05131806433200836,
  0.049076929688453674,
  0.03902445361018181,
  0.010620727203786373,
  -0.030504297465085983,
  0.0022999655921012163,
  -0.06213166564702988,
  0.07086240500211716,
  0.02020576037466526,
  0.011595475487411022,
  0.00028777786064893007,
  -0.03451204299926758,
  0.0644669234752655,
  -0.03584497049450874,
  -0.01536095142364502,
  0.014995163306593895,
  -0.014620281755924225,
  0.0024484561290591955,
  0.013581466861069202,
  -0.015241488814353943,
  -0.0025462359189987183,
  0.016626928001642227,
  0.030628273263573647,
  1.3767240716333617e-06,
  -0.013088833540678024,
  0.006480891723185778,
  -0.031628940254449844,
  -0.01682710461318493,
  -0.04269452393054962,
  0.044366318732500076,
  -0.0

In [4]:
#!pip install langchain_redis

need to use redis-stack images which include redisearch module. the module have to be added manual in redis.conf. go to Modules sections and this :

```
loadmodule /opt/redis-stack/lib/redisearch.so
```


In [5]:
print(hf)

model_name='sentence-transformers/all-mpnet-base-v2' cache_folder=None model_kwargs={'device': 'cuda'} encode_kwargs={'normalize_embeddings': True, 'batch_size': 32} multi_process=False show_progress=False


In [6]:
from langchain_redis import RedisConfig,RedisVectorStore

redisConfig = RedisConfig(
    index_name="new_vector_test",
    redis_url=REDIS_URL,
    metada_schema=[
        {"name": "text", "type": "tag"},
    ],
    primary_key="id" 
)

vector_store = RedisVectorStore(embeddings=hf, redis_config=redisConfig)
ids = vector_store.add_documents(documents)
ids

['01JNMBP9HF7ZJFVZG815RCJQ4M:01JNMBP9T9F6VPGQYK4VJ6HYA2',
 '01JNMBP9HF7ZJFVZG815RCJQ4M:01JNMBP9T96REQDQDBVNE134P9',
 '01JNMBP9HF7ZJFVZG815RCJQ4M:01JNMBP9T9Z4T60B4SAP6WKA4Q']

In [7]:
# Perform search
query = "country"
results = vector_store.similarity_search(query, k=5)

print(results)

[Document(metadata={}, page_content='im from malaysia'), Document(metadata={}, page_content='live in kuala lumpur'), Document(metadata={}, page_content='my name is shah')]


In [8]:
query = "Test query"
vector = hf.embed_query(query)
print(vector)  

[-0.014222144149243832, 0.0463714674115181, -0.04419915750622749, 0.022166680544614792, -0.04639187082648277, -0.014312309212982655, 0.013567843474447727, 0.043053507804870605, 0.06234164908528328, 0.007129913195967674, 0.01720835454761982, -0.03279409185051918, -0.03885768726468086, 0.02123173698782921, 0.04823543503880501, -0.011943984776735306, 0.02675282210111618, 0.039742499589920044, -0.041766777634620667, -0.0023787354584783316, -0.04121943935751915, -0.0033660761546343565, -0.03171033039689064, -0.015486759133636951, 0.02518879435956478, 0.04774787649512291, -0.014630717225372791, 0.02364218980073929, 0.012439468875527382, -0.026849990710616112, 0.02473977766931057, 0.02887810207903385, -0.010242811404168606, 0.005642981734126806, 1.5438791933775065e-06, -0.01759674772620201, 0.002161182928830385, -0.01377035304903984, 0.000865927548147738, -0.02706003561615944, -0.006876533385366201, 0.0977427214384079, 0.009062422439455986, 0.014023508876562119, -0.013519881293177605, 0.00367

In [9]:
### -----------------------------------------

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
 
# Load list of pdfs from a folder
data_path = "resources/"
docs = [os.path.join(data_path, file) for file in os.listdir(data_path)]
 
print("Listing available documents ...", docs)

Listing available documents ... ['resources/2022-chevy-colorado-ebrochure.pdf', 'resources/aapl-10k-2023.pdf', 'resources/amzn-10k-2023.pdf', 'resources/generation_basic_rag_test.csv', 'resources/jnj-10k-2023.pdf', 'resources/msft-10k-2023.pdf', 'resources/nke-10k-2023.pdf', 'resources/nvd-10k-2023.pdf', 'resources/propositions.json', 'resources/retrieval_basic_rag_test.csv', 'resources/testset.csv', 'resources/testset_15.csv']


In [11]:
# pick out the Nike doc for this exercise
doc = [doc for doc in docs if "nke" in doc][0]
 
# set up the file loader/extractor and text splitter to create chunks
text_splitter = RecursiveCharacterTextSplitter(
   chunk_size=2500, chunk_overlap=0
)
loader = PyPDFLoader(doc, headers = None)
# extract, load, and make chunks
chunks = loader.load_and_split(text_splitter)
 
print("Done preprocessing. Created", len(chunks), "chunks of the original pdf", doc)


Done preprocessing. Created 211 chunks of the original pdf resources/nke-10k-2023.pdf


In [12]:
embeddings = hf.embed_documents([chunk.page_content for chunk in chunks])
 
# Check to make sure we've created enough embeddings, 1 per document chunk
len(embeddings) == len(chunks)

True

In [19]:
from redis import Redis
from redisvl.index import SearchIndex
 
index_name = "redisvl3"
 
schema = {
 "index": {
   "name": index_name,
   "prefix": "chunk",
    "storage_type": "hash"
 },
 "fields": [
   {
       "name": "chunk_id",
       "type": "tag",
       "attrs": {
           "sortable": True
       }
   },
   {
       "name": "content",
       "type": "text"
   },
   {
       "name": "text_embedding",
       "type": "vector",
       "attrs": {
           "dims": 768,
           "distance_metric": "cosine",
           "algorithm": "hnsw",
           "datatype": "float32"
       }
   }
 ]
}



In [20]:
# connect to redis
client = Redis.from_url(REDIS_URL)
# create an index from schema and the client
index = SearchIndex.from_dict(schema)
index.set_client(client)
index.create(overwrite=True, drop=True)

In [21]:
# load expects an iterable of dictionaries
from redisvl.redis.utils import array_to_buffer
 
data = [
   {
       'chunk_id': i,
       'content': chunk.page_content,
       # For HASH -- must convert embeddings to bytes
       'text_embedding': array_to_buffer(embeddings[i], dtype='float32')
   } for i, chunk in enumerate(chunks)
]
 
# RedisVL handles batching automatically
keys = index.load(data, id_field="chunk_id")

### Query the database

In [22]:
from redisvl.query import VectorQuery
query = "Nike profit margins and company performance"
query_embedding = hf.embed_query(query)
vector_query = VectorQuery(
   vector=query_embedding,
   vector_field_name="text_embedding",
   num_results=3,
   return_fields=["chunk_id", "content"],
   return_score=True
)
 
# show the raw redis query
str(vector_query)

'*=>[KNN 3 @text_embedding $vector AS vector_distance] RETURN 3 chunk_id content vector_distance SORTBY vector_distance ASC DIALECT 2 LIMIT 0 3'

In [23]:
import pandas as pd

# execute the query with RedisVL
result=index.query(vector_query)
 
# view the results
pd.DataFrame(result)

Unnamed: 0,id,vector_distance,chunk_id,content
0,chunk:129,0.193879365921,129,"Table of Contents\nNIKE, INC.\nCONSOLIDATED ST..."
1,chunk:76,0.207569718361,76,financial measures calculated and presented in...
2,chunk:80,0.22341555357,80,Table of Contents\nCONSOLIDATED OPERATING RESU...


#### For custom Schema