Update redis vector store (#12386)
tylerhutcherson committed Apr 11, 2024
1 parent 60a161a commit 433804f
Showing 9 changed files with 1,063 additions and 888 deletions.
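
The central change across the notebooks below is that `RedisVectorStore` is now configured with an explicit `redisvl` `IndexSchema` rather than the old `index_name`/`index_prefix` arguments, and the ingestion cache is backed by the Redis KV store integration. A minimal sketch of the new construction pattern, pieced together from the notebook cells in this diff (it assumes a Redis Stack instance reachable at `redis://localhost:6379` and 384-dimensional `BAAI/bge-small-en-v1.5` embeddings):

```python
# Sketch assembled from the notebook cells in this commit; assumes Redis Stack
# is running locally and llama-index-vector-stores-redis is installed.
from redisvl.schema import IndexSchema
from llama_index.vector_stores.redis import RedisVectorStore

# An explicit schema replaces the old index_name/index_prefix arguments.
custom_schema = IndexSchema.from_dict(
    {
        "index": {"name": "redis_vector_store", "prefix": "doc"},
        "fields": [
            # fields LlamaIndex expects on every node
            {"type": "tag", "name": "id"},
            {"type": "tag", "name": "doc_id"},
            {"type": "text", "name": "text"},
            # vector field sized for bge-small-en-v1.5 (384 dims)
            {
                "type": "vector",
                "name": "vector",
                "attrs": {
                    "dims": 384,
                    "algorithm": "hnsw",
                    "distance_metric": "cosine",
                },
            },
        ],
    }
)

vector_store = RedisVectorStore(
    schema=custom_schema,
    redis_url="redis://localhost:6379",
)
```

Only the vector field's dims, algorithm, and distance metric are customized here; the tag and text fields mirror what LlamaIndex writes for each node.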
2 changes: 1 addition & 1 deletion docs/docs/community/integrations/vector_stores.md
@@ -32,7 +32,7 @@ as the storage backend for `VectorStoreIndex`.
- Pinecone (`PineconeVectorStore`). [Installation/Quickstart](https://docs.pinecone.io/docs/quickstart).
- Qdrant (`QdrantVectorStore`) [Installation](https://qdrant.tech/documentation/install/) [Python Client](https://qdrant.tech/documentation/install/#python-client)
- LanceDB (`LanceDBVectorStore`) [Installation/Quickstart](https://lancedb.github.io/lancedb/basic/)
- Redis (`RedisVectorStore`). [Installation](https://redis.io/docs/getting-started/installation/).
- Redis (`RedisVectorStore`). [Installation](https://redis.io/docs/latest/operate/oss_and_stack/install/install-stack/).
- Supabase (`SupabaseVectorStore`). [Quickstart](https://supabase.github.io/vecs/api/).
- TiDB (`TiDBVectorStore`). [Quickstart](../../examples/vector_stores/TiDBVector.ipynb). [Installation](https://tidb.cloud/ai). [Python Client](https://github.com/pingcap/tidb-vector-python).
- TimeScale (`TimescaleVectorStore`). [Installation](https://github.com/timescale/python-vector).
174 changes: 142 additions & 32 deletions docs/docs/examples/ingestion/ingestion_gdrive.ipynb
@@ -100,10 +100,107 @@
" IngestionPipeline,\n",
" IngestionCache,\n",
")\n",
"from llama_index.core.ingestion.cache import RedisCache\n",
"from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache\n",
"from llama_index.storage.docstore.redis import RedisDocumentStore\n",
"from llama_index.core.node_parser import SentenceSplitter\n",
"from llama_index.vector_stores.redis import RedisVectorStore"
"from llama_index.vector_stores.redis import RedisVectorStore\n",
"\n",
"from redisvl.schema import IndexSchema"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "baf744be",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ac74203675564f14b73882a6ae270d18",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model.safetensors: 0%| | 0.00/133M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c93811def32744ce870253a77767777e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/366 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8c237673c9ec4e22a4eba34c934cc322",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f66602de35274bb299d100783e73a01b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0%| | 0.00/711k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7cc44d9f4fd84913b403a05124e71d9a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"special_tokens_map.json: 0%| | 0.00/125 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f3a4992e06c44f2aac3f1a4d21e49065",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")"
]
},
{
@@ -113,15 +210,32 @@
"metadata": {},
"outputs": [],
"source": [
"vector_store = RedisVectorStore(\n",
" index_name=\"redis_vector_store\",\n",
" index_prefix=\"vectore_store\",\n",
" redis_url=\"redis://localhost:6379\",\n",
"custom_schema = IndexSchema.from_dict(\n",
" {\n",
" \"index\": {\"name\": \"gdrive\", \"prefix\": \"doc\"},\n",
" # customize fields that are indexed\n",
" \"fields\": [\n",
" # required fields for llamaindex\n",
" {\"type\": \"tag\", \"name\": \"id\"},\n",
" {\"type\": \"tag\", \"name\": \"doc_id\"},\n",
" {\"type\": \"text\", \"name\": \"text\"},\n",
" # custom vector field for bge-small-en-v1.5 embeddings\n",
" {\n",
" \"type\": \"vector\",\n",
" \"name\": \"vector\",\n",
" \"attrs\": {\n",
" \"dims\": 384,\n",
" \"algorithm\": \"hnsw\",\n",
" \"distance_metric\": \"cosine\",\n",
" },\n",
" },\n",
" ],\n",
" }\n",
")\n",
"\n",
"cache = IngestionCache(\n",
" cache=RedisCache.from_host_and_port(\"localhost\", 6379),\n",
" collection=\"redis_cache\",\n",
"vector_store = RedisVectorStore(\n",
" schema=custom_schema,\n",
" redis_url=\"redis://localhost:6379\",\n",
")"
]
},
@@ -133,19 +247,31 @@
"outputs": [],
"source": [
"# Optional: clear vector store if exists\n",
"if vector_store._index_exists():\n",
"if vector_store.index_exists():\n",
" vector_store.delete_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6d98845",
"metadata": {},
"outputs": [],
"source": [
"# Set up the ingestion cache layer\n",
"cache = IngestionCache(\n",
" cache=RedisCache.from_host_and_port(\"localhost\", 6379),\n",
" collection=\"redis_cache\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3be817bd-81a1-436f-8f92-3eb48531c915",
"metadata": {},
"outputs": [],
"source": [
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
"\n",
"pipeline = IngestionPipeline(\n",
" transformations=[\n",
" SentenceSplitter(),\n",
@@ -239,15 +365,7 @@
"execution_count": null,
"id": "c77f74b2-9bbe-46d6-b35f-23ea757b315b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ingested 6 Nodes\n"
]
}
],
"outputs": [],
"source": [
"nodes = pipeline.run(documents=docs)\n",
"print(f\"Ingested {len(nodes)} Nodes\")"
@@ -326,15 +444,7 @@
"execution_count": null,
"id": "d490fbb8-82ec-4284-a19d-1a8ca69da2a4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ingested 1 Nodes\n"
]
}
],
"outputs": [],
"source": [
"docs = load_data(folder_id=\"1RFhr3-KmOZCR5rtp4dlOMNl3LKe1kOA5\")\n",
"nodes = pipeline.run(documents=docs)\n",
@@ -398,9 +508,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "llama_index_v2",
"display_name": "llama-index-vector-stores-redis-MBNLFpFJ-py3.9",
"language": "python",
"name": "llama_index_v2"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
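
Alongside the schema change, ingestion_gdrive.ipynb switches from the private `_index_exists()` to the public index-management methods and moves the ingestion cache onto the Redis KV store backend. A rough consolidation of those cells (assuming the `vector_store` built as in the sketch above and Redis on localhost:6379):

```python
# Sketch of the housekeeping cells added to ingestion_gdrive.ipynb in this
# commit; `vector_store` is assumed to be the RedisVectorStore built earlier.
from llama_index.core.ingestion import IngestionCache
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache

# Optional: clear the vector store if the index already exists
# (previously done via the private _index_exists()).
if vector_store.index_exists():
    vector_store.delete_index()

# The ingestion cache is now backed by the Redis KV store integration rather
# than llama_index.core.ingestion.cache.RedisCache.
cache = IngestionCache(
    cache=RedisCache.from_host_and_port("localhost", 6379),
    collection="redis_cache",
)
```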
78 changes: 41 additions & 37 deletions docs/docs/examples/ingestion/redis_ingestion_pipeline.ipynb
@@ -29,27 +29,6 @@
"%pip install llama-index-embeddings-huggingface"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: redis in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (5.0.1)\n",
"Requirement already satisfied: async-timeout>=4.0.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from redis) (4.0.3)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"!pip install redis"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -75,7 +54,8 @@
"source": [
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
]
},
{
@@ -102,16 +82,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.8.9) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
" warnings.warn(\n"
]
}
],
"outputs": [],
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"\n",
@@ -144,13 +115,46 @@
" IngestionPipeline,\n",
" IngestionCache,\n",
")\n",
"from llama_index.core.ingestion.cache import RedisCache\n",
"from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache\n",
"from llama_index.storage.docstore.redis import RedisDocumentStore\n",
"from llama_index.core.node_parser import SentenceSplitter\n",
"from llama_index.vector_stores.redis import RedisVectorStore\n",
"\n",
"from redisvl.schema import IndexSchema\n",
"\n",
"\n",
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
"\n",
"custom_schema = IndexSchema.from_dict(\n",
" {\n",
" \"index\": {\"name\": \"redis_vector_store\", \"prefix\": \"doc\"},\n",
" # customize fields that are indexed\n",
" \"fields\": [\n",
" # required fields for llamaindex\n",
" {\"type\": \"tag\", \"name\": \"id\"},\n",
" {\"type\": \"tag\", \"name\": \"doc_id\"},\n",
" {\"type\": \"text\", \"name\": \"text\"},\n",
" # custom vector field for bge-small-en-v1.5 embeddings\n",
" {\n",
" \"type\": \"vector\",\n",
" \"name\": \"vector\",\n",
" \"attrs\": {\n",
" \"dims\": 384,\n",
" \"algorithm\": \"hnsw\",\n",
" \"distance_metric\": \"cosine\",\n",
" },\n",
" },\n",
" ],\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline = IngestionPipeline(\n",
" transformations=[\n",
" SentenceSplitter(),\n",
@@ -160,8 +164,7 @@
" \"localhost\", 6379, namespace=\"document_store\"\n",
" ),\n",
" vector_store=RedisVectorStore(\n",
" index_name=\"redis_vector_store\",\n",
" index_prefix=\"vectore_store\",\n",
" schema=custom_schema,\n",
" redis_url=\"redis://localhost:6379\",\n",
" ),\n",
" cache=IngestionCache(\n",
@@ -221,7 +224,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"I see two documents: \"test2.txt\" and \"test1.txt\".\n"
"I see two documents.\n"
]
}
],
@@ -261,6 +264,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"13:32:07 redisvl.index.index INFO Index already exists, not overwriting.\n",
"Ingested 2 Nodes\n"
]
}
@@ -284,7 +288,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"I see three documents: test3.txt, test1.txt, and test2.txt.\n",
"You see three documents: test3.txt, test1.txt, and test2.txt.\n",
"This is a test file: three!\n",
"This is a NEW test file: one!\n",
"This is a test file: two!\n"
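
redis_ingestion_pipeline.ipynb wires the same pieces into an `IngestionPipeline`. Because the transformation list is partly collapsed in this diff view, treat the following as a sketch rather than the exact cell: the embedding step is assumed, and `custom_schema`, `embed_model`, and `docs` are the objects defined in the cells shown above.

```python
# Sketch of the updated pipeline wiring in redis_ingestion_pipeline.ipynb;
# assumes custom_schema, embed_model, and docs from earlier cells and a Redis
# instance at localhost:6379.
from llama_index.core.ingestion import IngestionCache, IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.vector_stores.redis import RedisVectorStore

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(),
        embed_model,  # assumed: the full transformation list is collapsed in the diff
    ],
    docstore=RedisDocumentStore.from_host_and_port(
        "localhost", 6379, namespace="document_store"
    ),
    vector_store=RedisVectorStore(
        schema=custom_schema,
        redis_url="redis://localhost:6379",
    ),
    cache=IngestionCache(
        cache=RedisCache.from_host_and_port("localhost", 6379),
        collection="redis_cache",
    ),
)

nodes = pipeline.run(documents=docs)
print(f"Ingested {len(nodes)} Nodes")
```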