Update redis vector store (#12386)
tylerhutcherson committed Apr 11, 2024
1 parent 60a161a commit 433804f
Showing 9 changed files with 1,063 additions and 888 deletions.
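
The central change across the notebooks below is that `RedisVectorStore` is now configured with an explicit `redisvl` `IndexSchema` rather than the old `index_name`/`index_prefix` arguments, and the ingestion cache is backed by the Redis KV store integration. A minimal sketch of the new construction pattern, pieced together from the notebook cells in this diff (it assumes a Redis Stack instance reachable at `redis://localhost:6379` and 384-dimensional `BAAI/bge-small-en-v1.5` embeddings):

```python
# Sketch assembled from the notebook cells in this commit; assumes Redis Stack
# is running locally and llama-index-vector-stores-redis is installed.
from redisvl.schema import IndexSchema
from llama_index.vector_stores.redis import RedisVectorStore

# An explicit schema replaces the old index_name/index_prefix arguments.
custom_schema = IndexSchema.from_dict(
    {
        "index": {"name": "redis_vector_store", "prefix": "doc"},
        "fields": [
            # fields LlamaIndex expects on every node
            {"type": "tag", "name": "id"},
            {"type": "tag", "name": "doc_id"},
            {"type": "text", "name": "text"},
            # vector field sized for bge-small-en-v1.5 (384 dims)
            {
                "type": "vector",
                "name": "vector",
                "attrs": {
                    "dims": 384,
                    "algorithm": "hnsw",
                    "distance_metric": "cosine",
                },
            },
        ],
    }
)

vector_store = RedisVectorStore(
    schema=custom_schema,
    redis_url="redis://localhost:6379",
)
```

Only the vector field's dims, algorithm, and distance metric are customized here; the tag and text fields mirror what LlamaIndex writes for each node.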
2 changes: 1 addition & 1 deletion docs/docs/community/integrations/vector_stores.md
@@ -32,7 +32,7 @@ as the storage backend for `VectorStoreIndex`.
- Pinecone (`PineconeVectorStore`). [Installation/Quickstart](https://docs.pinecone.io/docs/quickstart).
- Qdrant (`QdrantVectorStore`) [Installation](https://qdrant.tech/documentation/install/) [Python Client](https://qdrant.tech/documentation/install/#python-client)
- LanceDB (`LanceDBVectorStore`) [Installation/Quickstart](https://lancedb.github.io/lancedb/basic/)
- Redis (`RedisVectorStore`). [Installation](https://redis.io/docs/getting-started/installation/).
- Redis (`RedisVectorStore`). [Installation](https://redis.io/docs/latest/operate/oss_and_stack/install/install-stack/).
- Supabase (`SupabaseVectorStore`). [Quickstart](https://supabase.github.io/vecs/api/).
- TiDB (`TiDBVectorStore`). [Quickstart](../../examples/vector_stores/TiDBVector.ipynb). [Installation](https://tidb.cloud/ai). [Python Client](https://github.com/pingcap/tidb-vector-python).
- TimeScale (`TimescaleVectorStore`). [Installation](https://github.com/timescale/python-vector).
174 changes: 142 additions & 32 deletions docs/docs/examples/ingestion/ingestion_gdrive.ipynb
@@ -100,10 +100,107 @@
" IngestionPipeline,\n",
" IngestionCache,\n",
")\n",
"from llama_index.core.ingestion.cache import RedisCache\n",
"from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache\n",
"from llama_index.storage.docstore.redis import RedisDocumentStore\n",
"from llama_index.core.node_parser import SentenceSplitter\n",
"from llama_index.vector_stores.redis import RedisVectorStore"
"from llama_index.vector_stores.redis import RedisVectorStore\n",
"\n",
"from redisvl.schema import IndexSchema"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "baf744be",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ac74203675564f14b73882a6ae270d18",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model.safetensors: 0%| | 0.00/133M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c93811def32744ce870253a77767777e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/366 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8c237673c9ec4e22a4eba34c934cc322",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f66602de35274bb299d100783e73a01b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0%| | 0.00/711k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7cc44d9f4fd84913b403a05124e71d9a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"special_tokens_map.json: 0%| | 0.00/125 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f3a4992e06c44f2aac3f1a4d21e49065",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")"
]
},
{
@@ -113,15 +210,32 @@
"metadata": {},
"outputs": [],
"source": [
"vector_store = RedisVectorStore(\n",
" index_name=\"redis_vector_store\",\n",
" index_prefix=\"vectore_store\",\n",
" redis_url=\"redis://localhost:6379\",\n",
"custom_schema = IndexSchema.from_dict(\n",
" {\n",
" \"index\": {\"name\": \"gdrive\", \"prefix\": \"doc\"},\n",
" # customize fields that are indexed\n",
" \"fields\": [\n",
" # required fields for llamaindex\n",
" {\"type\": \"tag\", \"name\": \"id\"},\n",
" {\"type\": \"tag\", \"name\": \"doc_id\"},\n",
" {\"type\": \"text\", \"name\": \"text\"},\n",
" # custom vector field for bge-small-en-v1.5 embeddings\n",
" {\n",
" \"type\": \"vector\",\n",
" \"name\": \"vector\",\n",
" \"attrs\": {\n",
" \"dims\": 384,\n",
" \"algorithm\": \"hnsw\",\n",
" \"distance_metric\": \"cosine\",\n",
" },\n",
" },\n",
" ],\n",
" }\n",
")\n",
"\n",
"cache = IngestionCache(\n",
" cache=RedisCache.from_host_and_port(\"localhost\", 6379),\n",
" collection=\"redis_cache\",\n",
"vector_store = RedisVectorStore(\n",
" schema=custom_schema,\n",
" redis_url=\"redis://localhost:6379\",\n",
")"
]
},
@@ -133,19 +247,31 @@
"outputs": [],
"source": [
"# Optional: clear vector store if exists\n",
"if vector_store._index_exists():\n",
"if vector_store.index_exists():\n",
" vector_store.delete_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6d98845",
"metadata": {},
"outputs": [],
"source": [
"# Set up the ingestion cache layer\n",
"cache = IngestionCache(\n",
" cache=RedisCache.from_host_and_port(\"localhost\", 6379),\n",
" collection=\"redis_cache\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3be817bd-81a1-436f-8f92-3eb48531c915",
"metadata": {},
"outputs": [],
"source": [
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
"\n",
"pipeline = IngestionPipeline(\n",
" transformations=[\n",
" SentenceSplitter(),\n",
@@ -239,15 +365,7 @@
"execution_count": null,
"id": "c77f74b2-9bbe-46d6-b35f-23ea757b315b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ingested 6 Nodes\n"
]
}
],
"outputs": [],
"source": [
"nodes = pipeline.run(documents=docs)\n",
"print(f\"Ingested {len(nodes)} Nodes\")"
@@ -326,15 +444,7 @@
"execution_count": null,
"id": "d490fbb8-82ec-4284-a19d-1a8ca69da2a4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ingested 1 Nodes\n"
]
}
],
"outputs": [],
"source": [
"docs = load_data(folder_id=\"1RFhr3-KmOZCR5rtp4dlOMNl3LKe1kOA5\")\n",
"nodes = pipeline.run(documents=docs)\n",
@@ -398,9 +508,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "llama_index_v2",
"display_name": "llama-index-vector-stores-redis-MBNLFpFJ-py3.9",
"language": "python",
"name": "llama_index_v2"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
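
Alongside the schema change, ingestion_gdrive.ipynb switches from the private `_index_exists()` to the public index-management methods and moves the ingestion cache onto the Redis KV store backend. A rough consolidation of those cells (assuming the `vector_store` built as in the sketch above and Redis on localhost:6379):

```python
# Sketch of the housekeeping cells added to ingestion_gdrive.ipynb in this
# commit; `vector_store` is assumed to be the RedisVectorStore built earlier.
from llama_index.core.ingestion import IngestionCache
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache

# Optional: clear the vector store if the index already exists
# (previously done via the private _index_exists()).
if vector_store.index_exists():
    vector_store.delete_index()

# The ingestion cache is now backed by the Redis KV store integration rather
# than llama_index.core.ingestion.cache.RedisCache.
cache = IngestionCache(
    cache=RedisCache.from_host_and_port("localhost", 6379),
    collection="redis_cache",
)
```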
78 changes: 41 additions & 37 deletions docs/docs/examples/ingestion/redis_ingestion_pipeline.ipynb
@@ -29,27 +29,6 @@
"%pip install llama-index-embeddings-huggingface"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: redis in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (5.0.1)\n",
"Requirement already satisfied: async-timeout>=4.0.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from redis) (4.0.3)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"!pip install redis"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -75,7 +54,8 @@
"source": [
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
]
},
{
@@ -102,16 +82,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.8.9) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
" warnings.warn(\n"
]
}
],
"outputs": [],
"source": [
"from llama_index.core import SimpleDirectoryReader\n",
"\n",
@@ -144,13 +115,46 @@
" IngestionPipeline,\n",
" IngestionCache,\n",
")\n",
"from llama_index.core.ingestion.cache import RedisCache\n",
"from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache\n",
"from llama_index.storage.docstore.redis import RedisDocumentStore\n",
"from llama_index.core.node_parser import SentenceSplitter\n",
"from llama_index.vector_stores.redis import RedisVectorStore\n",
"\n",
"from redisvl.schema import IndexSchema\n",
"\n",
"\n",
"embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
"\n",
"custom_schema = IndexSchema.from_dict(\n",
" {\n",
" \"index\": {\"name\": \"redis_vector_store\", \"prefix\": \"doc\"},\n",
" # customize fields that are indexed\n",
" \"fields\": [\n",
" # required fields for llamaindex\n",
" {\"type\": \"tag\", \"name\": \"id\"},\n",
" {\"type\": \"tag\", \"name\": \"doc_id\"},\n",
" {\"type\": \"text\", \"name\": \"text\"},\n",
" # custom vector field for bge-small-en-v1.5 embeddings\n",
" {\n",
" \"type\": \"vector\",\n",
" \"name\": \"vector\",\n",
" \"attrs\": {\n",
" \"dims\": 384,\n",
" \"algorithm\": \"hnsw\",\n",
" \"distance_metric\": \"cosine\",\n",
" },\n",
" },\n",
" ],\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline = IngestionPipeline(\n",
" transformations=[\n",
" SentenceSplitter(),\n",
@@ -160,8 +164,7 @@
" \"localhost\", 6379, namespace=\"document_store\"\n",
" ),\n",
" vector_store=RedisVectorStore(\n",
" index_name=\"redis_vector_store\",\n",
" index_prefix=\"vectore_store\",\n",
" schema=custom_schema,\n",
" redis_url=\"redis://localhost:6379\",\n",
" ),\n",
" cache=IngestionCache(\n",
@@ -221,7 +224,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"I see two documents: \"test2.txt\" and \"test1.txt\".\n"
"I see two documents.\n"
]
}
],
@@ -261,6 +264,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"13:32:07 redisvl.index.index INFO Index already exists, not overwriting.\n",
"Ingested 2 Nodes\n"
]
}
@@ -284,7 +288,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"I see three documents: test3.txt, test1.txt, and test2.txt.\n",
"You see three documents: test3.txt, test1.txt, and test2.txt.\n",
"This is a test file: three!\n",
"This is a NEW test file: one!\n",
"This is a test file: two!\n"
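
redis_ingestion_pipeline.ipynb wires the same pieces into an `IngestionPipeline`. Because the transformation list is partly collapsed in this diff view, treat the following as a sketch rather than the exact cell: the embedding step is assumed, and `custom_schema`, `embed_model`, and `docs` are the objects defined in the cells shown above.

```python
# Sketch of the updated pipeline wiring in redis_ingestion_pipeline.ipynb;
# assumes custom_schema, embed_model, and docs from earlier cells and a Redis
# instance at localhost:6379.
from llama_index.core.ingestion import IngestionCache, IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.vector_stores.redis import RedisVectorStore

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(),
        embed_model,  # assumed: the full transformation list is collapsed in the diff
    ],
    docstore=RedisDocumentStore.from_host_and_port(
        "localhost", 6379, namespace="document_store"
    ),
    vector_store=RedisVectorStore(
        schema=custom_schema,
        redis_url="redis://localhost:6379",
    ),
    cache=IngestionCache(
        cache=RedisCache.from_host_and_port("localhost", 6379),
        collection="redis_cache",
    ),
)

nodes = pipeline.run(documents=docs)
print(f"Ingested {len(nodes)} Nodes")
```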