From 07a47212143095c6e92628e35c26a8962bc8017f Mon Sep 17 00:00:00 2001 From: Sam Partee Date: Wed, 26 Jul 2023 16:24:51 -0700 Subject: [PATCH] Provider Documentation --- README.md | 9 +- docs/user_guide/index.md | 2 +- docs/user_guide/providers_03.ipynb | 265 +++++++++++++++++++++++++++++ docs/user_guide/schema.yaml | 14 ++ redisvl/index.py | 1 - 5 files changed, 286 insertions(+), 5 deletions(-) create mode 100644 docs/user_guide/providers_03.ipynb create mode 100644 docs/user_guide/schema.yaml diff --git a/README.md b/README.md index 0199346e..9a8cdee5 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,14 @@ [![Codecov](https://img.shields.io/codecov/c/github/RedisVentures/RedisVL/dev?label=Codecov&logo=codecov&token=E30WxqBeJJ)](https://codecov.io/gh/RedisVentures/RedisVL) [![License](https://img.shields.io/badge/License-BSD-3--blue.svg)](https://opensource.org/licenses/mit/) - +![Language](https://img.shields.io/github/languages/top/RedisVentures/RedisVL) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +![GitHub last commit](https://img.shields.io/github/last-commit/RedisVentures/RedisVL) +![GitHub deployments](https://img.shields.io/github/deployments/RedisVentures/RedisVL/github-pages?label=doc%20build) RedisVL provides a powerful Python client library for using Redis as a Vector Database. Leverage the speed and reliability of Redis along with vector-based semantic search capabilities to supercharge your application! -**Note:** This project is rapidly evolving, and the API may change frequently. Always refer to the most recent [documentation](https://redisvl.com/docs). +**Note:** This project is rapidly evolving, and the API may change frequently. Always refer to the most recent [documentation](https://www.redisvl.com). ## 🚀 What is RedisVL? Vector databases have become increasingly popular in recent years due to their ability to store and retrieve vectors efficiently. 
However, most vector databases are complex to use and require a lot of time and effort to set up. RedisVL aims to solve this problem by providing a simple and intuitive interface for using Redis as a vector database. @@ -32,7 +35,7 @@ RedisVL has a host of powerful features designed to streamline your vector datab Please note that this library is still under heavy development, and while you can quickly try RedisVL and deploy it in a production environment, the API may be subject to change at any time. -`pip install redisvl` +`pip install redisvl` (Coming Soon) ## Example Usage diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md index 6ce2f868..e039d7b9 100644 --- a/docs/user_guide/index.md +++ b/docs/user_guide/index.md @@ -25,7 +25,7 @@ hybrid_queries_02 :caption: Providers :maxdepth: 3 -embedding_creation +providers_03 ``` ```{toctree} diff --git a/docs/user_guide/providers_03.ipynb b/docs/user_guide/providers_03.ipynb new file mode 100644 index 00000000..c83fa752 --- /dev/null +++ b/docs/user_guide/providers_03.ipynb @@ -0,0 +1,265 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embedding Providers\n", + "\n", + "In this notebook, we will show how to use RedisVL to create embeddings using the built-in Providers. Today RedisVL supports:\n", + "1. OpenAI\n", + "2. HuggingFace\n", + "\n", + "Before running this notebook, be sure to\n", + "1. Have installed ``redisvl`` and have that environment active for this notebook.\n", + "2. 
Have a running Redis instance with RediSearch > 2.4.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# import necessary modules\n", + "import os\n", + "from redisvl.utils.utils import array_to_buffer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating Embeddings\n", + "\n", + "This example will show how to create an embedding from 3 simple sentences with a number of different providers.\n", + "\n", + "- \"That is a happy dog\"\n", + "- \"That is a happy person\"\n", + "- \"Today is a sunny day\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### HuggingFace\n", + "\n", + "HuggingFace is a popular NLP library that has a number of pre-trained models. RedisVL supports using HuggingFace to create embeddings from these models. To use HuggingFace, you will need to install the ``sentence-transformers`` library.\n", + "\n", + "```bash\n", + "pip install sentence-transformers\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.00037813105154782534,\n", + " -0.05080341547727585,\n", + " -0.03514720872044563,\n", + " -0.023251093924045563,\n", + " -0.04415826499462128,\n", + " 0.020487893372774124,\n", + " 0.0014619074063375592,\n", + " 0.03126181662082672,\n", + " 0.056051574647426605,\n", + " 0.0188154224306345]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + "from redisvl.providers import HuggingfaceProvider\n", + "\n", + "\n", + "# create a provider\n", + "hf = HuggingfaceProvider(model=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "# embed a sentence\n", + "test = hf.embed(\"This is a test sentence.\")\n", + "test[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + 
"outputs": [], + "source": [ + "# You can also create many embeddings at once\n", + "\n", + "sentences = [\n", + " \"That is a happy dog\",\n", + " \"That is a happy person\",\n", + " \"Today is a sunny day\"\n", + "]\n", + "\n", + "embeddings = hf.embed_many(sentences)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Search with Provider Embeddings\n", + "\n", + "Now that we've created our embeddings, we can use them to search for similar sentences. We will use the same 3 sentences from above and search for similar sentences.\n", + "\n", + "First, we need to create the schema for our index.\n", + "\n", + "Here's what the schema for the example looks like in yaml for the HuggingFace Provider\n", + "\n", + "```yaml\n", + "index:\n", + " name: providers\n", + " prefix: rvl\n", + " storage_type: hash\n", + "\n", + "fields:\n", + " text:\n", + " - name: sentence\n", + " vector:\n", + " - name: embedding\n", + " dims: 768\n", + " algorithm: flat\n", + " distance_metric: cosine\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.index import SearchIndex\n", + "\n", + "# construct a search index from the schema\n", + "index = SearchIndex.from_yaml(\"./schema.yaml\")\n", + "\n", + "# connect to local redis instance\n", + "index.connect(\"redis://localhost:6379\")\n", + "\n", + "# create the index (no data yet)\n", + "index.create(overwrite=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m15:50:34\u001b[0m \u001b[35msam.partee-NW9MQX5Y74\u001b[0m \u001b[34mredisvl.cli.index[33382]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", + "\u001b[32m15:50:34\u001b[0m \u001b[35msam.partee-NW9MQX5Y74\u001b[0m \u001b[34mredisvl.cli.index[33382]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. 
providers\n" + ] + } + ], + "source": [ + "# use the CLI to see the created index\n", + "!rvl index listall" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# load expects an iterable of dictionaries where\n", + "# the vector is stored as a bytes buffer\n", + "\n", + "data = [{\"text\": t,\n", + " \"embedding\": array_to_buffer(v)}\n", + " for t, v in zip(sentences, embeddings)]\n", + "\n", + "index.load(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "That is a happy dog\n", + "0.160862445831\n", + "That is a happy person\n", + "0.273598074913\n", + "Today is a sunny day\n", + "0.744559526443\n" + ] + } + ], + "source": [ + "from redisvl.query import VectorQuery\n", + "\n", + "# use the HuggingFace Provider again to create a query embedding\n", + "query_embedding = hf.embed(\"That is a happy cat\")\n", + "\n", + "query = VectorQuery(\n", + " vector=query_embedding,\n", + " vector_field_name=\"embedding\",\n", + " return_fields=[\"text\"],\n", + " num_results=3\n", + ")\n", + "\n", + "results = index.search(query.query, query_params=query.params)\n", + "for doc in results.docs:\n", + " print(doc.text)\n", + " print(doc.vector_distance)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.13 ('redisvl2')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "9b1e6e9c2967143209c2f955cb869d1d3234f92dc4787f49f155f3abbdfb1316" + } + } + }, + 
"nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/user_guide/schema.yaml b/docs/user_guide/schema.yaml new file mode 100644 index 00000000..c96ac23f --- /dev/null +++ b/docs/user_guide/schema.yaml @@ -0,0 +1,14 @@ + +index: + name: providers + prefix: rvl + storage_type: hash + +fields: + text: + - name: sentence + vector: + - name: embedding + dims: 768 + algorithm: flat + distance_metric: cosine \ No newline at end of file diff --git a/redisvl/index.py b/redisvl/index.py index e2040fde..6be401de 100644 --- a/redisvl/index.py +++ b/redisvl/index.py @@ -262,7 +262,6 @@ def load(self, data: Iterable[Dict[str, Any]], **kwargs): raise TypeError("data must be an iterable of dictionaries") for record in data: - # TODO don't use colon if no prefix key = f"{self._prefix}:{self._get_key_field(record)}" self._redis_conn.hset(key, mapping=record) # type: ignore