<a href="https://colab.research.google.com/github/seanreed1111/colab-demos/blob/master/my_embedchain_demo_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embedchain

Embedchain is a framework to easily create LLM powered bots over any dataset.

Here is a very simple demo of how it works!

First of all we install the dependencies:

In [1]:
!pip install -qqq --upgrade embedchain loguru python-dotenv tiktoken

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.5/75.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.5/415.5 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m271.9/271.9 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Now we import the dependencies:

In [2]:
import os, sys, json
from langchain.embeddings import OpenAIEmbeddings
from embedchain import App, CustomApp
from embedchain.config import(AppConfig,
                              CustomAppConfig,
                              AddConfig,
                              QueryConfig,
                              ChatConfig,
                              ChunkerConfig
                              )
from embedchain.models import Providers, EmbeddingFunctions
from string import Template
from typing import Optional

from dotenv import load_dotenv

# Load environment variables from the Colab-local env file; the deployment
# names used later in this notebook are read from the environment via os.getenv.
load_dotenv("/content/deployment.env")
from loguru import logger
# Register stderr as a loguru sink at DEBUG level with a colorized
# "<time> <message>" line format. logger.add returns the new handler id.
logger.add(sys.stderr, colorize=True, format="<green>{time}</green> <level>{message}</level>", level="DEBUG")


1

In [16]:
# Inspect which embedding-function options EmbeddingFunctions exposes.
dir(EmbeddingFunctions)

['GPT4ALL',
 'HUGGING_FACE',
 'OPENAI',
 'VERTEX_AI',
 '__class__',
 '__doc__',
 '__members__',
 '__module__']

# I need to write my own config class, modeled on CustomAppConfig (it inherits from BaseAppConfig)

In [20]:
from typing import Any, Optional

from chromadb.api.types import Documents, Embeddings
from embedchain.config.apps.BaseAppConfig import BaseAppConfig
from embedchain.models import (EmbeddingFunctions, Providers, VectorDatabases,
                               VectorDimensions)

# load_dotenv("/content/deployment.env")

class MyAzureOpenAIAppConfig(BaseAppConfig):
    """
    App config for an embedchain `CustomApp` backed by (Azure) OpenAI embeddings.

    It differs from a plain custom config in that it always installs a LangChain
    `OpenAIEmbeddings` embedder built with an explicit `chunk_size`
    (`embedding_chunk_size`, default 16) and, optionally, an Azure deployment name.
    """

    def __init__(
        self,
        embedding_chunk_size: int = 16,
        log_level: str = 'DEBUG',
        embedding_fn=None,
        db=None,
        host=None,
        port=None,
        id=None,
        collection_name=None,
        provider=Providers.OPENAI,
        deployment_name: Optional[str] = None,
        collect_metrics=None,
        db_type=None,
        es_config=None,
    ):
        """
        :param embedding_chunk_size: Optional. (Int) chunk size handed to
        `OpenAIEmbeddings`; the original author uses 16 to account for
        Azure OpenAI throttling.
        :param log_level: Optional. (String) Debug level
        ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
        :param embedding_fn: Optional. Accepted for signature compatibility only;
        the embedder is always built by `my_embedding_function`, so this value
        does not select the embedding function.
        :param db: Optional. (Vector) database to use for embeddings.
        :param host: Optional. Hostname for the database server.
        :param port: Optional. Port for the database server.
        :param id: Optional. ID of the app. Document metadata will have this id.
        :param collection_name: Optional. Collection name for the database.
        :param provider: Optional. (Providers): LLM Provider to use.
        :param deployment_name: Optional. (String) deployment name passed to
        `OpenAIEmbeddings`; when falsy, no deployment is passed.
        :param collect_metrics: Optional. Forwarded unchanged to `BaseAppConfig`.
        :param db_type: Optional. type of Vector database to use.
        :param es_config: Optional. elasticsearch database config to be used for connection
        :raises ValueError: if `provider` is falsy.
        """
        if provider:
            self.provider = provider
        else:
            raise ValueError("CustomApp must have a provider assigned.")

        super().__init__(
            log_level=log_level,
            embedding_fn=MyAzureOpenAIAppConfig.my_embedding_function(
                deployment_name=deployment_name,
                embedding_chunk_size=embedding_chunk_size,
            ),
            db=db,
            host=host,
            port=port,
            id=id,
            collection_name=collection_name,
            collect_metrics=collect_metrics,
            db_type=db_type,
            # The embedder installed above is always OpenAI-based, so the vector
            # dimension must be the OpenAI one regardless of the `embedding_fn`
            # hint. Use this class's own helper rather than CustomAppConfig's,
            # which derives the dimension from the hint (and the hint may be None).
            vector_dim=MyAzureOpenAIAppConfig.get_vector_dimension(
                embedding_function=embedding_fn
            ),
            es_config=es_config,
        )

    @staticmethod
    def langchain_default_concept(embeddings: Any):
        """
        Wrap a LangChain embeddings object in a plain callable.

        :param embeddings: object exposing `embed_documents(texts)`.
        :return: function mapping `Documents` to `Embeddings` by delegating to
        `embeddings.embed_documents`.
        """

        def embed_function(texts: Documents) -> Embeddings:
            return embeddings.embed_documents(texts)

        return embed_function

    @staticmethod
    def get_vector_dimension(embedding_function: EmbeddingFunctions):
        """
        Return the vector dimension used by this config.

        :param embedding_function: ignored; kept so the signature parallels
        `CustomAppConfig.get_vector_dimension`.
        :return: `VectorDimensions.OPENAI.value`, since this config only ever
        builds OpenAI embeddings.
        """
        return VectorDimensions.OPENAI.value

    @staticmethod
    def my_embedding_function(deployment_name: str = None,
                              embedding_chunk_size: int = 16
                              ):
        """
        Build the embedding callable backed by LangChain's `OpenAIEmbeddings`.

        :param deployment_name: Optional. Deployment to pass to `OpenAIEmbeddings`;
        omitted from the constructor call when falsy.
        :param embedding_chunk_size: `chunk_size` passed to `OpenAIEmbeddings`.
        :return: callable produced by `langchain_default_concept`.
        """
        if deployment_name:
            embeddings = OpenAIEmbeddings(deployment=deployment_name, chunk_size=embedding_chunk_size)
        else:
            embeddings = OpenAIEmbeddings(chunk_size=embedding_chunk_size)
        return MyAzureOpenAIAppConfig.langchain_default_concept(embeddings)

# Set up my custom app config

In [21]:
# MyAzureOpenAIAppConfig is used here because it passes embedding_chunk_size
# (default 16) into the embedding function explicitly.
@logger.catch
def get_azure_openai_app_config():
    """Build the MyAzureOpenAIAppConfig used by this demo's bot.

    The embedding deployment name is read from the EMBEDDING_DEPLOYMENT_NAME
    environment variable; the provider is Azure OpenAI and logging is DEBUG.
    """
    embedding_deployment = os.getenv("EMBEDDING_DEPLOYMENT_NAME")
    return MyAzureOpenAIAppConfig(
        deployment_name=embedding_deployment,
        log_level="DEBUG",
        provider=Providers.AZURE_OPENAI,
        embedding_fn=EmbeddingFunctions.OPENAI,
    )


# NOTE(review): with loguru's default logger.catch settings an exception in the
# factory is logged rather than re-raised, so CustomApp may receive None on
# failure -- confirm against the loguru documentation.
bot = CustomApp(get_azure_openai_app_config())

# Set up a default prompt

In [22]:
from string import Template

# Prompt text for the chat bot. The $context, $history and $query placeholders
# are string.Template fields, left unsubstituted here.
DEFAULT_PROMPT = """
  You are a chatbot having a conversation. You are given chat
  history and context.
  You need to answer the query considering context,
  chat history and your knowledge base.
  If you don't know the answer or the answer is neither contained in the context
  nor in history, then simply say "No idea, bro".

  $context

  History: $history

  Query: $query

  Helpful Answer:
"""  # noqa:E501

# Template wrapper around the prompt; this is what gets passed to ChatConfig.
TEMPLATE = Template(DEFAULT_PROMPT)

Now, add different data sources using embedchain's `.add()` method:

In [24]:
@logger.catch
def void_add_databases():
    """Add the demo's data sources to the global ``bot``.

    The two web pages are added with an AddConfig whose chunker uses
    chunk_size=300 and chunk_overlap=30; the YouTube video is added with
    "youtube_video" as the second positional argument and no custom config.
    """
    web_page_config = AddConfig(
        chunker=ChunkerConfig(chunk_size=300, chunk_overlap=30)
    )  # chunking settings for the text sources
    web_pages = (
        "https://en.wikipedia.org/wiki/A._W._Peet",
        "https://www.trinity.utoronto.ca/directory/peet-a-w/",
    )
    for page_url in web_pages:
        bot.add(page_url, config=web_page_config)
    bot.add("https://www.youtube.com/watch?v=gBYcM9fe8YA", "youtube_video")


void_add_databases()

Successfully saved https://en.wikipedia.org/wiki/A._W._Peet (DataType.WEB_PAGE). New chunks count: 25
Successfully saved https://www.trinity.utoronto.ca/directory/peet-a-w/ (DataType.WEB_PAGE). New chunks count: 5
Successfully saved https://www.youtube.com/watch?v=gBYcM9fe8YA (DataType.YOUTUBE_VIDEO). New chunks count: 1


# Set up the ChatConfig

In [25]:
# Retrieval and generation settings for the chat session.
NUMBER_DOCUMENTS = 3  # how many documents to retrieve from the database
MAX_TOKENS = 1500
TEMPERATURE = 0

# The notes on defaults below are the original author's; verify them against
# the installed embedchain version.
chat_config = ChatConfig(
    deployment_name=os.getenv("DEPLOYMENT_NAME"),
    model="gpt-3.5-turbo",
    template=TEMPLATE,
    number_documents=NUMBER_DOCUMENTS,  # parent class QueryConfig reportedly defaults to 1
    max_tokens=MAX_TOKENS,  # parent class QueryConfig reportedly defaults to 1000
    temperature=TEMPERATURE,  # parent class QueryConfig reportedly defaults to 0
    top_p=None,
    stream=False,
)

Your bot is ready now. Ask your bot any question using the `.query()` or `.chat()` methods.

If you have a `ChatConfig`, be sure to include it in the call to `.chat()`.

In [26]:
@logger.catch
def response():
    """Send the demo question to the global ``bot`` via ``.chat()`` using ``chat_config``.

    Returns whatever ``bot.chat`` returns.
    """
    question = "What are does Professor Peet research?"
    return bot.chat(question, config=chat_config)


# Last expression of the cell, so the notebook displays the returned answer.
response()

'As per the information available, Professor A.W. Peet is a physics professor at the University of Toronto and an affiliate of the Perimeter Institute for Theoretical Physics. However, there is no specific information available about their research area.'