<a href="https://colab.research.google.com/github/rexian/ML/blob/main/langchain/vectorDB/document_embedding_with_weaviate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Weaviate is an open-source, AI-native vector database designed to store both objects and vectors.

Key features:


*   Vector Search: Combines vector search with structured filtering, allowing for precise and contextual search results.
*   Hybrid Search: Merges vector search with keyword search techniques to deliver accurate results across various data modalities.
*   Generative AI Integration: Supports Retrieval-Augmented Generation (RAG) to build trustworthy generative AI applications using your own data.
*   Customization: Offers flexible schema definition and supports custom modules for integration with popular services and model hubs.
*   Scalability: Built with cloud-native architecture, ensuring fault tolerance, scalability, and production-readiness.

In [None]:
!pip install weaviate-client
!pip install langchain
!pip install openai

In [None]:
import os

# Credentials and the cluster URL are read from environment variables so that
# real secrets never get saved into (and committed with) the notebook.
# Export OPENAI_API_KEY, WEAVIATE_API_KEY and WEAVIATE_CLUSTER before starting
# the kernel; each one falls back to the empty string when it is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY", "")
WEAVIATE_CLUSTER = os.environ.get("WEAVIATE_CLUSTER", "")

In [None]:
!pip install unstructured
!pip install "unstructured[pdf]"

In [None]:
from langchain.document_loaders import DirectoryLoader

# Load every file under ./spaul (searched recursively) that matches *.pdf.
loader = DirectoryLoader(
    "./spaul",
    glob="**/*.pdf",
)
data = loader.load()

Split text into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into smaller pieces: chunk_size=1000 with an
# overlap of 20 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)
docs = text_splitter.split_documents(data)

Create the embedding model

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# OpenAI embedding model wrapper, authenticated with the key from the config cell.
# NOTE(review): `embeddings` is not referenced by any later cell in this
# notebook; the schema cell configures Weaviate's own `text2vec-openai` module.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

Create vector database client

In [None]:
import weaviate
from langchain.vectorstores import Weaviate

# Connect to the Weaviate cluster.
# The cluster address comes from the config cell; the API key authenticates
# against the cluster, and the OpenAI key is forwarded as a request header.
WEAVIATE_URL = WEAVIATE_CLUSTER
auth_config = weaviate.auth.AuthApiKey(api_key=WEAVIATE_API_KEY)

client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=auth_config,
    additional_headers={"X-OpenAI-Api-key": OPENAI_API_KEY},
    startup_period=10,
)

In [None]:
# Readiness check on the client created above; the call's return value is
# displayed as this cell's output.
client.is_ready()

Define the schema (class and properties) for the stored documents

In [None]:
# Schema with a single "Chatbot" class. The class is vectorized by the
# `text2vec-openai` module and has one text property, "content", which is
# included in vectorization (skip=False) without its property name.
schema = {
    "classes": [
        {
            "class": "Chatbot",
            "description": "Documents for chatbot",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

# Make the cell re-runnable by dropping only stale copies of the classes defined
# above. Unlike `delete_all()`, this leaves any unrelated classes that share the
# cluster untouched.
existing_classes = {cls["class"] for cls in client.schema.get().get("classes", [])}
for class_def in schema["classes"]:
    if class_def["class"] in existing_classes:
        client.schema.delete_class(class_def["class"])

client.schema.create(schema)

# LangChain vector store bound to the "Chatbot" class, using "content" as the
# text property and requesting the "source" attribute with results.
vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])

Load the document chunks into the vector store (Weaviate vectorizes them with the `text2vec-openai` module configured in the schema)

In [None]:
# Separate each chunk into its text and its metadata, keeping the two lists
# aligned by position.
# NOTE: `docs` must still be the split chunks here; the query cell further
# down rebinds `docs` to search results, so re-run the split cell first when
# executing this cell out of order.
texts = [doc.page_content for doc in docs]
meta = [doc.metadata for doc in docs]

# Fail with a clear message instead of a cryptic unpacking error when the
# loader or splitter produced nothing.
if not texts:
    raise ValueError("No document chunks to upload: `docs` is empty.")

vectorstore.add_texts(texts, meta)

In [None]:
query = "What is Natural Language Processing?"
# Retrieve the 10 chunks most related to the query. The result count is
# controlled by `k`; an unknown keyword such as `top_k` is swallowed by
# **kwargs and the default of 4 results would be returned instead.
# NOTE: this rebinds `docs`, which previously held the split chunks.
docs = vectorstore.similarity_search(query, k=10)

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

Define the question-answering chain and run the query

In [None]:
# Build the LLM first (temperature 0), then wrap it in a "stuff"-type
# question-answering chain.
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

# Answer the query from the retrieved documents; the result is the cell output.
chain.run(question=query, input_documents=docs)