<a href="https://colab.research.google.com/github/ramaraweera/introduction-to-ai-native-vector-databases-4470531/blob/main/weaviate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai==0.28.0 weaviate-client langchain langchain-community pypdf

Collecting openai==0.28.0
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Collecting weaviate-client
  Downloading weaviate_client-4.18.1-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf
  Downloading pypdf-6.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting validators<1.0.0,>=0.34.0 (from weaviate-client)
  Downloading validators-0.35.0-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests>=2.20 (from openai==0.28.0)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloa

In [2]:
from IPython.display import HTML, display

def set_css():
    """Display an HTML <style> block that sets `white-space: pre-wrap` on <pre> elements."""
    wrap_pre_style = HTML('''
    <style>
    pre {
        white-space: pre-wrap;
    }
    </style>
    ''')
    display(wrap_pre_style)

# Hook set_css onto IPython's 'pre_run_cell' event so the CSS is emitted again for each executed cell.
get_ipython().events.register('pre_run_cell', set_css)


In [3]:
import openai
import os
from google.colab import userdata
from typing import List, Optional
import weaviate
import datetime
import json
from weaviate.auth import Auth
from weaviate.util import generate_uuid5
from weaviate.classes.query import Filter
import weaviate.classes as wvc
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader, UnstructuredURLLoader

In [4]:
def call_gpt4(system_prompt, user_prompt, chunks=None):
    """
    Send a chat completion request to OpenAI and return the reply text.

    Uses the legacy `openai.ChatCompletion` interface (openai < 1.0).

    Args:
        system_prompt (str): Content of the leading system message.
        user_prompt (str): Content of the user message.
        chunks (list[str], optional): Context passages. When non-empty they are
            joined with a separator line and appended as an additional system
            message after the user message.

    Returns:
        str: Content of the first choice's message.

    Raises:
        ValueError: If no OpenAI API key could be found.
    """
    # Prefer the environment variable and only consult the Colab secret store when
    # it is unset, so the env-var path does not depend on the secret being configured.
    api_key = os.getenv("OPENAI_API_KEY") or userdata.get('OPENAI_API_KEY')
    if not api_key:
        raise ValueError("OpenAI API key not found. Set the OPENAI_API_KEY environment variable.")
    openai.api_key = api_key

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # Add chunks if provided
    if chunks:
        # The separator sits on its own line so it is not glued to the end of the previous chunk.
        all_chunks = "\n---------\n".join(chunks)
        messages.append({"role": "system", "content": f"Use the following context to answer the question <Context>{all_chunks}</Context>"})

    # Echo the exact payload so the prompt sent to the model is visible in the cell output.
    print(json.dumps(messages, indent=4))
    response = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=messages
    )
    return response['choices'][0]['message']['content']

In [5]:
def init_pdf_collection(client: weaviate.WeaviateClient, collection_name: str) -> None:
    """
    Drop and recreate the collection that stores PDF chunks.

    Any existing collection called `collection_name` is deleted first, so all
    previously ingested objects in it are lost.

    Args:
        client: Connected Weaviate client (v4 collections API).
        collection_name: Name of the collection to (re)create.
    """
    client.collections.delete(collection_name)

    client.collections.create(
        name=collection_name,
        # Use the "text2vec-openai" module so Weaviate calls the OpenAI API for vectorization.
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
        # Data type reference: https://weaviate.io/developers/weaviate/config-refs/datatypes
        properties=[
            # The chunk text itself.
            wvc.config.Property(
                name="content",
                data_type=wvc.config.DataType.TEXT,
                vectorize_property_name=True,  # include the property name in the vectorized text
                tokenization=wvc.config.Tokenization.LOWERCASE
            ),
            # Timestamp written by ingest().
            wvc.config.Property(
                name="date",
                data_type=wvc.config.DataType.DATE,
                vectorize_property_name=False  # do not include the property name when vectorizing
            ),
            wvc.config.Property(
                name="page_number",
                data_type=wvc.config.DataType.INT,
                vectorize_property_name=False  # do not include the property name when vectorizing
            ),
            wvc.config.Property(
                name="total_pages",
                data_type=wvc.config.DataType.INT,
                vectorize_property_name=False  # do not include the property name when vectorizing
            ),
            # Source file of the chunk; search_weaviate() filters on this property.
            wvc.config.Property(
                name="file_path",
                data_type=wvc.config.DataType.TEXT,
                vectorize_property_name=True  # include the property name in the vectorized text
            ),
            wvc.config.Property(
                name="chunk_number",
                data_type=wvc.config.DataType.INT,
                vectorize_property_name=False  # do not include the property name when vectorizing
            ),
            wvc.config.Property(
                name="total_chunks",
                data_type=wvc.config.DataType.INT,
                vectorize_property_name=False  # do not include the property name when vectorizing
            )
        ]
    )

In [6]:
def ingest(file_path, collection):
    """
    Load a PDF, split each page into chunks and insert every chunk into Weaviate.

    Args:
        file_path: Location of the PDF, forwarded to load_pdf() and stored on
            every inserted object.
        collection: Weaviate collection exposing `data.insert`.

    Notes:
        `page_number` and `chunk_number` are stored zero-based, while the
        progress message prints them one-based.
    """
    # Load the PDF (synchronously) as one document per page.
    documents = load_pdf(file_path)
    total_pages = len(documents)

    for page_index, doc in enumerate(documents):
        chunks = chunk_text(doc.page_content)
        total_chunks = len(chunks)
        # Store chunks in Weaviate
        for chunk_index, chunk in enumerate(chunks):
            print(f'Processing {file_path} page {page_index+1} chunk {chunk_index+1} of {total_chunks}')
            data_object = {
                # Take the time in UTC: the format ends in a literal "Z", so a
                # naive local time would be mislabelled outside UTC environments.
                "date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
                "content": chunk,
                "page_number": page_index,
                "file_path": file_path,
                "total_pages": total_pages,
                "chunk_number": chunk_index,
                "total_chunks": total_chunks
            }
            collection.data.insert(data_object)

In [7]:
def get_weaviate_client() -> weaviate.WeaviateClient:
    """
    Connect to a Weaviate Cloud cluster using credentials from Colab secrets.

    Reads the 'WEAVIATE_URL', 'WEAVIATE_API_KEY' and 'OPENAI_API_KEY' secrets.
    The OpenAI key is forwarded as a request header so the cluster can call
    OpenAI on the notebook's behalf.

    Returns:
        weaviate.WeaviateClient: The connected client. The caller is
        responsible for calling `close()` on it when done.
    """
    weaviate_url = userdata.get('WEAVIATE_URL')
    api_key = userdata.get('WEAVIATE_API_KEY')
    openai_key = userdata.get('OPENAI_API_KEY')

    return weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(api_key),
        headers={'X-OpenAI-Api-key': openai_key}
    )

In [8]:
def load_pdf(file_path: str) -> list:
    """
    Load a PDF into a list of documents using PyPDFLoader.

    Args:
        file_path: Location of the PDF; passed to PyPDFLoader unchanged.

    Returns:
        list: The documents returned by `PyPDFLoader.load()`. The single PDF is
        split into multiple documents (ingest() treats each one as a page).

    Any exception raised by the loader propagates to the caller.
    """
    # No temporary file is created here, so there is nothing to clean up afterwards.
    loader = PyPDFLoader(file_path)
    return loader.load()

In [9]:
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Splits text into overlapping chunks of roughly the specified size.

    Each chunk boundary is nudged to a nearby sentence end ('.') when one lies
    within a small window around the nominal boundary; otherwise it is moved
    forward to the next space, if any. Consecutive chunks share `overlap`
    characters.

    Args:
        text (str): The input text to be chunked
        chunk_size (int): Nominal size of each chunk in characters (a chunk may
            be slightly longer or shorter after boundary adjustment)
        overlap (int): Number of characters to overlap between chunks

    Returns:
        List[str]: List of non-empty, whitespace-stripped text chunks

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size (which would prevent forward progress).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    # Window (in characters) searched on either side of the nominal boundary for
    # a period. Capped at half the stride so a boundary pulled backwards can
    # never stall the loop; equals 50 for the default arguments.
    lookaround = min(50, (chunk_size - overlap) // 2)

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = min(start + chunk_size, text_length)

        if end < text_length:
            next_period = text.find('.', end - lookaround, end + lookaround)
            if next_period != -1:
                end = next_period + 1
            else:
                next_space = text.find(' ', end)
                if next_space != -1:
                    end = next_space

        chunk = text[start:end].strip()
        # Whitespace-only slices carry no content worth storing or embedding.
        if chunk:
            chunks.append(chunk)

        # Once the end of the text is reached, stop: stepping back by `overlap`
        # would only emit a tail that is already contained in this chunk.
        if end >= text_length:
            break
        start = end - overlap

    return chunks

In [10]:
def search_weaviate(**kwargs):
    """
    Run a near-text (vector) search against a Weaviate collection.

    Args:
        **kwargs: Keyword arguments
            search_term (str): The search query text (required)
            collection: Weaviate collection to query (required)
            file_path (str, optional): When given, only objects whose
                "file_path" property equals this value are returned
            limit (int, optional): Maximum number of results to return (default: 10)

    Returns:
        The object returned by `collection.query.near_text`; matches are
        available on its `objects` attribute.

    Raises:
        ValueError: If `search_term` or `collection` is missing.
    """
    search_term = kwargs.get('search_term')
    limit = kwargs.get('limit', 10)
    collection = kwargs.get('collection')
    file_path = kwargs.get('file_path')

    if search_term is None or collection is None:
        raise ValueError("search_weaviate requires 'search_term' and 'collection'")

    # Only restrict by file when a path was supplied; filtering on None would not
    # express "no restriction".
    filters = None
    if file_path is not None:
        filters = Filter.by_property("file_path").equal(file_path)

    return collection.query.near_text(
        query=search_term,
        limit=limit,  # honour the caller's limit instead of a hard-coded 10
        filters=filters
    )

In [11]:
# Smoke test: call the model with no retrieved context (chunks is omitted) and show the reply.
llmtest = call_gpt4("You are a helpful assistant.", "According to the Hitch Hikers Guide to the galaxy. What is the Answer to everything?")
print(llmtest)

[
    {
        "role": "system",
        "content": "You are a helpful assistant."
    },
    {
        "role": "user",
        "content": "According to the Hitch Hikers Guide to the galaxy. What is the Answer to everything?"
    }
]
According to Douglas Adams' science fiction series "The Hitchhiker's Guide to the Galaxy," the Answer to the Ultimate Question of Life, the Universe, and Everything is simply the number 42. This enigmatic and humorous answer is provided by an advanced supercomputer named Deep Thought after seven and a half million years of computation. However, the actual Ultimate Question itself is unknown, which prompts further comedic exploration in the story.


In [12]:
# Open the Weaviate connection used by the following cells; it is closed in the last cell.
wclient = get_weaviate_client()

In [13]:
# Name of the Weaviate collection that stores the PDF chunks.
collection_name="pdf"
# Recreate the collection: init_pdf_collection deletes any collection with this name before creating it.
init_pdf_collection(wclient, collection_name)

In [14]:
# Handle to the collection, passed to ingest() and search_weaviate() below.
collection = wclient.collections.get(collection_name)

In [16]:
# NOTE(review): hard-coded absolute path — presumably the PDF was placed in the Colab runtime manually; confirm it exists before running.
file_path = "/content/sample_data/Promoga Help CenterFAQ.pdf"
# Load, chunk and insert the PDF into the collection (prints one progress line per chunk).
ingest(file_path, collection)


Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 1 chunk 1 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 1 chunk 2 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 2 chunk 1 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 2 chunk 2 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 3 chunk 1 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 3 chunk 2 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 4 chunk 1 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 4 chunk 2 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 5 chunk 1 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 5 chunk 2 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 6 chunk 1 of 2
Processing /content/sample_data/Promoga Help CenterFAQ.pdf page 6 chunk 2 of 2
Processing /content/sample_data/Promoga Help CenterF

In [17]:
# Retrieve the chunks most similar to the question, restricted to the ingested file.
results = search_weaviate(search_term="How can I get a refund?", collection=collection, file_path=file_path)
# Chunk texts collected here are passed to the LLM as context in a later cell.
llm_chunks = []
# Test the list of matches rather than the result wrapper, so the empty case is actually reported.
if results.objects:
    for chunk in results.objects:
        content = chunk.properties.get('content')
        print(content)
        llm_chunks.append(content)
        print('----------------------------------')
else:
    print("No results found.")

mation. Sometimes, receipts might be delayed due
to heavy site traffic; please allow up to 24 hours. You can check your purchase information in 
My Purchases or manage your reservations under My Reservations.
What happens if the instructor or organizer cancels my 
activity?
If you’re notified of a cancellation, contact the instructor via My Purchases > Options > 
Message Instructor from the profile drop-down.
----------------------------------
Term-plans-limited-features
If the instructor chooses to offer term passes (30 day, 3, 6, or 12 months), the bookings and 
reservation rules are the sole responsibility of the instructor. Currently, Promoga does not 
provide tracking and reservations for term pricing passes sold on Promoga. As noted in 
section 3 of the My Studio page, term passes sold should be tracked by the instructor.
Payments and Charges
What debit/credit cards do you accept?
We accept Visa, Mastercard, Visa & MC debit, and Amex.
Can I cancel a class I have paid for?
Yes, yo

In [20]:
# Ask the same question again, this time passing the retrieved chunks as context for the model.
answer = call_gpt4("You are a helpful assistant.", "How can I get a refund?",llm_chunks)
# Blank line and banner separate the echoed prompt (printed inside call_gpt4) from the answer.
print('')
print("LLMS ANSWER -------------------")
print(answer)

[
    {
        "role": "system",
        "content": "You are a helpful assistant."
    },
    {
        "role": "user",
        "content": "How can I get a refund?"
    },
    {
        "role": "system",
        "content": "Use the following context to answer the question <Context>mation. Sometimes, receipts might be delayed due\nto heavy site traffic; please allow up to 24 hours. You can check your purchase information in \nMy Purchases or manage your reservations under My Reservations.\nWhat happens if the instructor or organizer cancels my \nactivity?\nIf you\u2019re notified of a cancellation, contact the instructor via My Purchases > Options > \nMessage Instructor from the profile drop-down.---------\nTerm-plans-limited-features\nIf the instructor chooses to offer term passes (30 day, 3, 6, or 12 months), the bookings and \nreservation rules are the sole responsibility of the instructor. Currently, Promoga does not \nprovide tracking and reservations for term pricing passes sold 

In [22]:
# Close the Weaviate connection opened by get_weaviate_client().
wclient.close()