In [1]:
from unstructured.partition.pdf import partition_pdf
from pathlib import Path

import weaviate
from weaviate.embedded import EmbeddedOptions
import os

## Where the DB is stored locally:

When Embedded Weaviate starts for the first time, it creates a permanent datastore in the location set in your persistence_data_path. When your client exits, the Embedded Weaviate instance also exits, but the data persists . The next time the client runs, it starts a new instance of Embedded Weaviate. New Embedded Weaviate instances use the data that is saved in the datastore.

## Data storage directory

If XDG_DATA_HOME is set, the default is: XDG_DATA_HOME/weaviate/

If XDG_DATA_HOME is not set, the default is: ~/.local/share/weaviate

In my case the data is stored in the following location: /Users/username/.local/share/weaviate

In [21]:
client = weaviate.Client(
    embedded_options=EmbeddedOptions(
        additional_env_vars={"X-HuggingFace-Api-Key": "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"}
    )
)
assert client.is_ready()

embedded weaviate is already listening on port 8079


## This is the structure of the data vector dabase: We called it PDF_Document. This is the "Class" that we are going to use to store the data. 


In [22]:
client.schema.delete_all()
# Create a new class with a vectorizer
schema = {
    "class": "PDF_Document",    
    "vectorizer": "text2vec-huggingface",
    "properties": [
        {
            "name": "content",  #What we want to vectorize
            "dataType": ["text"],
            "description": "Content of PDF",
            "moduleConfig": {
                "text2vec-huggingface": {"skip": False, "vectorizePropertyName": False}
            },
        },
        {
            "name": "filename",
            "dataType": ["text"],
            "description": "PDF filename"
        },
    ],
    "moduleConfig": {
    "text2vec-huggingface": {
      "model": "sentence-transformers/all-MiniLM-L6-v2",  # Can be any public or private Hugging Face model.
      "options": {
        "waitForModel": True,  # Try this if you get a "model not ready" error
      }
}
}
}

client.schema.create_class(schema)

{"level":"info","msg":"Created shard pdf_document_HuFNascIdOa9 in 3.401316ms","time":"2024-03-11T20:58:43-04:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-03-11T20:58:43-04:00","took":94044}


In [23]:
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.pdf import partition_pdf
from weaviate.util import generate_uuid5

In [41]:
def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):

    chunks = chunk_by_title(
        elements,
        multipage_sections=False, # If True, the title of the first page is used for all pages
        combine_text_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars
 
    )

    for i in range(len(chunks)):
        chunks[i] = {"text": chunks[i].text, "filename": chunks[i].metadata.filename}

    chunk_texts = [x['text'] for x in chunks]
    return chunks


def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    for filename in files:
        try:
            elements = partition_pdf(filename=filename)
            chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except IndexError as e:
            print(e)
            continue

        print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
        for i, chunk in enumerate(chunks):
            try:
                client.data_object.create(class_name="PDF_Document", data_object={"content": chunk['text'], "filename": filename})
            except Exception as e:
                print(e)
                print(f"Failed to upload chunk {i} for {str(filename)}.")

        with client.batch as batch:
            for data_object in chunks:
                batch.add_data_object(data_object={"content": chunk['text'], "filename": filename}, class_name="PDF_Document", uuid=generate_uuid5(data_object))

        
    client.batch.flush()







In [42]:
from weaviate import Client
import time
import uuid

def configure_batch(client: Client, batch_size: int, batch_target_rate: int):
    """
    Configure the weaviate client's batch so it creates objects at `batch_target_rate`.

    Parameters
    ----------
    client : Client
        The Weaviate client instance.
    batch_size : int
        The batch size.
    batch_target_rate : int
        The batch target rate as # of objects per second.
    """

    def callback(batch_results: dict) -> None:

        # you could print batch errors here
        time_took_to_create_batch = batch_size * (client.batch.creation_time/client.batch.recommended_num_objects)
        time.sleep(
            max(batch_size/batch_target_rate - time_took_to_create_batch + 1, 0)
        )

    client.batch.configure(
        batch_size=batch_size,
        timeout_retries=5,
        callback=callback,
    )

def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500, batch_size=10, batch_target_rate=2):
    configure_batch(client, batch_size, batch_target_rate)

    for filename in files:
        try:
            elements = partition_pdf(filename=filename)
            chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except IndexError as e:
            print(e)
            continue

        print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
        with client.batch as batch:
            for chunk in chunks:
                data_object = {"content": chunk['text'], "filename": filename}
                batch.add_data_object(data_object=data_object, class_name="PDF_Document", uuid=uuid.uuid5(uuid.NAMESPACE_DNS, str(data_object)))

    client.batch.flush()


## Add the files to the vector database

In [43]:

directory_path = '../data/coursematerial/'
import glob
# Dictionary to hold file names and their elements

# Find all PDF files in the specified directory
pdf_files = glob.glob(os.path.join(directory_path, '*.pdf'))
add_data_to_weaviate(
    files=pdf_files,
    client=client,
    chunk_under_n_chars=250,
    chunk_new_after_n_chars=500,
    batch_size=10,
    batch_target_rate=2
)



Uploading 151 chunks for ../data/coursematerial/lec06.pdf.
Uploading 123 chunks for ../data/coursematerial/lec12.pdf.
Uploading 59 chunks for ../data/coursematerial/lec07.pdf.
Uploading 60 chunks for ../data/coursematerial/lec11.pdf.
Uploading 87 chunks for ../data/coursematerial/lec05.pdf.
Uploading 215 chunks for ../data/coursematerial/lec04.pdf.
Uploading 63 chunks for ../data/coursematerial/lec10.pdf.
Uploading 77 chunks for ../data/coursematerial/lec01.pdf.
Uploading 90 chunks for ../data/coursematerial/lec03.pdf.
Uploading 67 chunks for ../data/coursematerial/lec02.pdf.
Uploading 67 chunks for ../data/coursematerial/lec09.pdf.
Uploading 87 chunks for ../data/coursematerial/lec08.pdf.


## Cells below are two examples of queries to the database to get the data you need.

In [46]:
client = weaviate.Client("http://localhost:8079")
# Perform a query
query1 = """
{
  Get {
    PDF_Document (limit: 10) {
      content
      filename
    }
  }
}
"""
result = client.query.raw(query1)
print(result)



{'data': {'Get': {'PDF_Document': [{'content': 'Separating Hyperplanes\n\nSuppose we are given these data points from two diﬀerent classes and want to ﬁnd a linear classiﬁer that separates them.\n\nIntro ML (UofT)\n\nSTA314-Lec6\n\n5 / 44\n\nSeparating Hyperplanes', 'filename': '../data/coursematerial/lec06.pdf'}, {'content': 'sha1_base64="19E7QQ3SzuFSITLhYdfln84cJaQ=">AAACQnicdVDNTsJAGNzFP8Q/0KOXRqLxRFoueiRy8YiJgAk0ZLvdlpX9aXa3JqThHbzq8/gSvoI349WDS+lBIEzyJZOZb5LJBAmj2rjuJyxtbe/s7pX3KweHR8cn1dppT8tUYdLFkkn1FCBNGBWka6hh5ClRBPGAkX4wac/9/gtRmkrxaKYJ8TmKBY0oRsZKPY5UTMWoWncbbg5nnXgFqYMCnVENXg1DiVNOhMEMaT3w3MT4GVKGYkZmlWGqSYLwBMVkYKlAnGg/y+vOnEurhE4klT1hnFz9n8gQ13rKA/vJkRnrVW8ubvLMmM+WNRZLRa1M8QZjpa2Jbv2MiiQ1ROBF2ShljpHOfD8npIpgw6aWIGzzFDt4jBTCxq5cGebBrC05RyLUM7ust7rjOuk1G57b8B6a9dZdsXEZnIMLcA08cANa4B50QBdg8AxewRt4hx/wC37Dn', 'filename': '../data/coursematerial/lec06.pdf'}, {'content': 'z = w(cid:62)x + b y = sign(z)\n\nThis is an equivalent formulation of binary linear classiﬁcation.\n\nLas

In [45]:
query2 = """
{
  Aggregate {
    PDF_Document {
      meta {
        count
      }
    }
  }
}
"""

result = client.query.raw(query2)
print(result)

{'data': {'Aggregate': {'PDF_Document': [{'meta': {'count': 17}}]}}}
