In [6]:
from unstructured.partition.pdf import partition_pdf
from pathlib import Path

import weaviate
from weaviate.embedded import EmbeddedOptions
import os

In [7]:
# Start (or attach to) an embedded Weaviate instance.
# The Hugging Face token is read from the HUGGINGFACE_API_KEY environment
# variable so the secret never lives in the notebook; a KeyError here means the
# variable was not exported before the kernel was started.
# SECURITY: a token was previously committed in this cell and should be revoked.
# NOTE(review): Weaviate's docs pass this key as a client request header
# (`additional_headers`) rather than as a server env var -- confirm which of the
# two the embedded instance actually honours.
client = weaviate.Client(
    embedded_options=EmbeddedOptions(
        additional_env_vars={"X-HuggingFace-Api-Key": os.environ["HUGGINGFACE_API_KEY"]}
    )
)
assert client.is_ready()

embedded weaviate is already listening on port 8079


            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [8]:
# Start from an empty schema so this cell can be re-run safely.
# WARNING: this removes every class (and its objects) from the instance.
client.schema.delete_all()

# Definition of the "Test" class, vectorized through the Hugging Face module.
schema = {
    "class": "Test",
    "vectorizer": "text2vec-huggingface",
    "properties": [
        {
            # Chunk text; this is the property that gets vectorized.
            "name": "content",
            "dataType": ["text"],
            "description": "Content of PDF",
            "moduleConfig": {
                "text2vec-huggingface": {
                    "skip": False,
                    "vectorizePropertyName": False,
                }
            },
        },
        {
            "name": "filename",
            "dataType": ["text"],
            "description": "PDF filename",
        },
    ],
    "moduleConfig": {
        "text2vec-huggingface": {
            # Any public or private Hugging Face model can be named here.
            "model": "sentence-transformers/all-MiniLM-L6-v2",
            "options": {
                # Helps when the API answers with a "model not ready" error.
                "waitForModel": True,
            },
        }
    },
}

client.schema.create_class(schema)

In [9]:
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.pdf import partition_pdf
from weaviate.util import generate_uuid5

In [10]:
!pip install ratelimit




In [11]:

from ratelimit import limits, sleep_and_retry
# Call budget passed as `calls=` to ratelimit's `limits` decorator; the only
# use visible in this notebook is a commented-out decorator.
RATE_LIMIT = 1


In [12]:
def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Group partitioned elements into title-based chunks and flatten them to dicts.

    Parameters
    ----------
    elements : iterable
        Document elements to chunk (this notebook passes the output of `partition_pdf`).
    chunk_under_n_chars : int
        Forwarded to `chunk_by_title` as `combine_text_under_n_chars`.
    chunk_new_after_n_chars : int
        Forwarded to `chunk_by_title` as `new_after_n_chars`.

    Returns
    -------
    list of dict
        One `{"text": ..., "filename": ...}` dict per chunk, where `text` is the
        chunk's text and `filename` comes from the chunk's metadata.
    """
    chunks = chunk_by_title(
        elements,
        # Per the unstructured docs, False keeps a section from spanning
        # several pages -- NOTE(review): confirm for the installed version.
        multipage_sections=False,
        combine_text_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars,
    )

    return [
        {"text": chunk.text, "filename": chunk.metadata.filename}
        for chunk in chunks
    ]

#@sleep_and_retry
#@limits(calls=RATE_LIMIT, period=1)
def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Partition each PDF, chunk it, and upload every chunk once to the "Test" class.

    Each chunk is sent exactly once through the client's batch interface, with
    a UUID derived from the uploaded payload so re-running the import does not
    create duplicates of the same chunk.

    NOTE: a later cell in this notebook redefines `add_data_to_weaviate` with
    batch throttling; once that cell has run, this definition is shadowed.

    Parameters
    ----------
    files : iterable of str or path-like
        PDF files to ingest.
    client : weaviate.Client
        Connected Weaviate client.
    chunk_under_n_chars : int
        Passed through to `get_chunks`.
    chunk_new_after_n_chars : int
        Passed through to `get_chunks`.
    """
    for filename in files:
        try:
            elements = partition_pdf(filename=filename)
            chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except IndexError as e:
            # Best effort: report the failure for this file and move on to the next one.
            print(e)
            continue

        print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
        with client.batch as batch:
            for chunk in chunks:
                # Build the payload from the chunk being iterated; str() keeps
                # the payload serializable when `files` holds Path objects.
                data_object = {"content": chunk['text'], "filename": str(filename)}
                batch.add_data_object(
                    data_object=data_object,
                    class_name="Test",
                    uuid=generate_uuid5(data_object),
                )

    # Send anything still buffered in the batch.
    client.batch.flush()







In [13]:
from weaviate import Client
import time
import uuid

def configure_batch(client: Client, batch_size: int, batch_target_rate: int):
    """
    Configure the weaviate client's batch so it creates objects at `batch_target_rate`.

    After each batch is sent, the installed callback first prints any
    per-object errors found in the batch response and then sleeps long enough
    to keep the import near the target rate.

    Parameters
    ----------
    client : Client
        The Weaviate client instance.
    batch_size : int
        The batch size.
    batch_target_rate : int
        The batch target rate as # of objects per second.
    """

    def report_errors(batch_results) -> None:
        # Passing a custom callback means this function is the only place the
        # batch response is inspected, so failed objects must be surfaced here
        # or they disappear silently.
        # Assumes the v3 client's response shape: a list of dicts carrying
        # result -> errors -> error (as in weaviate.util.check_batch_result).
        if batch_results is None:
            return
        for item in batch_results:
            if not isinstance(item, dict):
                continue
            errors = (item.get("result") or {}).get("errors")
            if errors and "error" in errors:
                print(errors)

    def callback(batch_results: dict) -> None:
        report_errors(batch_results)

        # Estimated time spent creating this batch, scaled from the client's
        # own creation_time / recommended_num_objects figures.
        time_took_to_create_batch = batch_size * (client.batch.creation_time/client.batch.recommended_num_objects)
        # Sleep for whatever remains of the per-batch time budget, plus one
        # second of extra margin; never a negative duration.
        time.sleep(
            max(batch_size/batch_target_rate - time_took_to_create_batch + 1, 0)
        )

    client.batch.configure(
        batch_size=batch_size,
        timeout_retries=5,
        callback=callback,
    )

def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500, batch_size=10, batch_target_rate=2):
    """Ingest PDFs into the "Test" class using a rate-limited batch import.

    The client's batch is configured through `configure_batch` first. Every
    file is then partitioned, chunked, and its chunks are queued on the batch
    with a UUID5 derived from the string form of the uploaded payload.

    Parameters
    ----------
    files : iterable
        PDF files to ingest.
    client : weaviate.Client
        Connected Weaviate client.
    chunk_under_n_chars, chunk_new_after_n_chars : int
        Chunking limits handed to `get_chunks`.
    batch_size, batch_target_rate : int
        Batch settings handed to `configure_batch`.
    """
    configure_batch(client, batch_size, batch_target_rate)

    for filename in files:
        try:
            chunks = get_chunks(
                partition_pdf(filename=filename),
                chunk_under_n_chars,
                chunk_new_after_n_chars,
            )
        except IndexError as err:
            # Report the problem and carry on with the remaining files.
            print(err)
            continue

        print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
        with client.batch as writer:
            # Payloads are produced lazily, one per chunk, in chunk order.
            payloads = ({"content": piece['text'], "filename": filename} for piece in chunks)
            for payload in payloads:
                writer.add_data_object(
                    data_object=payload,
                    class_name="Test",
                    uuid=uuid.uuid5(uuid.NAMESPACE_DNS, str(payload)),
                )

    # Push out whatever is still sitting in the batch buffer.
    client.batch.flush()


In [14]:

import glob

# Directory holding the course PDFs to ingest (relative to the notebook's working directory).
directory_path = 'data/coursematerial/test_data'

# Find all PDF files in the specified directory. glob returns them in
# arbitrary, filesystem-dependent order, so sort for a reproducible run.
pdf_files = sorted(glob.glob(os.path.join(directory_path, '*.pdf')))

# Chunk and upload every PDF, throttled to roughly `batch_target_rate` objects per second.
add_data_to_weaviate(
    files=pdf_files,
    client=client,
    chunk_under_n_chars=250,
    chunk_new_after_n_chars=500,
    batch_size=10,
    batch_target_rate=2
)



Uploading 77 chunks for data/coursematerial/test_data/lec01.pdf.
Uploading 90 chunks for data/coursematerial/test_data/lec03.pdf.
Uploading 67 chunks for data/coursematerial/test_data/lec02.pdf.




In [16]:
# Raw GraphQL query: Aggregate over the "Test" class and request its meta count.
query2 = """
{
  Aggregate {
    Test {
      meta {
        count
      }
    }
  }
}
"""

# Send the query through the client and print the raw response.
result = client.query.raw(query2)
print(result)

{'data': {'Aggregate': {'Test': [{'meta': {'count': 20}}]}}}
