In [73]:
from unstructured.partition.pdf import partition_pdf
from pathlib import Path

import weaviate
from weaviate.embedded import EmbeddedOptions
import os

In [74]:
client = weaviate.Client(
    embedded_options=EmbeddedOptions(
        additional_env_vars={"X-HuggingFace-Api-Key": "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"}
    )
)
#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
assert client.is_ready()

embedded weaviate is already listening on port 8079


In [75]:
client.schema.delete_all()
# Create a new class with a vectorizer
schema = {
    "class": "Test",    
    "vectorizer": "text2vec-huggingface",
    "properties": [
        {
            "name": "content",  #What we want to vectorize
            "dataType": ["text"],
            "description": "Content of PDF",
            "moduleConfig": {
                "text2vec-huggingface": {"skip": False, "vectorizePropertyName": False}
            },
        },
        {
            "name": "filename",
            "dataType": ["text"],
            "description": "PDF filename"
        },
    ],
    "moduleConfig": {
    "text2vec-huggingface": {
      "model": "sentence-transformers/all-MiniLM-L6-v2",  # Can be any public or private Hugging Face model.
      "options": {
        "waitForModel": True,  # Try this if you get a "model not ready" error
      }
}
}
}

client.schema.create_class(schema)

{"level":"info","msg":"Created shard test_JWRqUX6eGimL in 4.881169ms","time":"2024-03-30T15:41:55-04:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-03-30T15:41:55-04:00","took":103540}


In [71]:
import requests
import json
resp = requests.get('https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json')
data = json.loads(resp.text)  # Load data

client.batch.configure(batch_size=100)  # Configure batch
with client.batch as batch:  # Initialize a batch process
    for i, d in enumerate(data):  # Batch import data
        print(f"importing question: {i+1}")
        properties = {
            "answer": d["Answer"],
            "question": d["Question"],
            "category": d["Category"],
        }
        batch.add_data_object(
            data_object=properties,
            class_name="Question"
        )

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [1]:
response = (
    client.query
    .get("Question", ["question", "answer", "category"])

    .with_limit(10)
    .do()
)

print(json.dumps(response, indent=4))

NameError: name 'client' is not defined

In [76]:
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.pdf import partition_pdf
from weaviate.util import generate_uuid5

In [55]:
def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):

    chunks = chunk_by_title(
        elements,
        multipage_sections=False, # If True, the title of the first page is used for all pages
        combine_text_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars
 
    )

    for i in range(len(chunks)):
        chunks[i] = {"text": chunks[i].text, "filename": chunks[i].metadata.filename}

    chunk_texts = [x['text'] for x in chunks]
    return chunks

#@sleep_and_retry
#@limits(calls=RATE_LIMIT, period=1)
# def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
#     for filename in files:
#         try:
#             elements = partition_pdf(filename=filename)
#             chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
#         except IndexError as e:
#             print(e)
#             continue

#         print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
#         for i, chunk in enumerate(chunks):
#             try:
#                 client.data_object.create(class_name="Test", data_object={"content": chunk['text'], "filename": filename})
#             except Exception as e:
#                 print(e)
#                 print(f"Failed to upload chunk {i} for {str(filename)}.")

#         with client.batch as batch:
#             for data_object in chunks:
#                 batch.add_data_object(data_object={"content": chunk['text'], "filename": filename}, class_name="Test", uuid=generate_uuid5(data_object))

        
#     client.batch.flush()







In [83]:
from weaviate import Client
import time


def configure_batch(client: Client, batch_size: int, batch_target_rate: int):
    """
    Configure the weaviate client's batch so it creates objects at `batch_target_rate`.

    Parameters
    ----------
    client : Client
        The Weaviate client instance.
    batch_size : int
        The batch size.
    batch_target_rate : int
        The batch target rate as # of objects per second.
    """

    def callback(batch_results: dict) -> None:

        # you could print batch errors here
        time_took_to_create_batch = batch_size * (client.batch.creation_time/client.batch.recommended_num_objects)
        time.sleep(
            max(batch_size/batch_target_rate - time_took_to_create_batch + 1, 0)
        )

    client.batch.configure(
        batch_size=batch_size,
        timeout_retries=5,
        callback=callback,
    )


directory_path = 'data/coursematerial/test_data'
import glob
# Dictionary to hold file names and their elements

# Find all PDF files in the specified directory
pdf_files = glob.glob(os.path.join(directory_path, '*.pdf'))

configure_batch(client, batch_size=10, batch_target_rate=1)

for filename in pdf_files:
    try:
        elements = partition_pdf(filename=filename)
        chunks = get_chunks(elements, 100, 500)
    except IndexError as e:
        print(e)
        continue

    print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
    with client.batch as batch:
        for chunk in chunks:
            data_object = {"content": chunk['text'], "filename": filename}
            print(data_object)
            try:
                print("Uploading chunk")
                batch.add_data_object(data_object=data_object, class_name="Test")
            except Exception as e:
                print(e)
                print(f"Failed to upload chunk for {str(filename)}.")
       


Uploading 103 chunks for data/coursematerial/test_data/lec01.pdf.
{'content': 'STA 314: Statistical Methods for Machine Learning I Lecture 1 - Introduction and Nearest Neighbours\n\nChris J. Maddison1', 'filename': 'data/coursematerial/test_data/lec01.pdf'}
Uploading chunk
{'content': 'University of Toronto\n\n1Slides adapted from CSC 311.\n\nIntro ML (UofT)\n\nSTA314-Lec1\n\n1 / 65\n\nAre you happy with fully online tutorials?\n\nI sent out a survey to measure preferences for in-person tutorials. Please take a moment to ﬁll it out:', 'filename': 'data/coursematerial/test_data/lec01.pdf'}
Uploading chunk
{'content': 'https://forms.office.com/r/mzZ06k9Dfa\n\nYou need to sign into your UofT account to ﬁll it out.\n\nIntro ML (UofT)\n\nSTA314-Lec1\n\n2 / 65', 'filename': 'data/coursematerial/test_data/lec01.pdf'}
Uploading chunk
{'content': '5XptJ09dB)Register by 6:00 p.m. ET on Sep 20.\n\nvoiceJoin phase 2 of a DoSS pilot: The UndergraduateConsultative CommitteeStudentTime commitment: 2–



In [88]:
query2 = """
{
  Aggregate {
    Test {
      meta {
        count
      }
    }
  }
}
"""

result = client.query.raw(query2)
print(result)

{'data': {'Aggregate': {'Test': [{'meta': {'count': 2}}]}}}


In [87]:
import json
response = (
    client.query
    .get("Test", ["filename"])
    .with_limit(100)
    .do()
)

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "Test": [
                {
                    "filename": "data/coursematerial/test_data/lec01.pdf"
                },
                {
                    "filename": "data/coursematerial/test_data/lec01.pdf"
                }
            ]
        }
    }
}


