In [None]:
from unstructured.partition.pdf import partition_pdf
from pathlib import Path

import weaviate
from weaviate.embedded import EmbeddedOptions
import os

## Where the DB is stored locally:

When Embedded Weaviate starts for the first time, it creates a permanent datastore in the location set in your `persistence_data_path`. When your client exits, the Embedded Weaviate instance also exits, but the data persists. The next time the client runs, it starts a new instance of Embedded Weaviate. New Embedded Weaviate instances use the data that is saved in the datastore.

## Data storage directory

If XDG_DATA_HOME is set, the default is: XDG_DATA_HOME/weaviate/

If XDG_DATA_HOME is not set, the default is: ~/.local/share/weaviate

In my case the data is stored in the following location: /Users/username/.local/share/weaviate

In [None]:
# Read the Hugging Face token from the environment instead of hard-coding it in
# the notebook, where it is leaked to everyone who can read the file.
# NOTE(review): the token that was previously committed here should be revoked.
hf_api_key = os.environ.get("HUGGINGFACE_APIKEY")
if not hf_api_key:
    raise RuntimeError(
        "Set the HUGGINGFACE_APIKEY environment variable before running this notebook."
    )

client = weaviate.Client(
    embedded_options=EmbeddedOptions(
        # Environment variable name documented for the text2vec-huggingface module;
        # "X-HuggingFace-Api-Key" is the request *header* name, not an env var.
        additional_env_vars={"HUGGINGFACE_APIKEY": hf_api_key}
    ),
    # Header form of the same key, sent by this client with its requests.
    additional_headers={"X-HuggingFace-Api-Key": hf_api_key},
)
assert client.is_ready()

## This is the structure of the data in the vector database. We call it PDF_Document: this is the "Class" that we are going to use to store the data.


In [None]:
# Start from an empty schema: this removes every existing class from the instance.
client.schema.delete_all()

# Definition of the PDF_Document class, vectorized with the Hugging Face module.
schema = {
    "class": "PDF_Document",
    "vectorizer": "text2vec-huggingface",
    "properties": [
        {
            # The text of each chunk; this is the property we want vectorized.
            "name": "content",
            "dataType": ["text"],
            "description": "Content of PDF",
            "moduleConfig": {
                "text2vec-huggingface": {
                    "skip": False,
                    "vectorizePropertyName": False,
                },
            },
        },
        {
            "name": "filename",
            "dataType": ["text"],
            "description": "PDF filename",
        },
    ],
    "moduleConfig": {
        "text2vec-huggingface": {
            # Any public or private Hugging Face model can be named here.
            "model": "sentence-transformers/all-MiniLM-L6-v2",
            "options": {
                # Helps when the API answers with a "model not ready" error.
                "waitForModel": True,
            },
        },
    },
}

client.schema.create_class(schema)

In [None]:
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.pdf import partition_pdf
from weaviate.util import generate_uuid5

In [None]:
def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Chunk document elements by title and flatten each chunk to a plain dict.

    Parameters
    ----------
    elements
        Document elements handed to `chunk_by_title` (in this notebook, the
        output of `partition_pdf`).
    chunk_under_n_chars : int
        Forwarded to `chunk_by_title` as `combine_text_under_n_chars`.
    chunk_new_after_n_chars : int
        Forwarded to `chunk_by_title` as `new_after_n_chars`.

    Returns
    -------
    list of dict
        One dict per chunk with the keys "text" (the chunk's `.text`) and
        "filename" (the chunk's `.metadata.filename`).
    """
    chunks = chunk_by_title(
        elements,
        # False: a section is not allowed to continue across a page boundary
        # (per the unstructured chunking docs; the default is True).
        multipage_sections=False,
        combine_text_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars,
    )

    # Build a fresh list instead of overwriting the chunk objects in place.
    return [
        {"text": chunk.text, "filename": chunk.metadata.filename}
        for chunk in chunks
    ]


# def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
#     for filename in files:
#         try:
#             elements = partition_pdf(filename=filename)
#             chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
#         except IndexError as e:
#             print(e)
#             continue

#         print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
#         for i, chunk in enumerate(chunks):
#             try:
#                 client.data_object.create(class_name="PDF_Document", data_object={"content": chunk['text'], "filename": filename})
#             except Exception as e:
#                 print(e)
#                 print(f"Failed to upload chunk {i} for {str(filename)}.")

#         with client.batch as batch:
#             for data_object in chunks:
#                 batch.add_data_object(data_object={"content": chunk['text'], "filename": filename}, class_name="PDF_Document", uuid=generate_uuid5(data_object))

        
#     client.batch.flush()







In [None]:
from weaviate import Client
import time
import uuid

def configure_batch(client: "Client", batch_size: int, batch_target_rate: int, min_sleep_seconds: float = 15):
    """
    Configure the weaviate client's batch so it creates objects at `batch_target_rate`.

    After every batch the installed callback prints any errors found in the
    batch response and then sleeps, so that uploads are throttled.

    Parameters
    ----------
    client : Client
        The Weaviate client instance.
    batch_size : int
        The batch size.
    batch_target_rate : int
        The batch target rate as # of objects per second.
    min_sleep_seconds : float
        Lower bound for the pause after each batch. The default of 15 keeps the
        previous hard-coded behaviour; pass 0 to throttle purely by
        `batch_target_rate`.

    Raises
    ------
    ValueError
        If `batch_size` or `batch_target_rate` is not positive.
    """
    # Fail here rather than with a ZeroDivisionError inside the callback.
    if batch_size <= 0 or batch_target_rate <= 0:
        raise ValueError("batch_size and batch_target_rate must be positive")

    def callback(batch_results) -> None:
        # This callback replaces the client's default one, so report errors
        # explicitly; otherwise failed objects would go unnoticed.
        for item in batch_results or []:
            if not isinstance(item, dict):
                continue
            errors = item.get("result", {}).get("errors")
            if errors:
                print(errors)

        # Estimated time spent creating this batch, scaled from the client's
        # own per-object timing.
        time_took_to_create_batch = batch_size * (
            client.batch.creation_time / client.batch.recommended_num_objects
        )
        # Time still owed to hit the target rate (+1 s margin), never less than
        # the configured floor.
        time.sleep(
            max(
                batch_size / batch_target_rate - time_took_to_create_batch + 1,
                min_sleep_seconds,
            )
        )

    client.batch.configure(
        batch_size=batch_size,
        timeout_retries=5,
        callback=callback,
    )

# def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500, batch_size=10, batch_target_rate=2):
#     configure_batch(client, batch_size, batch_target_rate)

#     for i, filename in enumerate(files):
#         print(f"Processing file {i+1}/{len(files)}: {filename}")  # print the current file being processed
#         try:
#             elements = partition_pdf(filename=filename)
#             chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
#         except IndexError as e:
#             print(e)
#             continue

#         print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
#         with client.batch as batch:
#             for i, chunk in enumerate(chunks):
#                 data_object = {"content": chunk['text'], "filename": filename}
#                 try:
#                     batch.add_data_object(data_object=data_object, class_name="PDF_Document", uuid=uuid.uuid5(uuid.NAMESPACE_DNS, str(data_object)))
#                 except Exception as e:
#                     print(e)
#                     print(f"Failed to add chunk {i} to batch for {str(filename)}. Continuing with next chunk.")
#                     continue  # continue with the next chunk if an error occurred

#         print(f"Flushing batch for {str(filename)}.")
#         client.batch.flush()

# Ensure the uuid module is imported

def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500, batch_size=5, batch_target_rate=2):
    """Partition, chunk and upload PDF files into the "PDF_Document" class.

    Parameters
    ----------
    files
        Iterable of PDF paths; each one is passed to `partition_pdf`.
    client
        Weaviate client used for batching.
    chunk_under_n_chars, chunk_new_after_n_chars : int
        Chunking thresholds forwarded to `get_chunks`.
    batch_size, batch_target_rate : int
        Batch settings forwarded to `configure_batch`.

    Notes
    -----
    Errors are printed and skipped (per object and per file) so that one bad
    PDF does not stop the whole ingestion. Nothing is returned.
    """
    configure_batch(client, batch_size, batch_target_rate)

    for filename in files:
        try:
            elements = partition_pdf(filename=filename)
            chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)

            print(f"Uploading {len(chunks)} chunks for {str(filename)}.")

            # Enter the batch configured above as-is. Calling
            # `client.batch(batch_size=10)` re-runs the client's configure()
            # with its defaults, which drops the callback and batch size that
            # configure_batch just installed.
            with client.batch as batch:
                for chunk in chunks:
                    data_object = {
                        "content": chunk['text'],
                        "filename": filename
                    }

                    try:
                        batch.add_data_object(
                            data_object=data_object,
                            class_name="PDF_Document",
                            # Deterministic id derived from the object itself.
                            uuid=generate_uuid5(data_object)
                        )
                    except Exception as e:
                        # Best effort: report and move on to the next chunk.
                        print(f"Failed to add data object to batch for {filename}: {e}")

        except Exception as e:
            # Best effort: report and move on to the next file.
            print(f"An error occurred while processing {filename}: {e}")


## Add the files to the vector database

In [None]:

import glob

# Folder that holds the course PDFs to ingest.
directory_path = '../data/coursematerial/'

# Every PDF located directly inside that folder.
pdf_files = glob.glob(os.path.join(directory_path, '*.pdf'))

# Chunk and upload all of them into the vector database.
add_data_to_weaviate(
    files=pdf_files,
    client=client,
    chunk_under_n_chars=250,
    chunk_new_after_n_chars=500,
    batch_size=5,
    batch_target_rate=2,
)



In [None]:
# Ask for the meta count of the PDF_Document class, i.e. how many objects it holds.
client.query.aggregate("PDF_Document").with_meta_count().do()


In [None]:
# Check that the Weaviate instance behind `client` reports ready.
client.is_ready()

## The cells below are examples of queries to the database to get the data you need.

In [None]:
import json

# Reuse the `client` created at the top of the notebook. Creating a second
# client here would rebind `client` and drop the configured embedded one.
# To query a separately running instance instead, create that client under a
# different name, e.g. weaviate.Client("http://localhost:8079").
some_objects = client.data_object.get()
print(json.dumps(some_objects))

In [None]:
# Reuse the `client` created at the top of the notebook rather than rebinding
# the name to a second client on a hard-coded endpoint.

# Raw GraphQL query: content and filename of up to 20 PDF_Document objects.
query1 = """
{
  Get {
    PDF_Document (limit: 20) {
      content
      filename
    }
  }
}
"""
result = client.query.raw(query1)
print(result)


# print which files are in the results
# A response without a "data" payload (e.g. one that only reports errors) is
# skipped here instead of raising KeyError.
documents = ((result.get('data') or {}).get('Get') or {}).get('PDF_Document') or []
for file in documents:
    print(file['filename'])




In [None]:
# Same raw GraphQL Get query as a string, this time with the limit raised to 2000 objects.
# Note that this rebinds `query1` and `result`.
query1 = """
{
  Get {
    PDF_Document (limit: 2000) {
      content
      filename
    }
  }
}
"""

# Run the query and print the whole response; the output can be very long.
result = client.query.raw(query1)
print(result)



In [None]:
# Raw GraphQL Aggregate query asking for the meta count of PDF_Document.
query2 = """
{
  Aggregate {
    PDF_Document {
      meta {
        count
      }
    }
  }
}
"""

# Run the query and print the response; this rebinds `result`.
result = client.query.raw(query2)
print(result)