## Weaviate Document Chunking and Upload Example

This notebook demonstrates how to chunk document text and upload chunks to a Weaviate instance using the official Python client.

In [None]:
# Install Weaviate client
!pip install weaviate-client

In [None]:
import uuid

import weaviate

# Open a client against the Weaviate server listening on this machine (port 8080)
client = weaviate.Client(url="http://localhost:8080")

In [None]:
# Define the "Document" class: one object per chunk of a source document.
schema_class = {
    "class": "Document",
    "properties": [
        {
            "name": "chunk",
            "dataType": ["text"],
            "description": "A chunked segment of the document"
        },
        {
            "name": "source",
            # "string" is a deprecated data type in current Weaviate releases;
            # "text" is the supported replacement.
            "dataType": ["text"],
            "description": "Identifier of the document the chunk was taken from"
        }
    ],
    "vectorizer": "text2vec-transformers"
}

# Create the class only when it is missing. Checking the live schema first
# (instead of catching every Exception) keeps the cell safe to re-run while
# letting real failures -- server unreachable, invalid schema, vectorizer
# module not enabled -- surface instead of being reported as "maybe exists".
existing_classes = {
    cls.get("class") for cls in (client.schema.get().get("classes") or [])
}
if schema_class["class"] in existing_classes:
    print("Schema class 'Document' already exists; skipping creation.")
else:
    client.schema.create_class(schema_class)
    print("Schema class 'Document' created successfully.")

In [None]:
# Chunking function
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order (empty for empty ``text``).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the window stand still or move backwards
    # (an endless loop); a negative overlap would silently skip characters.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, stop: any further window
        # would contain only the overlap tail of the chunk just emitted.
        if end >= len(text):
            break
    return chunks

In [None]:
# Sample document; line breaks are flattened to spaces so the text is one line.
# Replace the placeholder with your own long document.
document_text = ' '.join('''
OpenAI's GPT-4 is an advanced large language model that can ... (your long document here)
More content can follow in the document that will be chunked and uploaded to Weaviate.
'''.split('\n'))

# Split the flattened text using chunk_text's default size and overlap
chunks = chunk_text(document_text)
print(f'Total chunks created: {len(chunks)}')

In [None]:
# Upload chunks to Weaviate
SOURCE_ID = "example-doc-001"

# Object ids must be valid UUIDs; a bare index such as "0" is rejected by the
# client. Deriving a UUIDv5 from the source id and the chunk position gives
# every chunk a stable id, so the same chunk maps to the same object each run.
# NOTE(review): batch writes are expected to overwrite an existing object with
# the same id rather than fail, which keeps this cell re-runnable -- confirm
# against the Weaviate version in use. Per-object batch errors are reported by
# the client rather than raised.
client.batch.configure(batch_size=100)
with client.batch as batch:
    for idx, chunk in enumerate(chunks):
        object_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{SOURCE_ID}/chunk/{idx}"))
        batch.add_data_object(
            data_object={"chunk": chunk, "source": SOURCE_ID},
            class_name="Document",
            uuid=object_id,
        )
print("Upload complete!")

In [None]:
# Semantic search: ask for the three "Document" objects nearest to the concept
near_text = {"concepts": ["large language models"]}

result = (
    client.query
    .get("Document", ["chunk", "source"])
    .with_near_text(near_text)
    .with_limit(3)
    .do()
)
print(result)