In [17]:
from unstructured.partition.pdf import partition_pdf
from pathlib import Path

import weaviate
from weaviate.embedded import EmbeddedOptions
import os

## Where the DB is stored locally:

When Embedded Weaviate starts for the first time, it creates a permanent datastore in the location set in your `persistence_data_path`. When your client exits, the Embedded Weaviate instance also exits, but the data persists. The next time the client runs, it starts a new instance of Embedded Weaviate, and that new instance uses the data already saved in the datastore.

## Data storage directory

If XDG_DATA_HOME is set, the default is: XDG_DATA_HOME/weaviate/

If XDG_DATA_HOME is not set, the default is: ~/.local/share/weaviate

In my case the data is stored in the following location: /Users/username/.local/share/weaviate

In [18]:
# Read the Hugging Face token from the environment so that no secret is stored
# in (or committed with) the notebook.
hf_api_key = os.environ.get("HUGGINGFACE_APIKEY")
if not hf_api_key:
    raise RuntimeError(
        "Set the HUGGINGFACE_APIKEY environment variable to your Hugging Face "
        "API token before running this cell."
    )

# Start (or attach to) the embedded Weaviate instance.
# The token is supplied both ways the text2vec-huggingface module documents:
#   * HUGGINGFACE_APIKEY  - environment variable read by the embedded server
#   * X-HuggingFace-Api-Key - request header sent by this client
client = weaviate.Client(
    embedded_options=EmbeddedOptions(
        additional_env_vars={"HUGGINGFACE_APIKEY": hf_api_key}
    ),
    additional_headers={"X-HuggingFace-Api-Key": hf_api_key},
)
# Fail fast if the instance did not come up.
assert client.is_ready()

embedded weaviate is already listening on port 8079


## This is the structure of the vector database: we call it PDF_Document. This is the "class" that we are going to use to store the data.


In [19]:
# Start from an empty schema.
# NOTE: delete_all() is not limited to PDF_Document - it is called on the whole
# schema of this Weaviate instance, so re-running this cell resets everything.
client.schema.delete_all()

# Property holding the chunk text - this is what we want to vectorize.
content_property = {
    "name": "content",
    "dataType": ["text"],
    "description": "Content of PDF",
    "moduleConfig": {
        "text2vec-huggingface": {"skip": False, "vectorizePropertyName": False}
    },
}

# Property recording which PDF a chunk came from.
filename_property = {
    "name": "filename",
    "dataType": ["text"],
    "description": "PDF filename",
}

# Class definition: PDF_Document, vectorized through the Hugging Face module.
schema = {
    "class": "PDF_Document",
    "vectorizer": "text2vec-huggingface",
    "properties": [content_property, filename_property],
    "moduleConfig": {
        "text2vec-huggingface": {
            # Can be any public or private Hugging Face model.
            "model": "sentence-transformers/all-MiniLM-L6-v2",
            "options": {
                # Try this if you get a "model not ready" error.
                "waitForModel": True,
            },
        }
    },
}

client.schema.create_class(schema)

{"level":"info","msg":"Created shard pdf_document_oG8oFCwTEjPM in 22.034975ms","time":"2024-03-28T18:03:23-04:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-03-28T18:03:23-04:00","took":126865}


In [20]:
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.pdf import partition_pdf
from weaviate.util import generate_uuid5

In [26]:
def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500, verbose=False):
    """Chunk partitioned document elements by title and flatten them to dicts.

    Parameters
    ----------
    elements : list
        Document elements, e.g. the return value of `partition_pdf`.
    chunk_under_n_chars : int
        Passed to `chunk_by_title` as `combine_text_under_n_chars`.
    chunk_new_after_n_chars : int
        Passed to `chunk_by_title` as `new_after_n_chars`.
    verbose : bool
        When True, print every resulting chunk dict. Off by default because
        printing the full text of every chunk floods the notebook output.

    Returns
    -------
    list of dict
        One `{"text": ..., "filename": ...}` dict per chunk, in chunk order.
    """
    sections = chunk_by_title(
        elements,
        # False: a section is not allowed to continue across a page break.
        multipage_sections=False,
        combine_text_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars,
    )

    # Keep only the two fields the upload step needs.
    chunks = [
        {"text": section.text, "filename": section.metadata.filename}
        for section in sections
    ]

    if verbose:
        for chunk in chunks:
            print(chunk)

    return chunks


# def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
#     for filename in files:
#         try:
#             elements = partition_pdf(filename=filename)
#             chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
#         except IndexError as e:
#             print(e)
#             continue

#         print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
#         for i, chunk in enumerate(chunks):
#             try:
#                 client.data_object.create(class_name="PDF_Document", data_object={"content": chunk['text'], "filename": filename})
#             except Exception as e:
#                 print(e)
#                 print(f"Failed to upload chunk {i} for {str(filename)}.")

#         with client.batch as batch:
#             for data_object in chunks:
#                 batch.add_data_object(data_object={"content": chunk['text'], "filename": filename}, class_name="PDF_Document", uuid=generate_uuid5(data_object))








{"level":"info","msg":"Created shard test_HJ3vdbqrXH2z in 4.199115ms","time":"2024-03-28T19:25:51-04:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-03-28T19:25:51-04:00","took":111810}
{"level":"info","msg":"Created shard test_smFS6NW6twFn in 2.989293ms","time":"2024-03-28T19:34:11-04:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-03-28T19:34:11-04:00","took":144016}


In [10]:
from weaviate import Client
import time
import uuid

def configure_batch(client: Client, batch_size: int, batch_target_rate: int):
    """
    Configure the weaviate client's batch so it creates objects at `batch_target_rate`.

    A callback is installed on the batch; after every submitted batch it
    reports any per-object errors found in the response and then sleeps long
    enough to keep the overall throughput near the target rate.

    Parameters
    ----------
    client : Client
        The Weaviate client instance.
    batch_size : int
        The batch size.
    batch_target_rate : int
        The batch target rate as # of objects per second. Must be positive.

    Raises
    ------
    ValueError
        If `batch_target_rate` is not positive (it is used as a divisor).
    """
    if batch_target_rate <= 0:
        raise ValueError(
            "batch_target_rate must be a positive number of objects per second"
        )

    def callback(batch_results) -> None:
        # Surface per-object failures instead of dropping them silently:
        # supplying a custom callback means nothing else reports them.
        for item in batch_results or []:
            if not isinstance(item, dict):
                continue
            errors = (item.get("result") or {}).get("errors")
            if errors:
                print(f"Batch error: {errors}")

        # Estimated time spent creating this batch, scaled from the client's
        # last measured creation time per recommended object count.
        time_took_to_create_batch = batch_size * (
            client.batch.creation_time / client.batch.recommended_num_objects
        )
        # Sleep for whatever is left of the time budget (plus one second of
        # slack); never pass a negative duration.
        time.sleep(
            max(batch_size / batch_target_rate - time_took_to_create_batch + 1, 0)
        )

    client.batch.configure(
        batch_size=batch_size,
        timeout_retries=5,
        callback=callback,
    )

def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500, batch_size=10, batch_target_rate=2):
    """Partition each PDF, chunk it, and batch-upload the chunks as PDF_Document objects.

    Parameters
    ----------
    files : iterable of str or path-like
        PDF files to ingest, processed in the given order.
    client : Client
        The Weaviate client instance.
    chunk_under_n_chars, chunk_new_after_n_chars : int
        Chunking thresholds forwarded to `get_chunks`.
    batch_size, batch_target_rate : int
        Batch settings forwarded to `configure_batch`.
    """
    configure_batch(client, batch_size, batch_target_rate)

    for filename in files:
        # Normalize once so the log line, the stored property and the UUID
        # seed all use the same plain string, even when a Path is passed in.
        source_name = str(filename)

        try:
            elements = partition_pdf(filename=source_name)
            chunks = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except IndexError as e:
            # Best effort: skip this file and carry on with the rest.
            print(f"Skipping {source_name}: {e}")
            continue

        print(f"Uploading {len(chunks)} chunks for {source_name}.")
        with client.batch as batch:
            for chunk in chunks:
                data_object = {"content": chunk['text'], "filename": source_name}
                # The UUID is derived from the object itself, so the same
                # content + filename always maps to the same id.
                batch.add_data_object(
                    data_object=data_object,
                    class_name="PDF_Document",
                    uuid=generate_uuid5(data_object),
                )




## Add the files to the vector database

In [21]:

import glob

# Folder that holds the course PDFs, relative to the notebook's working directory.
directory_path = '../data/coursematerial/'

# Every PDF directly inside that folder, in alphabetical order.
pdf_pattern = os.path.join(directory_path, '*.pdf')
pdf_files = sorted(glob.glob(pdf_pattern))

# Chunk and upload all of them into the PDF_Document class.
add_data_to_weaviate(
    files=pdf_files,
    client=client,
    chunk_under_n_chars=250,
    chunk_new_after_n_chars=400,
    batch_size=10,
    batch_target_rate=2,
)



Uploading 77 chunks for ../data/coursematerial/lec01.pdf.
360
441
360
282
308
264
259
129
382
263
466
301
42
500
40
128
453
448
435
291
468
370
389
307
427
439
272
370
360
273
460
250
360
420
321
276
349
381
362
299
266
446
438
145
499
341
266
360
313
268
305
379
293
318
427
308
279
338
294
299
380
298
379
327
270
182
467
349
162
355
275
194
486
131
483
271
249
Uploading 67 chunks for ../data/coursematerial/lec02.pdf.
308
293
376
281
264
259
312
374
172
490
395
285
432
220
443
299
412
388
390
324
294
271
344
263
316
251
363
227
392
176
355
329
376
263
277
491
311
266
379
135
482
250
332
497
313
220
347
257
261
250
360
329
313
302
339
112
500
200
347
306
299
284
263
263
500
200
180


{"action":"restapi_management","level":"info","msg":"Shutting down... ","time":"2024-03-28T18:24:28-04:00"}
{"action":"restapi_management","level":"info","msg":"Stopped serving weaviate at http://127.0.0.1:8079","time":"2024-03-28T18:24:28-04:00"}


Uploading 90 chunks for ../data/coursematerial/lec03.pdf.
411
324
296
363
86
452
117
452
267
270
275
453
447
294
311
266
97
496
244
424
405
287
308
275
135
500
287
500
241
358
265
320
225
8
500
33
500
25
8
500
72
500
36
500
106
242
8
500
33
500
25
206
8
500
33
500
37
59
8
500
76
304
18
8
500
76
8
500
33
500
49
287
24
8
500
76
8
500
33
500
25
461
388
398
397
251
212
456
413
450
Embedded weaviate wasn't listening on ports http:8079 & grpc:50060, so starting embedded weaviate again
Embedded weaviate wasn't listening on ports http:8079 & grpc:50060, so starting embedded weaviate again
Started /Users/ceciliaacosta/.cache/weaviate-embedded: process ID 13427
Started /Users/ceciliaacosta/.cache/weaviate-embedded: process ID 13428


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-03-28T18:24:29-04:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-03-28T18:24:29-04:00"}
{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-03-28T18:24:29-04:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-03-28T18:24:29-04:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-03-28T18:2

listen tcp :6060: bind: address already in use


{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-03-28T18:24:29-04:00"}
{"level":"info","msg":"Completed loading shard pdf_document_oG8oFCwTEjPM in 16.962956ms","time":"2024-03-28T18:24:29-04:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-03-28T18:24:29-04:00","took":564838}


Uploading 215 chunks for ../data/coursematerial/lec04.pdf.
173
500
261
395
275
105
423
391
7
8
499
8
500
7
500
7
500
7
500
7
500
7
500
7
499
8
499
8
499
8
499
8
499
8
499
8
499
8
499
8
499
8
499
8
500
7
500
6
500
6
500
6
498
8
499
8
499
8
500
7
500
7
500
7
500
76
500
48
500
48
500
48
500
39
8
500
7
500
6
498
8
499
8
499
8
499
75
8
499
8
499
8
499
8
499
8
499
8
499
8
499
96
8
499
8
499
8
499
8
500
7
500
7
500
7
499
8
500
7
500
7
499
8
500
7
500
6
500
6
500
6
498
8
499
8
499
18
500
48
500
48
500
48
500
42
8
500
7
500
6
498
8
499
8
499
8
499
261
358
255
268
154
484
305
335
405
304
353
276
301
278
256
470
100
484
419
180
400
437
218
370
321
254
292
444
328
218
492
294
394
280
454
451
301
411
357
366
291
259
294
316
264
295
179
401
337
436
399
309
293
409
274
220
488
20
Uploading 87 chunks for ../data/coursematerial/lec05.pdf.
439
284
317
274
126
497
450
385
396
268
432
357
333
341
457
198
500
500
111
24
500
500
500
500
500
500
500
500
500
500
500
500
500
500
39
74
500
500
500
500
500
500
5



Uploading 151 chunks for ../data/coursematerial/lec06.pdf.
167
481
399
216
300
206
8
500
45
271
8
500
49
8
500
49
397
8
500
49
8
500
53
8
500
33
192
8
500
29
8
500
61
8
500
37
8
500
53
8
500
53
8
500
29
8
500
29
242
8
500
29
8
500
61
8
500
37
8
500
53
8
500
53
8
500
29
8
500
29
232
8
500
29
8
500
53
8
500
53
8
500
37
8
500
29
8
500
29
8
500
61
393
136
8
500
49
14
500
48
500
48
500
45
346
277
328
272
159
8
500
33
8
500
53
8
500
49
446
147
477
281
379
379
266
259
360
317
289
382
303
319
270
252
357
258
254
276
289
319
93
499
435
346
451
345
285
411
283
223
448
Uploading 59 chunks for ../data/coursematerial/lec07.pdf.
175
419
482
299
411
474
493
17
8
500
15
500
25
500
20
395
412
312
454
381
314
389
257
289
324
401
271
293
233
307
404
297
266
323
464
274
424
394
255
275
374
424
253
328
353
279
498
98
492
372
336
154
372
268
306
324
413
263
143
Uploading 87 chunks for ../data/coursematerial/lec08.pdf.
311
386
435
465
352
347
286
404
294
335
332
271
295
286
335
464
269
383
466
23
10
500
357




## Cells below are two examples of queries to the database to get the data you need.

In [24]:
# Rebind `client` to a plain HTTP connection on localhost port 8079.
# This only connects; the instance must already be running on that port.
client = weaviate.Client("http://localhost:8079")
# Raw GraphQL `Get` query: fetch at most 10 PDF_Document objects, returning
# the `content` and `filename` properties of each.
query1 = """
{
  Get {
    PDF_Document (limit: 10) {
      content
      filename
    }
  }
}
"""
# Send the query as-is and print the response unformatted.
result = client.query.raw(query1)
print(result)



{'data': {'Get': {'PDF_Document': [{'content': 'In other words, STA314 takes a more statistical perspective than CSC311 while covering the same core of material.\n\nIntro ML (UofT)\n\nSTA314-Lec1\n\n15 / 65\n\nAdvanced Courses\n\nThis course will help prepare you for the following courses.\n\nSTA414 (Statistical Methods for Machine Learning II)\n\nThis course is the follow-up course, which delves deeper into the probabilistic interpretation of machine learning that we cover in the last few weeks.', 'filename': '../data/coursematerial/lec01.pdf'}, {'content': 'In case of illness, you should ﬁll out the absence declaration form on ACORN and notify the instructors to request special consideration.\n\nFor accessibility services: If you require additional academic accommodations, please contact UofT Accessibility Services as soon as possible, studentlife.utoronto.ca/as. There is a volunteer note-taker in the course, if you need this service. Check Quercus for details.\n\nIntro ML (UofT)\n\n



In [25]:
# Raw GraphQL `Aggregate` query: ask for the meta `count` of the
# PDF_Document class, i.e. how many objects it currently holds.
query2 = """
{
  Aggregate {
    PDF_Document {
      meta {
        count
      }
    }
  }
}
"""

# Reuses whichever `client` is currently bound; overwrites the previous `result`.
result = client.query.raw(query2)
print(result)

{'data': {'Aggregate': {'PDF_Document': [{'meta': {'count': 20}}]}}}


