# Parsing

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
from src.util import parse_documents
from src.parser.markdown import MarkdownParser
from src.parser.text import TextParser
from config import DOCS_PATH, MARKDOWNS_PATH, TEXTS_PATH

markdown_parser = MarkdownParser()
text_parser = TextParser()

In [3]:
# Was run previously under another project structure
# %time
# parse_documents(markdown_parser, DOCS_PATH, MARKDOWNS_PATH, '.md')
# parse_documents(text_parser, DOCS_PATH, TEXTS_PATH, '.txt')

# Chunking

In [3]:
%time
import os
from itertools import product

from src.util import generate_chunks
from config import CHUNKS_PATH

parsed_params = [
    (MARKDOWNS_PATH, '.md'),
    (TEXTS_PATH, '.txt')
]
chunking_params = [
    (4096, 512),
    (2048, 256),
    (1024, 128)
]

for parsed_param, chunking_param in product(parsed_params, chunking_params):
    input_directory, suffix = parsed_param
    chunk_size, chunk_overlap = chunking_param
    output_filepath = os.path.join(CHUNKS_PATH, f'{chunk_size}-{suffix.removeprefix('.')}.json')

    generate_chunks(chunk_size, chunk_overlap, input_directory, suffix, output_filepath)

CPU times: user 1 μs, sys: 0 ns, total: 1 μs
Wall time: 2.62 μs


In [10]:
import os
import time

from src.util import load_chunks_to_database
from src.vec_database.dense import DenseDatabase
from src.embedding.fastembed_ import MiniLmEmbedding
from src.embedding.openai_ import OpenEmbeddingSmall
from src.embedding.qwen import QwenEmbeddingSmall
from config import OPENAI_KEY, DENSE_CONN_STRING, CHUNKS_PATH


async def upload_documents(*embeddings):
    print('Uploading documents to DenseDatabase:')

    for chunk_file in os.listdir(CHUNKS_PATH):
        if 'md' in chunk_file:
            continue

        for embedding_name, embedding, batch_size in embeddings:
            collection_name = f'{chunk_file.removesuffix('.json')}-{embedding_name}'
            database = DenseDatabase(DENSE_CONN_STRING, embedding)

            try:
                start = time.time()

                await load_chunks_to_database(
                    database=database,
                    collection_name=collection_name,
                    chunk_file=os.path.join(CHUNKS_PATH, chunk_file),
                    batch_size=batch_size,
                )

                end = time.time()
                print(f'Uploaded: {collection_name} ({abs(end - start):.2f}s)')
            finally:
                await database.client.close()

In [11]:
await upload_documents(
    ('minilm', MiniLmEmbedding(), None),
    ('qwen', QwenEmbeddingSmall(), None)
)

Uploading documents to DenseDatabase:
Uploaded: 2048-txt-minilm (12.93s)
Uploaded: 2048-txt-qwen (439.88s)
Uploaded: 1024-txt-minilm (23.89s)
Uploaded: 1024-txt-qwen (480.07s)
Uploaded: 4096-txt-minilm (7.49s)
Uploaded: 4096-txt-qwen (864.69s)


In [12]:
await upload_documents(
    ('openai', OpenEmbeddingSmall(OPENAI_KEY), 800)
)

Uploading documents to DenseDatabase:
Uploaded: 2048-txt-openai (208.34s)
Uploaded: 1024-txt-openai (361.77s)
Uploaded: 4096-txt-openai (128.90s)
