In [15]:
import os
import uuid
from tqdm import tqdm
from pdfminer.high_level import extract_text
import numpy as np
from app.splitter import TextSplitter
from app.openai import token_size, get_embeddings, get_embedding
from app.loader import load_docs
from app.config import settings
from redis import Redis
from redis.commands.search.field import NumericField, TagField, TextField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.json.path import Path
from redis.commands.search.query import Query

In [16]:
DOCS_DIR = 'data/docs'

def load_docs(docs_dir=DOCS_DIR, chunk_size=512, chunk_overlap=150):
    """Extract text from every PDF in ``docs_dir`` and split it into chunk records.

    NOTE: defining this function rebinds the ``load_docs`` name that the
    import cell pulls in from ``app.loader``; after this cell runs, the
    notebook uses this local version.

    Args:
        docs_dir: Directory scanned (non-recursively) for files whose name
            ends in ``.pdf``. Defaults to ``DOCS_DIR``.
        chunk_size: Forwarded to ``TextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``TextSplitter`` as ``chunk_overlap``.

    Returns:
        A list of dicts, one per chunk, with keys ``id``
        (``'<doc id>:<zero-padded chunk index>'``), ``text``, ``doc_name`` and
        ``vector``. ``vector`` is always ``None`` here; it is a placeholder
        for the embedding. The list is empty when no PDF yields any chunk.
    """
    docs = []
    # Sorted so the processing order does not depend on the arbitrary order
    # in which os.listdir() returns directory entries.
    pdf_files = sorted(f for f in os.listdir(docs_dir) if f.endswith('.pdf'))
    for filename in tqdm(pdf_files):
        file_path = os.path.join(docs_dir, filename)
        text = extract_text(file_path)
        docs.append((filename, text))
    print(f'Loaded {len(docs)} PDF documents')

    chunks = []
    text_splitter = TextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print('\nSplitting documents into chunks')
    for doc_name, doc_text in docs:
        # Random 8-character id: chunk ids differ from one run to the next.
        doc_id = str(uuid.uuid4())[:8]
        doc_chunks = text_splitter.split(doc_text)
        for chunk_idx, chunk_text in enumerate(doc_chunks):
            chunk = {
                'id': f'{doc_id}:{chunk_idx:04}',
                'text': chunk_text,
                'doc_name': doc_name,
                'vector': None
            }
            chunks.append(chunk)
        print(f'{doc_name}: {len(doc_chunks)} chunks')

    print(f'\nTotal chunks: {len(chunks)}')
    # min()/max() raise on an empty sequence and the average would divide by
    # zero, so only report size statistics when there is something to measure.
    if chunks:
        chunk_sizes = [token_size(c['text']) for c in chunks]
        print(f'Min chunk size: {min(chunk_sizes)} tokens')
        print(f'Max chunk size: {max(chunk_sizes)} tokens')
        print(f'Average chunk size: {round(sum(chunk_sizes)/len(chunks))} tokens')
    return chunks

In [17]:
# Parse the PDFs under DOCS_DIR and build the chunk records; each record's
# 'vector' is still None at this point.
chunks = load_docs()

100%|██████████| 1/1 [00:04<00:00,  4.10s/it]


Loaded 1 PDF documents

Splitting documents into chunks
Inception.pdf: 127 chunks

Total chunks: 127
Min chunk size: 351 tokens
Max chunk size: 512 tokens
Average chunk size: 496 tokens


In [18]:
def batchify(iterable, batch_size):
    """Yield consecutive slices of ``iterable``, each at most ``batch_size`` long.

    ``iterable`` must support ``len()`` and slicing (list, tuple, str, ...);
    every batch has the same type a slice of the input has. The final batch
    holds whatever is left over and may be shorter than the others.
    """
    offsets = range(0, len(iterable), batch_size)
    yield from (iterable[lo:lo + batch_size] for lo in offsets)

In [19]:
vectors = []
print('\nEmbedding chunks')
with tqdm(total=len(chunks)) as pbar:
    for batch in batchify(chunks, batch_size=64):
        batch_vectors = await get_embeddings([chunk['text'] for chunk in batch])
        vectors.extend(batch_vectors)
        pbar.update(len(batch))

for chunk, vector in zip(chunks, vectors):
    chunk['vector'] = np.array(vector, dtype=np.float32).tobytes()


Embedding chunks


100%|██████████| 127/127 [00:02<00:00, 55.78it/s]


In [33]:
# Redis client used as a global by create_index(), add() and query().
# NOTE(review): host and port are hard-coded to a local instance.
# decode_responses=True asks redis-py to decode replies to str; query() never
# requests the binary 'vector' field back. Confirm before adding it to
# return_fields.
r = Redis(host='localhost', port=6379, decode_responses=True)
# Destructive reset, left disabled on purpose. Uncomment only to wipe the
# current database before re-indexing.
# r.flushdb()

True

In [34]:
def create_index(index_name='idx:vdb', prefix='vdb:'):
    """Create a search index over Redis hashes whose keys start with ``prefix``.

    The schema has two text fields (``text`` and ``doc_name``) and one
    ``vector`` field declared as FLAT / FLOAT32 / COSINE, with its dimension
    read from ``settings.EMBEDDING_DIMENSIONS``.

    Any exception raised while creating the index is caught and reported with
    ``print`` rather than propagated. Nothing is returned.
    """
    vector_options = {
        'TYPE': 'FLOAT32',
        'DIM': settings.EMBEDDING_DIMENSIONS,
        'DISTANCE_METRIC': 'COSINE',
    }
    index_fields = (
        TextField('text'),
        TextField('doc_name'),
        VectorField('vector', 'FLAT', vector_options),
    )
    try:
        r.ft(index_name).create_index(
            fields=index_fields,
            definition=IndexDefinition(prefix=[prefix], index_type=IndexType.HASH),
        )
        print(f'Index {index_name} created successfully')
    except Exception as err:
        # Best-effort: report the failure and let the notebook keep running.
        print(f'Error creating index {index_name}: {err}')

In [35]:
# Create the index with the defaults: name 'idx:vdb', key prefix 'vdb:'.
create_index()

Index idx:vdb created successfully


In [39]:
def add(items, prefix='vdb:'):
    """Store each item as a Redis hash, queuing all writes on one pipeline.

    Args:
        items: Iterable of dicts that each contain an ``id`` key. The whole
            dict, ``id`` included, is written as the hash's fields.
        prefix: String prepended to ``item['id']`` to form the key. The
            default matches the default ``prefix`` of ``create_index``.
    """
    pipe = r.pipeline()
    for item in items:
        # Double-quoted f-string: reusing the outer quote character inside
        # the replacement field is a SyntaxError before Python 3.12.
        key = f"{prefix}{item['id']}"
        pipe.hset(key, mapping=item)
    # Nothing reaches Redis until the queued commands are executed here.
    pipe.execute()

In [37]:
# Write every chunk record to Redis.
# NOTE(review): this cell's execution count (37) is lower than that of the
# add() definition cell (39), so it last ran against an earlier definition.
# Re-run from a fresh kernel to confirm.
add(chunks)

In [41]:
def query(vector, top_k=3, index_name='idx:vdb'):
    """Run a KNN vector search and return the matching documents.

    Args:
        vector: Query embedding, passed to the search as the ``$vector``
            parameter. The notebook passes float32 bytes, the same encoding
            used for the stored chunk vectors.
        top_k: Number of neighbours requested; also used as the page size.
        index_name: Search index to query. The default matches the default
            ``index_name`` of ``create_index``.

    Returns:
        ``res.docs`` from the search result. Each document is asked to carry
        the ``score``, ``id``, ``text`` and ``doc_name`` fields, and results
        are sorted by ``score``.
    """
    # Named knn_query so the local does not shadow this function's own name.
    knn_query = (
        Query(f'(*)=>[KNN {top_k} @vector $vector AS score]')
        .sort_by('score')
        .return_fields('score', 'id', 'text', 'doc_name')
        .paging(0, top_k)
        .dialect(2)
    )
    res = r.ft(index_name).search(knn_query, {'vector': vector})
    return res.docs

In [42]:
question = 'What is the most resilient parasite?'
vector = await get_embedding(question)
vector = np.array(vector, dtype=np.float32).tobytes()

In [44]:
# Search with the embedded question and print every hit.
docs = query(vector)
for doc in docs:
    # Header line: the hit's score followed by its id.
    print(doc['score'], doc['id'])
    # Chunk text, a newline, then the starred divider.
    print(doc['text'], '\n\n************************************************\n\n', sep='\n')

0.731605470181 vdb:b7c5dbf9:0001
INT. SAME - MOMENTS LATER

The Elderly Man watches the Bearded Man WOLF down his food. 
He SLIDES the handgun down the table towards him.

ELDERLY JAPANESE MAN

(in English)

Are you here to kill me?

The Bearded Man glances up at him, then back to his food.

2.

The Elderly Japanese Man picks up the cone between thumb and 
forefinger.

I know what this is.

ELDERLY JAPANESE MAN

He SPINS it onto a table- it CIRCLES gracefully across the 
polished ebony... a SPINNING TOP.

ELDERLY JAPANESE MAN

I’ve seen one before. Many, many 
years ago...

The Elderly Japanese Man STARES at the top mesmerized.

ELDERLY JAPANESE MAN

It belonged to a man I met in a 
half-remembered dream...

MOVE IN on the GRACEFULLY SPINNING TOP...

ELDERLY JAPANESE MAN

A man possessed of some radical 
notions...

The Elderly Japanese Man STARES, remembering... 

What’s the most resilient parasite?

COBB (V.O.)

CUT TO:

INT. SAME ELEGANT DINING ROOM - NIGHT (YEARS EARLIER)

The spe