----

## Initial Vector DB Setup

In [158]:
import dotenv
import os
import pandas as pd
import sys

from pathlib import Path

from app.indexing.metadata import DocumentMetadata

In [159]:
# Load and set environment

dotenv.load_dotenv()
os.environ['USER_AGENT'] = 'myagent'
PROJECT_HOME = Path(os.environ.get('PROJECT_HOME', Path.cwd() / '..')).resolve()
sys.path.append(str(PROJECT_HOME))

In [160]:
# from app.databases.vector.milvus import Milvus
from app.databases.vector import VectorDB

vector_db = VectorDB(
    # auto_id=True,
    # drop_old=True,  # Drop existing values inside the collection
)

---

## Ingesting docx documents


In [None]:

# # Scan files and load them into the vector DB.

# from langchain_community.document_loaders.word_document import Docx2txtLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from tqdm.notebook import tqdm
# from datetime import datetime

# # File paths
# docs_path = PROJECT_HOME / 'data' / 'docx'
# # index_df = pd.read_csv(docs_path.parent / 'drive_files.csv')

# # Scan all files that appear in the CSV.
# from pathlib import Path
# from tqdm import tqdm

# # Specify your base directory containing the documents
# docs_path = Path(docs_path)

# # Get all .docx files in the directory
# files = list(docs_path.glob("*.docx"))

# for file_path in tqdm(files, desc="Processing documents"):
#     try:
#         loader = Docx2txtLoader(file_path)
#         docs = loader.load()

#         # Convert timestamp to formatted string
#         timestamp = file_path.stat().st_mtime
#         modified_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')


#         # Add metadata to all file chunks
#         for doc in docs:
#             doc.metadata = doc.metadata | {
#                 'source_name': file_path.name,
#                 'modified_at': modified_date,  # Gets file modification time
#                 'source_id': file_path.name,
#             }

#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#         splits = text_splitter.split_documents(docs)
#         vector_db.add_documents(documents=splits)

#     except Exception as exc:
#         print(f"Error processing {file_path}: {exc}")  # Optional: for debugging


# print('Done!')

# Ingesting documents


In [126]:
from pathlib import Path
from tqdm.notebook import tqdm


DOCS_BASE_PATH = PROJECT_HOME / 'data'

def get_documents_from_subfolder(subpath):
    ''' Returns all the documents from a sub-path of the DOCS_BASE_PATH'''
    return list((DOCS_BASE_PATH / subpath).glob("*"))

In [154]:
from langchain_community.document_loaders.text import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.word_document import Docx2txtLoader

def get_splits_from_paths(file_paths, loader):
    '''
    Returns langchain Documents split using a RecusriveCharacterTextSplitter (for now).
    Their metadata is set to our project metadata.

    loader needs to be some langchain loader, e.g. TextLoader.
    '''
    splits = []
    for file_path in tqdm(file_paths, desc="Processing documents"):
        try:
            loader = loader(file_path)
            docs = loader.load()
    
            # Convert timestamp to formatted string
            timestamp = file_path.stat().st_mtime
            modified_date = datetime.fromtimestamp(timestamp)# .strftime('%Y-%m-%d %H:%M:%S')
    
            # Add metadata to all file chunks
            for doc in docs:
                metadata = DocumentMetadata(source_id= file_path.name,
                                 source_name= file_path.name,
                                 modified_at= modified_date)
                doc.metadata = metadata.to_dict()
    
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            splits += text_splitter.split_documents(docs)
            # vector_db.add_documents(documents=splits)
    
        except Exception as exc:
            print(f"Error processing {file_path}: {exc}")  # Optional: for debugging
    return splits
    
def get_txt_splits_from_paths(file_paths): return get_splits_from_paths(file_paths, TextLoader)
def get_pdf_splits_from_paths(file_paths): return get_splits_from_paths(file_paths, PyPDFLoader)
def get_docx_splits_from_paths(file_paths): return get_splits_from_paths(file_paths, Docx2txtLoader)


In [150]:
get_documents_from_subfolder("txt")

[PosixPath('/home/jovyan/work/data/txt/aesops fables_small.txt')]

## Ingesting txt documents

In [151]:
txt_splits = get_txt_splits_from_paths(get_documents_from_subfolder('txt'))
txt_splits[:2]

Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]

[Document(metadata={'source_id': 'aesops fables_small.txt', 'source_name': 'aesops fables_small.txt', 'modified_at': '2024-11-05T12:25:50.989352'}, page_content='\ufeffINTRODUCTION'),
 Document(metadata={'source_id': 'aesops fables_small.txt', 'source_name': 'aesops fables_small.txt', 'modified_at': '2024-11-05T12:25:50.989352'}, page_content='_Æsop embodies an epigram not uncommon in human history; his fame\nis all the more deserved because he never deserved it. The firm\nfoundations of common sense, the shrewd shots at uncommon sense, that\ncharacterise all the Fables, belong not him but to humanity. In\nthe earliest human history whatever is authentic is universal: and\nwhatever is universal is anonymous. In such cases there is always\nsome central man who had first the trouble of collecting them, and\nafterwards the fame of creating them. He had the fame; and, on the\nwhole, he earned the fame. There must have been something great and\nhuman, something of the human future and the h

In [130]:
vector_db.add_documents(documents=txt_splits)

[453719138828027015,
 453719138828027016,
 453719138828027017,
 453719138828027018,
 453719138828027019,
 453719138828027020,
 453719138828027021,
 453719138828027022,
 453719138828027023,
 453719138828027024]

## Ingesting PDF files

In [164]:
pdf_splits_alice = get_pdf_splits_from_paths(get_documents_from_subfolder("pdf_alice"))
pdf_splits_test =  get_pdf_splits_from_paths(get_documents_from_subfolder("pdf"))

Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]

Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]

Ignoring wrong pointing object 6 0 (offset 0)


In [165]:
# vector_db.add_documents(documents=pdf_splits_test)
vector_db.add_documents(documents=pdf_splits_alice)

[453719138828027240,
 453719138828027241,
 453719138828027242,
 453719138828027243,
 453719138828027244,
 453719138828027245,
 453719138828027246,
 453719138828027247,
 453719138828027248,
 453719138828027249,
 453719138828027250,
 453719138828027251,
 453719138828027252,
 453719138828027253,
 453719138828027254,
 453719138828027255,
 453719138828027256,
 453719138828027257,
 453719138828027258,
 453719138828027259,
 453719138828027260,
 453719138828027261,
 453719138828027262,
 453719138828027263,
 453719138828027264,
 453719138828027265,
 453719138828027266,
 453719138828027267,
 453719138828027268,
 453719138828027269,
 453719138828027270,
 453719138828027271,
 453719138828027272,
 453719138828027273,
 453719138828027274,
 453719138828027275,
 453719138828027276,
 453719138828027277,
 453719138828027278,
 453719138828027279,
 453719138828027280,
 453719138828027281,
 453719138828027282,
 453719138828027283,
 453719138828027284,
 453719138828027285,
 453719138828027286,
 453719138828

## Ingesting Docx files

In [162]:
docx_splits =  get_docx_splits_from_paths(get_documents_from_subfolder("docx"))

Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]

In [163]:
vector_db.add_documents(documents=docx_splits)

[453719138828027234,
 453719138828027235,
 453719138828027236,
 453719138828027237,
 453719138828027238]

In [72]:

# # Scan files and load them into the vector DB.

# from langchain_community.document_loaders.text import TextLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from tqdm.notebook import tqdm
# from datetime import datetime


# # Get all .docx files in the directory
# files = get_documents_from_subfolder("txt")

# for file_path in tqdm(files, desc="Processing documents"):
#     try:
#         loader = TextLoader(file_path)
#         docs = loader.load()

#         # Convert timestamp to formatted string
#         timestamp = file_path.stat().st_mtime
#         modified_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

#         # Add metadata to all file chunks
#         for doc in docs:
#             doc.metadata = doc.metadata | {
#                 'source_name': file_path.name,
#                 'modified_at': modified_date,  # Gets file modification time
#                 'source_id': file_path.name,
#             }

#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#         splits = text_splitter.split_documents(docs)
#         vector_db.add_documents(documents=splits)

#     except Exception as exc:
#         print(f"Error processing {file_path}: {exc}")  # Optional: for debugging


# print('Done!')

Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]

Done!


In [30]:
files = get_documents_from_subfolder("txt")

loader = TextLoader(files[0])
docs = loader.load()

In [39]:
docs[0].metadata

{'source': '/home/jovyan/work/data/txt/aesops fables.txt'}

## Ingesting PDFs OLD

In [33]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
from datetime import datetime


In [80]:
def get_pdf_splits_from_paths(files):
    '''
    Returns langchain Documents split using a RecusriveCharacterTextSplitter (for now).
    Their metadata is set to our project metadata.
    '''
    splits = []
    for file_path in tqdm(files, desc="Processing documents"):
        try:
            loader = PyPDFLoader(file_path)
            docs = loader.load()
    
            # Convert timestamp to formatted string
            timestamp = file_path.stat().st_mtime
            modified_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
    
            # Add metadata to all file chunks
            # for doc in docs:
            for doc in docs[:1]: # Temp - only do 1!
                doc.metadata =  {
                    'source_name': file_path.name,
                    'modified_at': modified_date,  # Gets file modification time
                    'source_id': file_path.name,
                }
    
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            splits += text_splitter.split_documents(docs)
            # vector_db.add_documents(documents=splits)
    
        except Exception as exc:
            print(f"Error processing {file_path}: {exc}")  # Optional: for debugging
    return splits
    
        

In [68]:
alice_splits = get_pdf_splits_from_paths(get_documents_from_subfolder('pdf_alice'))

Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]

In [63]:
test_splits = alice_splits = get_pdf_splits_from_paths(get_documents_from_subfolder('pdf'))

Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]

Ignoring wrong pointing object 6 0 (offset 0)


In [69]:
test_splits, alice_splits

([Document(metadata={'source': '/home/jovyan/work/data/pdf/test.pdf', 'page': 0, 'source_name': 'test.pdf', 'modified_at': '2024-11-05 12:18:35', 'source_id': 'test.pdf'}, page_content='Test text in a pdf.  This is for tes1ng purposes.')],
 [Document(metadata={'source': '/home/jovyan/work/data/pdf_alice/alice-in-wonderland.pdf', 'page': 0, 'source_name': 'alice-in-wonderland.pdf', 'modified_at': '2024-10-31 14:00:47', 'source_id': 'alice-in-wonderland.pdf'}, page_content='ALICE ’S ADVENTURES\nIN WONDERLAND\nby Lewis Carroll\nwith fourty-two illustrations by John Tenniel\nThis book is in public domain.\nNo rigths reserved. Free for copy and distribution.\nThis PDF book is designed and published by PDF REE BOOKS .ORG'),
  Document(metadata={'source': '/home/jovyan/work/data/pdf_alice/alice-in-wonderland.pdf', 'page': 1}, page_content='Contents\nPoem. All in the golden afternoon . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3\nI Down the Rabbit-Hole . . . . .

In [56]:
# # Scan files and load them into the vector DB.
# files = get_documents_from_subfolder('pdf_alice')

# for file_path in tqdm(files, desc="Processing documents"):
#     try:
#         loader = PyPDFLoader(file_path)
#         docs = loader.load()

#         # Convert timestamp to formatted string
#         timestamp = file_path.stat().st_mtime
#         modified_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

#         # Add metadata to all file chunks
#         for doc in docs[:1]:
#             doc.metadata = doc.metadata | {
#                 'source_name': file_path.name,
#                 'modified_at': modified_date,  # Gets file modification time
#                 'source_id': file_path.name,
#             }

#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#         splits = text_splitter.split_documents(docs)
#         # vector_db.add_documents(documents=splits)

#     except Exception as exc:
#         print(f"Error processing {file_path}: {exc}")  # Optional: for debugging


# print('Done!')

Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]

RPC error: [insert_rows], <ParamError: (code=1, message=Field source_name don't match in entities[1])>, <Time:{'RPC start': '2024-11-05 12:30:47.035931', 'RPC error': '2024-11-05 12:30:47.176078'}>
Failed to insert batch starting at entity: 0/207


Error processing /home/jovyan/work/data/pdf_alice/alice-in-wonderland.pdf: <ParamError: (code=1, message=Field source_name don't match in entities[1])>
Done!


In [41]:
files = get_documents_from_subfolder('pdf')

In [42]:
loader = PyPDFLoader(files[0])
docs = loader.load()

Ignoring wrong pointing object 6 0 (offset 0)


In [43]:
docs[0]

Document(metadata={'source': '/home/jovyan/work/data/pdf/test.pdf', 'page': 0}, page_content='Test text in a pdf.  This is for tes1ng purposes. ')

In [14]:
del docs[0].metadata['page']

In [15]:
docs[0].page_content

'ALICE ’S ADVENTURES\nIN WONDERLAND\nby Lewis Carroll\nwith fourty-two illustrations by John Tenniel\nThis book is in public domain.\nNo rigths reserved. Free for copy and distribution.\nThis PDF book is designed and published by PDF REE BOOKS .ORG'

In [20]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs[:1])

In [21]:
splits[:1]

[Document(metadata={'source': '/home/jovyan/work/data/pdf/alice-in-wonderland.pdf'}, page_content='ALICE ’S ADVENTURES\nIN WONDERLAND\nby Lewis Carroll\nwith fourty-two illustrations by John Tenniel\nThis book is in public domain.\nNo rigths reserved. Free for copy and distribution.\nThis PDF book is designed and published by PDF REE BOOKS .ORG')]

In [4]:
# splits[:1]
from langchain_core.documents import Document

document_1 = Document(
    page_content='hello world',
    metadata={"source": "tweet"},
)

In [5]:
document_1

Document(metadata={'source': 'tweet'}, page_content='hello world')

In [6]:
vector_db.add_documents([document_1])

[453719138828026432]

In [23]:

document_2 = Document(
    page_content=splits[0].page_content,
    metadata={"source": "tweet"},
)
document_2

Document(metadata={'source': 'tweet'}, page_content='ALICE ’S ADVENTURES\nIN WONDERLAND\nby Lewis Carroll\nwith fourty-two illustrations by John Tenniel\nThis book is in public domain.\nNo rigths reserved. Free for copy and distribution.\nThis PDF book is designed and published by PDF REE BOOKS .ORG')

In [27]:
vector_db.add_documents(documents=splits)

[453719138828026438]

In [26]:
len(docs)

77