----

## Initial Vector DB Setup

In [None]:
import dotenv
import os
import sys

from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm

In [None]:
# Environment setup: let dotenv populate the environment, then set what this notebook needs.

dotenv.load_dotenv()
os.environ['USER_AGENT'] = 'myagent'

# Project root: the PROJECT_HOME variable when it is set, otherwise the parent
# of the current working directory. It is appended to sys.path so that modules
# located under it can be imported.
PROJECT_HOME = Path(
    os.getenv('PROJECT_HOME', Path.cwd().joinpath('..'))
).resolve()
sys.path.append(os.fspath(PROJECT_HOME))

In [None]:
from app.indexing.metadata import DocumentMetadata
from app.databases.vector import VectorDB

# Vector DB handle used by the ingestion cells, created with VectorDB's defaults.
# Optional keyword arguments that can be enabled here:
#   auto_id=True
#   drop_old=True   # Drop existing values inside the collection
vector_db = VectorDB()

---

# Ingesting documents


In [None]:
# Root folder holding the documents to ingest, one subfolder per file type.
DOCS_BASE_PATH = PROJECT_HOME.joinpath('data')

def get_documents_from_subfolder(subpath, base_path=None):
    '''
    Returns all the files located directly inside a sub-path of the DOCS_BASE_PATH.

    Only regular files are returned: sub-directories are skipped because they
    cannot be handed to a document loader. The result is sorted so that the
    processing order does not depend on the filesystem.

    Args:
        subpath: folder name (or relative path) below the base folder, e.g. "txt".
        base_path: folder to look into instead of DOCS_BASE_PATH. Defaults to
            None, which means DOCS_BASE_PATH.

    Returns:
        A sorted list of pathlib.Path objects; empty if the folder does not exist.
    '''
    # DOCS_BASE_PATH is only looked up when no explicit base folder is given.
    root = DOCS_BASE_PATH if base_path is None else Path(base_path)
    folder = root / subpath
    if not folder.is_dir():
        return []
    return sorted(entry for entry in folder.glob("*") if entry.is_file())

In [None]:
from langchain_community.document_loaders.text import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.word_document import Docx2txtLoader

def get_splits_from_paths(file_paths, loader, *, chunk_size=1000, chunk_overlap=200):
    '''
    Loads every file in file_paths and returns the resulting langchain Documents
    split into chunks by a RecursiveCharacterTextSplitter (for now).

    Before splitting, the metadata of each loaded Document is replaced with our
    project metadata (DocumentMetadata), built from the file name and the
    file's last-modification time.

    Args:
        file_paths: iterable of pathlib.Path objects pointing at the files to load.
        loader: some langchain loader, e.g. TextLoader. It is called with a
            single path and must return an object with a load() method.
        chunk_size: chunk size handed to the text splitter.
        chunk_overlap: chunk overlap handed to the text splitter.

    Returns:
        A flat list with the chunks of all files that were processed successfully.
        A file that raises while being processed is reported with print() and skipped.
    '''
    # The splitter does not depend on the file, so it is built once up front.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    splits = []
    for file_path in tqdm(file_paths, desc="Processing documents"):
        try:
            docs = loader(file_path).load()

            # Last-modification time of the file as a naive datetime object.
            modified_date = datetime.fromtimestamp(file_path.stat().st_mtime)

            # Overwrite the loader-provided metadata of every loaded Document.
            for doc in docs:
                metadata = DocumentMetadata(
                    source_id=file_path.name,
                    source_name=file_path.name,
                    modified_at=modified_date,
                )
                doc.metadata = metadata.to_dict()

            splits.extend(text_splitter.split_documents(docs))
        except Exception as exc:
            # Deliberate best effort: one bad file must not abort the whole run.
            print(f"Error processing {file_path}: {exc}")
    return splits
    
def get_txt_splits_from_paths(file_paths):
    ''' Returns the splits of the given files, loaded with TextLoader.'''
    return get_splits_from_paths(file_paths, loader=TextLoader)


def get_pdf_splits_from_paths(file_paths):
    ''' Returns the splits of the given files, loaded with PyPDFLoader.'''
    return get_splits_from_paths(file_paths, loader=PyPDFLoader)


def get_docx_splits_from_paths(file_paths):
    ''' Returns the splits of the given files, loaded with Docx2txtLoader.'''
    return get_splits_from_paths(file_paths, loader=Docx2txtLoader)


In [None]:
# Quick check: show what is found in the "txt" subfolder as the cell output.
get_documents_from_subfolder(subpath="txt")

## Ingesting txt documents

In [None]:
# Requires a subfolder called "txt" inside the /data/ folder.
txt_splits = get_txt_splits_from_paths(
    file_paths=get_documents_from_subfolder(subpath='txt'),
)
# Show the first two chunks as the cell output.
txt_splits[:2]

In [None]:
# Hand the txt chunks to the vector DB; the call's return value is the cell output.
vector_db.add_documents(documents=txt_splits)

## Ingesting PDF files

In [None]:
# Requires a subfolder called "pdf" inside the /data/ folder.
pdf_splits = get_pdf_splits_from_paths(
    file_paths=get_documents_from_subfolder(subpath="pdf"),
)

In [None]:
# Hand the PDF chunks to the vector DB; the call's return value is the cell output.
vector_db.add_documents(documents=pdf_splits)

## Ingesting Docx files

In [None]:
# Requires a subfolder called "docx" inside the /data/ folder.
docx_splits = get_docx_splits_from_paths(
    file_paths=get_documents_from_subfolder(subpath="docx"),
)

In [None]:
# Hand the docx chunks to the vector DB; the call's return value is the cell output.
vector_db.add_documents(documents=docx_splits)