----

## Initial Vector DB Setup

In [None]:
import os
import sys
from datetime import datetime
from pathlib import Path

import dotenv
import pandas as pd

from app.indexing.metadata import DocumentMetadata

In [None]:
# Load and set environment

dotenv.load_dotenv()
os.environ['USER_AGENT'] = 'myagent'
# PROJECT_HOME comes from the environment when set, otherwise the parent of the
# current working directory; resolve() turns it into an absolute path.
PROJECT_HOME = Path(os.environ.get('PROJECT_HOME', Path.cwd() / '..')).resolve()
# Guard the append so re-running this cell does not add duplicate entries to sys.path.
if str(PROJECT_HOME) not in sys.path:
    sys.path.append(str(PROJECT_HOME))

In [None]:
# from app.databases.vector.milvus import Milvus
from app.databases.vector import VectorDB

# Instantiate the project's VectorDB with no arguments; the keyword arguments
# listed inside the call are commented out and therefore not applied.
vector_db = VectorDB(
    # auto_id=True,
    # drop_old=True,  # Drop existing values inside the collection
)

---

## Ingesting docx documents


In [None]:

# # Scan files and load them into the vector DB.

# from langchain_community.document_loaders.word_document import Docx2txtLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from tqdm.notebook import tqdm
# from datetime import datetime

# # File paths
# docs_path = PROJECT_HOME / 'data' / 'docx'
# # index_df = pd.read_csv(docs_path.parent / 'drive_files.csv')

# # Scan all files that appear in the CSV.
# from pathlib import Path
# from tqdm import tqdm

# # Specify your base directory containing the documents
# docs_path = Path(docs_path)

# # Get all .docx files in the directory
# files = list(docs_path.glob("*.docx"))

# for file_path in tqdm(files, desc="Processing documents"):
#     try:
#         loader = Docx2txtLoader(file_path)
#         docs = loader.load()

#         # Convert timestamp to formatted string
#         timestamp = file_path.stat().st_mtime
#         modified_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')


#         # Add metadata to all file chunks
#         for doc in docs:
#             doc.metadata = doc.metadata | {
#                 'source_name': file_path.name,
#                 'modified_at': modified_date,  # Gets file modification time
#                 'source_id': file_path.name,
#             }

#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#         splits = text_splitter.split_documents(docs)
#         vector_db.add_documents(documents=splits)

#     except Exception as exc:
#         print(f"Error processing {file_path}: {exc}")  # Optional: for debugging


# print('Done!')

# Ingesting documents


In [None]:
from pathlib import Path
from tqdm.notebook import tqdm


DOCS_BASE_PATH = PROJECT_HOME / 'data'

def get_documents_from_subfolder(subpath):
    '''
    Returns the document files found directly inside a sub-path of DOCS_BASE_PATH.

    subpath is the folder name relative to DOCS_BASE_PATH, e.g. "txt".
    Only regular files are returned (sub-directories are not documents and are
    skipped), sorted by path so repeated runs see the files in the same order.
    '''
    folder = DOCS_BASE_PATH / subpath
    return sorted(path for path in folder.glob("*") if path.is_file())

In [None]:
from langchain_community.document_loaders.text import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.word_document import Docx2txtLoader

def get_splits_from_paths(file_paths, loader):
    '''
    Returns langchain Documents split using a RecursiveCharacterTextSplitter (for now).
    Their metadata is set to our project metadata.

    file_paths is an iterable of pathlib.Path objects pointing at the files to load.
    loader needs to be some langchain loader class, e.g. TextLoader; it is
    instantiated once per file.

    A file that fails to load or split is reported with a printed message and
    skipped, so one bad file does not abort the whole batch.
    '''
    # The splitter does not depend on the file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = []
    for file_path in tqdm(file_paths, desc="Processing documents"):
        try:
            # Keep the instance under its own name: rebinding `loader` here would
            # replace the loader class and break every file after the first one.
            file_loader = loader(file_path)
            docs = file_loader.load()

            # File modification time, kept as a datetime object
            timestamp = file_path.stat().st_mtime
            modified_date = datetime.fromtimestamp(timestamp)

            # Replace the loader's metadata with our project metadata on every document
            for doc in docs:
                metadata = DocumentMetadata(source_id=file_path.name,
                                            source_name=file_path.name,
                                            modified_at=modified_date)
                doc.metadata = metadata.to_dict()

            splits += text_splitter.split_documents(docs)

        except Exception as exc:
            print(f"Error processing {file_path}: {exc}")  # Optional: for debugging
    return splits
    
def get_txt_splits_from_paths(file_paths):
    ''' Returns the splits for plain-text files, loading them with TextLoader. '''
    return get_splits_from_paths(file_paths, TextLoader)


def get_pdf_splits_from_paths(file_paths):
    ''' Returns the splits for PDF files, loading them with PyPDFLoader. '''
    return get_splits_from_paths(file_paths, PyPDFLoader)


def get_docx_splits_from_paths(file_paths):
    ''' Returns the splits for Word files, loading them with Docx2txtLoader. '''
    return get_splits_from_paths(file_paths, Docx2txtLoader)


In [None]:
# Preview the paths found under the "txt" sub-folder of DOCS_BASE_PATH.
get_documents_from_subfolder("txt")

## Ingesting txt documents

In [None]:
# Load and split every file in the "txt" sub-folder, then show the first two splits.
txt_splits = get_txt_splits_from_paths(get_documents_from_subfolder('txt'))
txt_splits[:2]

In [None]:
# Add the txt splits to the vector DB.
vector_db.add_documents(documents=txt_splits)

## Ingesting PDF files

In [None]:
# Load and split the PDFs from the "pdf_alice" and "pdf" sub-folders separately.
pdf_splits_alice = get_pdf_splits_from_paths(get_documents_from_subfolder("pdf_alice"))
pdf_splits_test =  get_pdf_splits_from_paths(get_documents_from_subfolder("pdf"))

In [None]:
# Only the pdf_alice splits are added to the vector DB here; adding
# pdf_splits_test is commented out.
# vector_db.add_documents(documents=pdf_splits_test)
vector_db.add_documents(documents=pdf_splits_alice)

## Ingesting Docx files

In [None]:
# Load and split every file in the "docx" sub-folder.
docx_splits =  get_docx_splits_from_paths(get_documents_from_subfolder("docx"))

In [None]:
# Add the docx splits to the vector DB.
vector_db.add_documents(documents=docx_splits)

In [None]:

# # Scan files and load them into the vector DB.

# from langchain_community.document_loaders.text import TextLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from tqdm.notebook import tqdm
# from datetime import datetime


# # Get all files in the txt directory
# files = get_documents_from_subfolder("txt")

# for file_path in tqdm(files, desc="Processing documents"):
#     try:
#         loader = TextLoader(file_path)
#         docs = loader.load()

#         # Convert timestamp to formatted string
#         timestamp = file_path.stat().st_mtime
#         modified_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

#         # Add metadata to all file chunks
#         for doc in docs:
#             doc.metadata = doc.metadata | {
#                 'source_name': file_path.name,
#                 'modified_at': modified_date,  # Gets file modification time
#                 'source_id': file_path.name,
#             }

#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#         splits = text_splitter.split_documents(docs)
#         vector_db.add_documents(documents=splits)

#     except Exception as exc:
#         print(f"Error processing {file_path}: {exc}")  # Optional: for debugging


# print('Done!')

In [None]:
# Load only the first file of the "txt" sub-folder with TextLoader, to inspect
# what the loader returns. files[0] raises IndexError if the folder has no entries.
files = get_documents_from_subfolder("txt")

loader = TextLoader(files[0])
docs = loader.load()

In [None]:
# Show the metadata the loader attached to the first loaded document.
docs[0].metadata

## Ingesting PDFs (old approach)

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
from datetime import datetime


In [None]:
def get_pdf_splits_from_paths(files):
    '''
    Loads each path in files with PyPDFLoader and returns the chunks produced by a
    RecursiveCharacterTextSplitter (chunk_size=1000, chunk_overlap=200).

    Only the first document loaded from each file has its metadata replaced by the
    project fields (source_name, modified_at, source_id); any further documents keep
    the metadata the loader gave them. modified_at is the file modification time
    formatted as 'YYYY-MM-DD HH:MM:SS'.

    A file that raises is reported with a printed message and skipped.

    NOTE: this definition reuses the name of the get_pdf_splits_from_paths wrapper
    defined earlier in the notebook and replaces it once this cell has run.
    '''
    collected = []
    for pdf_path in tqdm(files, desc="Processing documents"):
        try:
            loaded_docs = PyPDFLoader(pdf_path).load()

            # File modification time rendered as a formatted string
            mtime = pdf_path.stat().st_mtime
            stamp = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')

            # Temp - only the first loaded document gets the project metadata
            for first_doc in loaded_docs[:1]:
                first_doc.metadata = {
                    'source_name': pdf_path.name,
                    'modified_at': stamp,
                    'source_id': pdf_path.name,
                }

            splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            collected.extend(splitter.split_documents(loaded_docs))

        except Exception as err:
            print(f"Error processing {pdf_path}: {err}")  # Optional: for debugging
    return collected
    
        

In [None]:
# Split the PDFs found in the "pdf_alice" sub-folder.
alice_splits = get_pdf_splits_from_paths(get_documents_from_subfolder('pdf_alice'))

In [None]:
# Split the PDFs found in the "pdf" sub-folder. Assigned to test_splits only, so the
# alice_splits computed in the previous cell is not overwritten.
test_splits = get_pdf_splits_from_paths(get_documents_from_subfolder('pdf'))

In [None]:
# Display both lists of splits side by side.
test_splits, alice_splits

In [None]:
# # Scan files and load them into the vector DB.
# files = get_documents_from_subfolder('pdf_alice')

# for file_path in tqdm(files, desc="Processing documents"):
#     try:
#         loader = PyPDFLoader(file_path)
#         docs = loader.load()

#         # Convert timestamp to formatted string
#         timestamp = file_path.stat().st_mtime
#         modified_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

#         # Add metadata to all file chunks
#         for doc in docs[:1]:
#             doc.metadata = doc.metadata | {
#                 'source_name': file_path.name,
#                 'modified_at': modified_date,  # Gets file modification time
#                 'source_id': file_path.name,
#             }

#         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#         splits = text_splitter.split_documents(docs)
#         # vector_db.add_documents(documents=splits)

#     except Exception as exc:
#         print(f"Error processing {file_path}: {exc}")  # Optional: for debugging


# print('Done!')

In [None]:
# List the paths in the "pdf" sub-folder for the manual experiments below.
files = get_documents_from_subfolder('pdf')

In [None]:
# Load only the first listed file with PyPDFLoader.
# files[0] raises IndexError if the folder listing is empty.
loader = PyPDFLoader(files[0])
docs = loader.load()

In [None]:
# Display the first document returned by the loader.
docs[0]

In [None]:
# Drop the 'page' entry from the first document's metadata. pop() with a default
# keeps this cell re-runnable: a second run no longer raises KeyError.
# The trailing semicolon stops the notebook from echoing the popped value.
docs[0].metadata.pop('page', None);

In [None]:
# Display the text content of the first loaded document.
docs[0].page_content

In [None]:
# Split only the first loaded document (docs[:1]) into chunks of up to 1000
# characters with 200 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs[:1])

In [None]:
# Display the first split.
splits[:1]

In [None]:
# splits[:1]
from langchain_core.documents import Document

# Hand-built minimal Document, used below to try add_documents on the vector DB.
document_1 = Document(
    page_content='hello world',
    metadata={"source": "tweet"},
)

In [None]:
# Display the hand-built document.
document_1

In [None]:
# Add the single hand-built document to the vector DB.
vector_db.add_documents([document_1])

In [None]:

# Second hand-built Document: the text of the first split combined with the
# same hand-written metadata as document_1; displayed as the cell output.
document_2 = Document(
    page_content=splits[0].page_content,
    metadata={"source": "tweet"},
)
document_2

In [None]:
# Add the current `splits` list to the vector DB (when run top to bottom, these
# are the splits of the first loaded PDF document created above).
vector_db.add_documents(documents=splits)

In [None]:
# Number of documents the loader returned for the file loaded above.
len(docs)