In [None]:
import dotenv
import os
import pandas as pd
import sys

from pathlib import Path

In [None]:
# Load and set environment
#
# Loads variables via `dotenv.load_dotenv()`, sets a USER_AGENT value, resolves
# the project root (PROJECT_HOME env var, falling back to the parent of the
# current working directory) and makes that root importable.

dotenv.load_dotenv()
# NOTE(review): this assignment runs after load_dotenv(), so it replaces any
# USER_AGENT value that was already present in the environment — confirm that
# is intended.
os.environ['USER_AGENT'] = 'myagent'
PROJECT_HOME = Path(os.environ.get('PROJECT_HOME', Path.cwd() / '..')).resolve()

# Guarded so that re-running this cell does not keep appending the same entry
# to sys.path.
if str(PROJECT_HOME) not in sys.path:
    sys.path.append(str(PROJECT_HOME))

----

In [None]:
# Project-local Milvus wrapper (presumably found through the PROJECT_HOME entry
# added to sys.path earlier in this notebook — verify).
from app.databases.milvus import Milvus

# Instantiate with no arguments. Keyword arguments the author left disabled:
#   auto_id=True
#   drop_old=True   # Drop existing values inside the collection
vector_db = Milvus()

In [None]:
# Scan files and load them into the vector DB.
#
# For every row of `drive_files.csv`, the matching file under
# `data/gdrive-files` (named `<ID>_<Name>`) is loaded with Docx2txtLoader,
# tagged with the row's URL / Name / Modified Time, split into overlapping
# chunks and added to `vector_db`. A failure on one file is printed and the
# loop moves on to the next file.

from langchain_community.document_loaders.word_document import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm

# File paths
docs_path = PROJECT_HOME / 'data' / 'gdrive-files'
index_df = pd.read_csv(docs_path.parent / 'drive_files.csv')

# The splitter settings are the same for every file, so build it once instead
# of once per row.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Scan all files that appear in the CSV.
for row_label, file_rec in tqdm(index_df.iterrows(), total=len(index_df)):

    # Reset per row so the error report below never shows the previous row's
    # path (or hits an unbound name) when building the path itself fails.
    fs_file_path = None

    try:
        fs_file_path = docs_path / f'{file_rec["ID"]}_{file_rec["Name"]}'

        # Some files are missing a `.docx` in the `Name` column.
        try:
            loader = Docx2txtLoader(fs_file_path)
        except ValueError:
            fs_file_path = Path(str(fs_file_path) + '.docx')
            loader = Docx2txtLoader(fs_file_path)

        docs = loader.load()

        # Add metadata to all file chunks
        for doc in docs:
            doc.metadata = doc.metadata | {
                'source_id': file_rec['URL'],
                'source_name': file_rec['Name'],
                'modified_at': file_rec['Modified Time'],
            }

        splits = text_splitter.split_documents(docs)
        vector_db.add_documents(documents=splits)

    except Exception as exc:
        # If a file failed for some reason, just print the path and continue to the next file.
        # We can't stop everything for every failure.
        # When the path could not even be built, identify the row by its index label instead.
        failed_item = fs_file_path if fs_file_path is not None else f'<index row {row_label}>'
        print(failed_item, flush=True)
        print(exc, flush=True)
        continue

print('Done!')

In [None]:
# # Alternative (disabled): load every .docx found under docs_path via glob, without the CSV index or extra metadata.

# from langchain_community.document_loaders.word_document import Docx2txtLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from tqdm import tqdm

# docs_path = PROJECT_HOME / 'data' / 'gdrive-files'
# for file_path in tqdm(docs_path.glob('**/*.docx')):
#     loader = Docx2txtLoader(file_path)
#     docs = loader.load()

#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#     splits = text_splitter.split_documents(docs)
#     vector_db.add_documents(documents=splits)

# print('Done!')

----

## Trying it out

In [None]:
# Retriever view over the vector store; `k` is passed through as a search
# keyword argument (presumably the number of results returned per query — verify).
retriever = vector_db.as_retriever(search_kwargs=dict(k=5))

In [None]:
# retriever.invoke('onelogin')

----

## Adding a Text Directly

In [None]:
# from langchain_community.document_loaders import text_to_docs
# from langchain.schema import Document

In [None]:
# texts = [
#     'The recommended headphones to use while listening to music are BoseQC35',
#     'The recommended headphones to use while listening to podcasts are Airpods Pro',
# ]

# docs = [Document(page_content=text, metadata={
#     # 'tags': ['headphones', 'music'],
#     'modified_at': '2024-09-22',
#     'source_id': 'NA',
#     'source_name': 'NA',
# }) for text in texts]

# docs

In [None]:
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# splits = text_splitter.split_documents(docs)

In [None]:
# res = vector_db.add_documents(documents=splits)
# res

In [None]:
# retriever.invoke('headphones')

In [None]:
# # Deleting a document from the DB
# res = vector_db.delete(expr='source_id like "%NA%"')
# type(res), res