In [1]:
from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.embedder.openai import OpenAIEmbedder
from phi.knowledge.pdf import PDFUrlKnowledgeBase
from phi.vectordb.pgvector import PgVector, SearchType

In [2]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.supabase import SupabaseVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from supabase.client import Client, create_client

In [3]:
import os 
from dotenv import load_dotenv

# Read a local .env file (if one exists) into os.environ. `load_dotenv` was
# imported above but never called, so .env values were previously ignored.
load_dotenv()

# Fail early with one readable message instead of an opaque TypeError from
# assigning None into os.environ, or a later failure inside create_client.
_required_env_vars = ("OPENAI_API_KEY", "SUPABASE_URL", "SUPABASE_KEY")
_missing_env_vars = [name for name in _required_env_vars if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

# OPENAI_API_KEY is already present in os.environ at this point, so the
# original self-assignment is unnecessary; only the Supabase values need names.
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")

In [8]:
# Folder (relative to the notebook) that holds the markdown documents.
directory = "../documents"

# Supabase client and the embedding model used when uploading chunks.
supabase: Client = create_client(supabase_url, supabase_key)
embeddings = OpenAIEmbeddings()

# Loader for every file under `directory` matching the `**/*.md` glob.
loader = DirectoryLoader(
    path=directory,
    glob="**/*.md",
    loader_cls=UnstructuredMarkdownLoader,
)

# Markdown-aware splitter: 1000-unit chunks with a 100-unit overlap.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=1000,
    chunk_overlap=100,
)

In [10]:
# Sanity check: load every matching file and display the resulting Document list.
loader.load()

[Document(metadata={'source': '../documents/Crop Watch 29th February 2024 (1).pdf.md'}, page_content='This image contains detailed information about rice production in Sri Lanka, particularly focusing on paddy cultivation. Here\'s a breakdown of the key information:\n\nSeasonal Production Data:\n\nThe table shows paddy production from 2010 to 2023, with a forecast for 2024.\n\nProduction is divided into Maha and Yala seasons, with total production given.\n\nIn 2023, the total production was 4,526 thousand metric tons (mt).\n\nThe 2024 forecast shows a total production of 4,639 thousand mt.\n\nProduction Graph:\n\nThe graph illustrates the seasonal production trends from 2007 to 2024.\n\nIt shows Yala and Maha season production separately.\n\nThere\'s a notable increase in production for 2023 and 2024, especially in the Maha season.\n\nDistrict-wise Expected Production:\n\nA detailed table lists expected production for various districts and regions.\n\nThe highest production is expected

In [11]:
def prepare_database(table_name: str = "documents", chunk_size: int = 500) -> None:
    """Load, split, embed and upload the markdown documents to Supabase.

    Relies on the notebook-level ``loader``, ``text_splitter``, ``embeddings``
    and ``supabase`` objects created in the setup cell.

    Args:
        table_name: Supabase table passed to ``SupabaseVectorStore.from_documents``.
            Defaults to the previously hard-coded ``"documents"``.
        chunk_size: Forwarded unchanged to ``SupabaseVectorStore.from_documents``
            (presumably the upload batch size - confirm against the LangChain
            docs). This is unrelated to the text splitter's own ``chunk_size``.

    Raises:
        ValueError: If the loader returns no documents or splitting yields no
            chunks, so an empty run is never reported as "database ready".

    NOTE(review): no ids are passed, so re-running presumably inserts the same
    chunks again - confirm before calling this against a populated table.
    """
    docs = loader.load()
    if not docs:
        raise ValueError(
            "No documents were loaded; check the loader's directory and glob pattern."
        )

    documents = text_splitter.split_documents(docs)
    if not documents:
        raise ValueError("The loaded documents produced no chunks to index.")

    SupabaseVectorStore.from_documents(
        documents=documents,
        embedding=embeddings,
        client=supabase,
        table_name=table_name,
        chunk_size=chunk_size,
    )
    print("database ready")

In [12]:
# Run the ingestion; the function prints "database ready" once the upload call returns.
prepare_database()

database ready
