### Indexing API

In [16]:
from dotenv import load_dotenv, find_dotenv

# Locate the application's .env file and load its variables into the process
# environment; the call's return value is what the cell displays.
load_dotenv(dotenv_path=find_dotenv(filename='../application/.env'))

True

Let's add documents and embeddings!

In [17]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Load every file under ./restaurant that matches the **/*.txt glob as plain text.
loader = DirectoryLoader(
    './restaurant',
    glob="**/*.txt",
    loader_cls=TextLoader,
)
data = loader.load()

# Split the loaded documents into chunks that are later indexed one by one.
# NOTE(review): the splitter reports chunks longer than chunk_size when run;
# presumably it only cuts at its separator — confirm if strict sizes matter.
text_splitter = CharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=20,
)
docs = text_splitter.split_documents(data)

# Number of chunks that will be handed to the index.
print(len(docs))

Created a chunk of size 347, which is longer than the specified 150
Created a chunk of size 257, which is longer than the specified 150
Created a chunk of size 229, which is longer than the specified 150
Created a chunk of size 264, which is longer than the specified 150
Created a chunk of size 252, which is longer than the specified 150
Created a chunk of size 351, which is longer than the specified 150
Created a chunk of size 437, which is longer than the specified 150


12


In [18]:
import os
import warnings
from urllib.parse import quote_plus


def build_connection_string(user, password, host, database, port=5432, hide_password=False):
    """Build the SQLAlchemy URL for a PostgreSQL database via psycopg2.

    The user name and password are percent-encoded so that reserved URL
    characters in them (for example ``@``, ``/``, ``:`` or ``#``) cannot
    corrupt the URL.

    Args:
        user: Database user name; ``None`` is treated as an empty string.
        password: Database password; ``None`` is treated as an empty string.
        host: Database host name; ``None`` is treated as an empty string.
        database: Database name; ``None`` is treated as an empty string.
        port: TCP port of the server.
        hide_password: When true, the password is replaced by ``***`` so the
            result can be displayed or logged. Such a URL is for display
            only and cannot be used to connect.

    Returns:
        The connection URL as a string.
    """
    encoded_user = quote_plus(user or "")
    secret = "***" if hide_password else quote_plus(password or "")
    return f"postgresql+psycopg2://{encoded_user}:{secret}@{host or ''}:{port}/{database or ''}"


host = os.getenv("PG_VECTOR_HOST")
user = os.getenv("PG_VECTOR_USER")
password = os.getenv("PG_VECTOR_PASSWORD")
database = os.getenv("PGDATABASE")
COLLECTION_NAME = "langchain_collection"

# Point out missing settings right here instead of letting them surface later
# as an opaque connection error.
missing_settings = [
    name
    for name, value in (
        ("PG_VECTOR_HOST", host),
        ("PG_VECTOR_USER", user),
        ("PG_VECTOR_PASSWORD", password),
        ("PGDATABASE", database),
    )
    if not value
]
if missing_settings:
    warnings.warn(
        "Missing database settings in the environment: " + ", ".join(missing_settings)
    )

CONNECTION_STRING = build_connection_string(user, password, host, database)

# Display the URL with the password masked so the secret is never stored in
# the notebook's saved output.
build_connection_string(user, password, host, database, hide_password=True)

'postgresql+psycopg2://codingcrashcourses:***@udemypgvectordatabase22.postgres.database.azure.com:5432/pgvector'

In [19]:
from langchain_openai.embeddings import OpenAIEmbeddings
# Import PGVector from langchain_community directly; the old
# `langchain.vectorstores.pgvector` path is only a deprecated re-export.
from langchain_community.vectorstores.pgvector import PGVector
# NOTE(review): the separate `langchain_postgres` package also ships a PGVector
# class; presumably its constructor arguments differ, so it is not treated as
# a drop-in replacement here — confirm before switching.

# Embedding model used to vectorise each chunk.
embeddings = OpenAIEmbeddings()

# Vector store backed by the PostgreSQL database configured above.
vector_store = PGVector(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [20]:
from langchain.indexes import SQLRecordManager, index

In [21]:
# The record manager keeps its bookkeeping under this namespace, in the same
# database the vector store connects to.
namespace = "pgvector/" + COLLECTION_NAME
record_manager = SQLRecordManager(namespace=namespace, db_url=CONNECTION_STRING)

In [22]:
# Create the record manager's schema in the database before the first index() call.
record_manager.create_schema()

In [23]:
# import os
# os.environ.get('OPENAI_API_KEY')

Index the documents once (1st run), then update them and re-index to see the changes (2nd run)

In [24]:
# First indexing pass with cleanup disabled: the returned counters report how
# many chunks were added, updated, skipped and deleted.
index(
    docs_source=docs,
    record_manager=record_manager,
    vector_store=vector_store,
    cleanup=None,
    source_id_key="source",
)

{'num_added': 12, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [25]:
from langchain.schema import Document

# Simulate changes to the corpus: edit one chunk, drop one, add a new one.
docs[1].page_content = "updated"
docs.pop(6)
docs.append(
    Document(
        page_content="new content",
        metadata={"source": "important"},
    )
)

In [26]:
# Re-index the modified list, still with cleanup=None: the recorded counters
# show unchanged chunks being skipped and nothing being deleted.
index(
    docs_source=docs,
    record_manager=record_manager,
    vector_store=vector_store,
    cleanup=None,
    source_id_key="source",
)

{'num_added': 2, 'num_updated': 0, 'num_skipped': 10, 'num_deleted': 0}

In [27]:
# Second round of changes: edit a chunk again, drop three, add one more.
docs[1].page_content = "updated again"
# Each removal shifts the remaining items left, so these three calls remove
# the items that were at positions 2, 4 and 6 before this cell ran.
docs.pop(2)
docs.pop(3)
docs.pop(4)
docs.append(
    Document(
        page_content="more new content",
        metadata={"source": "important"},
    )
)

In [28]:
# Same call, but with incremental cleanup: this is the first run whose
# recorded counters report deleted entries.
index(
    docs_source=docs,
    record_manager=record_manager,
    vector_store=vector_store,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 2, 'num_updated': 0, 'num_skipped': 8, 'num_deleted': 6}

In [29]:
# Incremental cleanup with an empty batch: the recorded counters are all zero,
# i.e. nothing is removed when no documents are passed in this mode.
index(
    docs_source=[],
    record_manager=record_manager,
    vector_store=vector_store,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [30]:
# Full cleanup with an empty batch: per the recorded counters this deletes the
# entries that were still indexed.
index(
    docs_source=[],
    record_manager=record_manager,
    vector_store=vector_store,
    cleanup="full",
    source_id_key="source",
)

{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 10}