LangChain indexing makes use of a record manager (RecordManager) that keeps track of document writes into a vector store.

When indexing content, hashes are computed for each document, and the following information is stored in the record manager.

In [1]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import ElasticsearchStore, Chroma
from langchain.indexes import SQLRecordManager
import sys 
sys.path.append(r'C:\Users\ELAFACRB1\Codice\GitHub\media-chat-service\src\embedding')

In [2]:
from langchain.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

In [3]:
from utils import EmbeddingFunction, ChromaDBManager
embedding = EmbeddingFunction('openAI').embedder
embedding

  warn_deprecated(


OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, async_client=None, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

In [4]:
collection_name = "test_index"
url="http://ec2-18-209-145-26.compute-1.amazonaws.com:9200"
namespace = f"elasticsearch/{collection_name}"

In [5]:
vector_store = ElasticsearchStore(
    collection_name,    
    es_url=url, 
    embedding=embedding
)

In [6]:
vector_store

<langchain_community.vectorstores.elasticsearch.ElasticsearchStore at 0x1cfba2e7a90>

Compatible Vectorstores: AnalyticDB, AstraDB, AwaDB, Bagel, Cassandra, Chroma, DashVector, DatabricksVectorSearch, DeepLake, Dingo, ElasticVectorSearch, ElasticsearchStore, FAISS, HanaDB, Milvus, MyScale, PGVector, Pinecone, Qdrant, Redis, ScaNN, SupabaseVectorStore, SurrealDBStore, TimescaleVector, Vald, Vearch, VespaStore, Weaviate, ZepVectorStore.

In [5]:
#initialize db
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)
record_manager.create_schema()
record_manager

<langchain.indexes._sql_record_manager.SQLRecordManager at 0x18f0af648d0>

In [4]:
#retieval wiki data 
wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())

In [5]:
retrieved_docs=wikipedia.run("CAL KESTIS")
retrieved_docs

'Page: Cal Kestis\nSummary: Cal Kestis is a fictional character in the Star Wars franchise. He is the playable protagonist of the Star Wars Jedi game series which includes the 2019 video game Star Wars Jedi: Fallen Order and its 2023 sequel, Star Wars Jedi: Survivor, developed by Respawn Entertainment. The character has also appeared in other media of the franchise, such as the novel Star Wars Jedi: Battle Scars by Sam Maggs, which bridges the gap between the two Star Wars Jedi games. Cal is played by American actor and model Cameron Monaghan through performance capture.\nWithin the franchise, Cal is a former Jedi Padawan and survivor of the Great Jedi Purge who, during the early reign of the Galactic Empire, lives in seclusion on the planet Bracca until an incident forces him to reveal his Force powers. Pursued throughout the galaxy by the Inquisitors, the Empire\'s Jedi hunters, Cal embarks on a quest to try and rebuild the Jedi Order as he is joined by a number of unlikely allies, w

split

In [6]:
from utils import TextSplitter
textSplitter = TextSplitter()
splitted_docs=textSplitter.split_text(retrieved_docs)
splitted_docs

['Page: Cal Kestis\nSummary: Cal Kestis is a fictional character in the Star Wars franchise. He is the playable protagonist of the Star Wars Jedi game series which includes the 2019 video game Star Wars Jedi: Fallen Order and its 2023 sequel, Star Wars Jedi: Survivor, developed by Respawn Entertainment.',
 'The character has also appeared in other media of the franchise, such as the novel Star Wars Jedi: Battle Scars by Sam Maggs, which bridges the gap between the two Star Wars Jedi games. Cal is played by American actor and model Cameron Monaghan through performance capture.\nWithin the franchise, Cal is a former Jedi Padawan and survivor of the Great Jedi Purge who, during the early reign of the Galactic Empire, lives in seclusion on the planet Bracca until an incident forces him to reveal his Force powers. Pursued throughout the galaxy by the Inquisitors, the Empire\'s Jedi hunters, Cal embarks on a quest to try and rebuild the Jedi Order as he is joined by a number of unlikely alli

In [7]:
from langchain.docstore.document import Document
final_docs=[]
for doc in splitted_docs:
    final_docs.append(Document(page_content=doc, metadata={"source": "local"}))
final_docs


[Document(page_content='Page: Cal Kestis\nSummary: Cal Kestis is a fictional character in the Star Wars franchise. He is the playable protagonist of the Star Wars Jedi game series which includes the 2019 video game Star Wars Jedi: Fallen Order and its 2023 sequel, Star Wars Jedi: Survivor, developed by Respawn Entertainment.', metadata={'source': 'local'}),
 Document(page_content='The character has also appeared in other media of the franchise, such as the novel Star Wars Jedi: Battle Scars by Sam Maggs, which bridges the gap between the two Star Wars Jedi games. Cal is played by American actor and model Cameron Monaghan through performance capture.\nWithin the franchise, Cal is a former Jedi Padawan and survivor of the Great Jedi Purge who, during the early reign of the Galactic Empire, lives in seclusion on the planet Bracca until an incident forces him to reveal his Force powers. Pursued throughout the galaxy by the Inquisitors, the Empire\'s Jedi hunters, Cal embarks on a quest to 

In [16]:
#with chroma vector store
persist_directory=r'C:\Users\ELAFACRB1\Codice\GitHub\media-chat-service\storage'
vectorstore = Chroma.from_documents(final_docs, embedding, persist_directory=persist_directory)
vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x1cfbb990490>

indexing

this should be done ayncronously

In [17]:
from langchain.indexes import index

index(
    final_docs,
    record_manager,
    vector_store,
    cleanup="full",
    source_id_key="source"
)

{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 7}

retrieval

In [21]:
results = vector_store.similarity_search(
    "where's the first appearence of Cal Kestis?", k=4
)
results

[Document(page_content='Page: Cal Kestis\nSummary: Cal Kestis is a fictional character in the Star Wars franchise. He is the playable protagonist of the Star Wars Jedi game series which includes the 2019 video game Star Wars Jedi: Fallen Order and its 2023 sequel, Star Wars Jedi: Survivor, developed by Respawn Entertainment. The character has also appeared in other media of the franchise, such as the novel Star Wars Jedi: Battle Scars by Sam Maggs, which bridges the gap between the two Star Wars Jedi games. Cal is played by American actor and model Cameron Monaghan through performance capture.\nWithin the franchise, Cal is a former Jedi Padawan and survivor of the Great Jedi Purge who, during the early reign of the Galactic Empire, lives in seclusion on the planet Bracca until an incident forces him to reveal his Force powers. Pursued throughout the galaxy by the Inquisitors, the Empire\'s Jedi hunters, Cal embarks on a quest to try and rebuild the Jedi Order as he is joined by a numbe

# WIth Qdrant

In [None]:
# !docker run --restart unless-stopped -d --name qdrant -p 6333:6333 -p 6334:6334 -v $(pwd)/qdrant_storage:/qdrant/storage:z qdrant/qdrant

In [10]:
import qdrant_client

In [12]:
from qdrant_client import QdrantClient
client = QdrantClient("http://ec2-18-209-145-26.compute-1.amazonaws.com:6333/dashboard", port=6333)
client

<qdrant_client.qdrant_client.QdrantClient at 0x18f0c296c50>

In [20]:
from qdrant_client.http.models import Distance, VectorParams
client.create_collection(
    collection_name="openai_documents",
    vectors_config=VectorParams(size=1536, distance=Distance.DOT),
)

True

See [Dealing with Vector Dimension Mismatch: My Experience with OpenAI Embeddings and Qdrant Vector Storage](https://medium.com/@epappas/dealing-with-vector-dimension-mismatch-my-experience-with-openai-embeddings-and-qdrant-vector-20a6e13b6d9f)

### With langchain clients

In [23]:
from langchain.vectorstores import Qdrant
vector_store = Qdrant(
    client=client, collection_name="openai_documents",
    embeddings=embedding,
)
vector_store

<langchain_community.vectorstores.qdrant.Qdrant at 0x18f12aac9d0>

In [24]:
from langchain.indexes import index

index(
    final_docs,
    record_manager,
    vector_store,
    cleanup="full",
    source_id_key="source"
)

{'num_added': 7, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 2}

# With utils

In [8]:
import sys 
sys.path.append(r'C:\Users\ELAFACRB1\Codice\GitHub\media-chat-service\src\embedding')
from utils import QDrantDBManager

In [13]:
qdrantClient = QDrantDBManager(
    url="http://ec2-18-209-145-26.compute-1.amazonaws.com:6333/dashboard",
    port=6333,
    collection_name="wikipedia_3",
    vector_size=1536,
    embedding=embedding,
    record_manager_url="sqlite:///record_manager_cache.sql"
)

In [14]:
qdrantClient.client

<qdrant_client.qdrant_client.QdrantClient at 0x2a909274410>

In [15]:
qdrantClient.index_documents(final_docs)