In [1]:
import os
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter



In [13]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [25]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_qdrant import FastEmbedSparse, RetrievalMode

In [5]:
# Set up the chat LLM and the embedding model (both Azure OpenAI clients).

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups below can resolve them. Without this call, values that
# exist only in .env are not visible. With the default override=False,
# variables already set in the environment are left as they are.
load_dotenv()

llm = AzureChatOpenAI(
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_deployment=os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    temperature=0,
)

# NOTE(review): unlike the chat model above, no openai_api_version is passed
# here, and the deployment name is hard-coded rather than read from the
# environment -- confirm both are intended.
embeddings = AzureOpenAIEmbeddings(
    openai_api_type="azure",
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    deployment="text-embedding-ada-002",
    model="text-embedding-ada-002",
)

# Qdrant setup

In [8]:
# Create a Qdrant client and an empty collection, then expose the collection
# through LangChain's vector-store interface.
# On-disk alternative: client = QdrantClient(path="/tmp/langchain_qdrant")
client = QdrantClient(":memory:")  # in-memory storage

demo_collection_name = "demo_collection"

# The vector size must match the output dimension of the embedding model
# (1536 for text-embedding-ada-002); similarity is measured by cosine distance.
demo_vector_params = VectorParams(size=1536, distance=Distance.COSINE)

client.create_collection(
    collection_name=demo_collection_name,
    vectors_config=demo_vector_params,
)

# Wrap the collection so documents are embedded with `embeddings` on insert
# and on query.
vector_store = QdrantVectorStore(
    client=client,
    collection_name=demo_collection_name,
    embedding=embeddings,
)

In [12]:
# Step 2a: Load the PDF. Splitting into chunks happens in the next cell.
pdf_loader = PyPDFLoader(
    "./data/21-CFR-gmp.pdf"  # source PDF, relative to the notebook's working directory
)
# load() returns the loaded pages as a list of documents.
documents = pdf_loader.load()

In [14]:
# Step 2b: Break the loaded pages into smaller, overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # upper bound on the size of each chunk
    chunk_overlap=200,  # amount shared between neighbouring chunks
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Step 3: Embed every chunk and store it in the Qdrant-backed vector store.
# The return value of add_documents is kept in a variable rather than discarded.
stored_chunk_ids = vector_store.add_documents(chunks)
print("PDF content successfully stored in the vector store!")

In [23]:
# Step 4: Query the vector store and keep the two most similar chunks.
query = "What controls must be exercised over computer systems to ensure the accuracy and security of production records?"
results = vector_store.similarity_search(query, k=2)

# Bare last expression so the notebook displays the retrieved documents.
results

[Document(metadata={'source': './data/21-CFR-gmp.pdf', 'page': 5, '_id': 'bf93b35ec2f440049bdc35487bb02dea', '_collection_name': 'demo_collection'}, page_content='163 Food and Drug Administration, HHS ยง 211.82 \nchecks and inspections shall be main-\ntained. \n(b) Appropriate controls shall be ex-\nercised over computer or related sys-tems to assure that changes in master production and control records or other records are instituted only by author-ized personnel. Input to and output from the computer or related system of formulas or other records or data shall be checked for accuracy. The degree and frequency of input/output verification shall be based on the com-plexity and reliability of the computer or related system. A backup file of data entered into the computer or related system shall be maintained except where certain data, such as calcula-tions performed in connection with lab-oratory analysis, are eliminated by computerization or other automated processes. In such instances

In [None]:
# Print each retrieved chunk as a bullet: its text followed by its metadata
# in square brackets.
for res in results:
    hit_line = f"* {res.page_content} [{res.metadata}]"
    print(hit_line)

# Qdrant hybrid search

In [27]:
# Hybrid retrieval: build a second, in-memory Qdrant collection that indexes
# the chunks with both the dense `embeddings` model and a sparse BM25 model.
#
# FastEmbedSparse needs the optional `fastembed` package; without it this cell
# raises ValueError. Install it with `pip install fastembed` (or
# `pip install fastembed-gpu`) before running.
#
# These names are also imported in the import cell near the top of the
# notebook; the import is repeated here so the cell is self-contained.
from langchain_qdrant import FastEmbedSparse, RetrievalMode

sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

qdrant = QdrantVectorStore.from_documents(
    documents=chunks,
    collection_name="my_documents-Hybrid",
    location=":memory:",
    retrieval_mode=RetrievalMode.HYBRID,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
)

  from .autonotebook import tqdm as notebook_tqdm


ValueError: The 'fastembed' package is not installed. Please install it with `pip install fastembed` or `pip install fastembed-gpu`.