In [1]:
import os
from dotenv import load_dotenv
# import openai
import chromadb
from sentence_transformers import SentenceTransformer  # For embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Load environment variables from .env file
load_dotenv(override=True)

# Retrieve the API key
cohere_key = os.getenv("COHERE_API_KEY")

In [16]:
import cohere
import qdrant_client
from qdrant_client import QdrantClient
from qdrant_client.models import Batch

# Initialize clients
cohere_client = cohere.ClientV2(cohere_key)
# qdrant_client = qdrant_client.QdrantClient(host="localhost", port=6333)

In [34]:
qdrant_client = QdrantClient(host="host.docker.internal", port=6333)

In [35]:
import os
from typing import List
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor

def process_pdf_to_text(file_path: str, num_threads: int = 4) -> str:
    """
    Processes a PDF file and extracts its content as a single text string.

    Args:
        file_path: The path to the PDF file.
        num_threads: The number of threads to use for parallel page processing.

    Returns:
        The extracted text as a single string with no newlines. Returns an empty string on error.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is not a PDF.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if not file_path.lower().endswith(".pdf"):
        raise ValueError(f"File is not a PDF: {file_path}")

    try:
        with open(file_path, 'rb') as file:
            reader = PdfReader(file)
            num_pages = len(reader.pages)

            # Use ThreadPoolExecutor to process pages in parallel
            with ThreadPoolExecutor(max_workers=num_threads) as executor:
                page_texts = list(executor.map(extract_page_text, [(reader, i) for i in range(num_pages)]))

            return " ".join(page_texts)  # Join all pages into a single string without newlines

    except Exception as e:
        print(f"Error processing PDF {file_path}: {e}")  # More specific error
        return ""

def extract_page_text(args):
    """Helper function for parallel PDF page extraction."""
    reader, page_num = args
    try:
        page = reader.pages[page_num]
        return " ".join(page.extract_text().split())  # Remove newlines and extra spaces
    except Exception as e:
        print(f"Error processing page {page_num}: {e}")  # Error handling
        return ""

In [36]:
extracted_text = process_pdf_to_text("./legal.pdf")
print(f"Extracted Text:\n{extracted_text}")

Extracted Text:
Additional Application Information If we request additional information to process your application, including but not limited to income and additional identification documentation, please ensure that it is submitted to the Leasing Office within 3 calendar days of the request. If documentation is not received within 3 days, the application will be canceled. Please note that processing of your application may take up to 10 days. Hold Deposit The hold deposit will be applied to the balance due at move- in. If written cancellation is received within 3 days from the date of application or if we cannot approve your application, a refund of the hold deposit payment will be mailed within 20 business days. Required documents must be received within 3 days of the request, or your application will be canceled, and the hold deposit will be forfeited. Security Deposit Requirement The Refundable Security Deposit amount disclosed here is subject to change, pending final credit review

In [37]:
def customize_chunking(text, chunk_size=150):
    list_of_chunks = []
    chunk = ""
    for i in range(0, len(text), chunk_size):
        chunk += text[i:i+chunk_size] + "\n"
        list_of_chunks.append(chunk)
    return list_of_chunks

In [38]:
list_chunks = customize_chunking(extracted_text)
print(f"Number of chunks: {len(list_chunks)}")

Number of chunks: 33


In [63]:
len(list_chunks[0])

151

In [64]:
print(list_chunks[0])

Additional Application Information If we request additional information to process your application, including but not limited to income and additiona



In [39]:
# Generating the embeddings with Cohere client library
embeddings = cohere_client.embed(
    texts=list_chunks,
    model="embed-english-light-v3.0",
    input_type="search_document",
    embedding_types=["float"]
)

In [180]:
type(embeddings)

cohere.types.embed_by_type_response.EmbedByTypeResponse

In [40]:
def query_chunking(query: List[str]):
    response = cohere_client.embed(
        texts=query,
        model="embed-english-light-v3.0",
        input_type="search_query",
        embedding_types=["float"]
    )
    return response # ["results"][0]["text"]

In [41]:
query = ['How do I do online payments?']
query_chunks = query_chunking(query)
query_embeddings = query_chunks.embeddings.float

In [42]:
# Extracting the embeddings
embedding_floats = embeddings.embeddings.float

https://docs.cohere.com/v2/reference/embed

### QDrant

In [57]:
qdrant_key = os.getenv("QDRANT_API_KEY")
qdrant_host = os.getenv("QDRANT_HOST")

In [60]:
qdrant_client = QdrantClient(
    url=qdrant_host,  # Your Qdrant Cloud URL
    # prefer_grpc=True,  # Use gRPC for better performance
    api_key=qdrant_key # Required for Qdrant Cloud authentication
)

# Test the connection by listing collections
# print(qdrant_client.get_collections())

In [66]:
qdrant_client.delete_collection("new-collection")
# print("Collection 'daves-rag' deleted successfully.")

False

In [54]:
from qdrant_client.models import VectorParams, Distance
from qdrant_client import models

In [26]:
type(embedding_floats), type(embedding_floats[0]), type(embedding_floats[0][0])

(list, list, float)

In [None]:
qdrant_client.create_collection(
    collection_name="daves-rag",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),  # Ensure 384 is correct
)

In [223]:
qdrant_client.upsert(
    collection_name="daves-rag",
    points=models.Batch(
        ids=[i for i in range(len(embedding_floats))],  # Unique IDs for each embedding
        payloads=[{"text": f"Document {i}"} for i in range(len(embedding_floats))],  # Optional metadata
        vectors=embedding_floats,  # Ensure embeddings are a List[List[float]]
    ),
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

https://qdrant.tech/documentation/frameworks/langchain/#


https://qdrant.tech/documentation/guides/installation/

In [None]:
collection_info = qdrant_client.get_collection("new-collection")
print(collection_info)

In [None]:
collection_info = qdrant_client.get_collection("daves-rag")
print(collection_info)

In [None]:
?qdrant_client.upsert

#### Without Manual input

In [151]:
# from qdrant_client import QdrantClient

# Initialize client
# qdrant_client = QdrantClient(host="localhost", port=6333)  # Use 'url=' if using Qdrant Cloud

# Define query embedding (Must match the vector size of the collection)

# Perform vector search
top_results = qdrant_client.search(
    collection_name="daves-rag",  # Your collection name
    query_vector=query_embeddings[0],
    limit=5  # Return the top 5 most similar documents
)

# Print results
for result in top_results:
    print(f"Document ID: {result.id}, Score: {result.score}, Payload: {result.payload}")

  top_results = qdrant_client.search(


Document ID: 19, Score: 0.2872586250305176, Payload: None
Document ID: 20, Score: 0.2680310010910034, Payload: None
Document ID: 21, Score: 0.2604231834411621, Payload: None
Document ID: 18, Score: 0.2585766315460205, Payload: None
Document ID: 17, Score: 0.25060632824897766, Payload: None


In [9]:
import numpy as np

In [None]:
qdrant_client.scroll(collection_name="daves-rag", limit=5)

In [205]:
print(f"First vector shape: {len(embedding_floats[0])}")  # Should be 384
print(f"First vector: {embedding_floats[0][:5]}")  # Print first 5 values


First vector shape: 384
First vector: [0.01524353, -0.007095337, -0.024765015, -0.019088745, 0.0496521]


In [206]:
stored_vectors, _ = qdrant_client.scroll(collection_name="daves-rag", limit=5)

for vec in stored_vectors:
    if vec.vector is None:
        print(f"🚨 Missing vector for ID {vec.id}")
    else:
        print(f"✅ ID: {vec.id}, Vector: {vec.vector[:5]}")  # Print first 5 values


🚨 Missing vector for ID 0
🚨 Missing vector for ID 1
🚨 Missing vector for ID 2
🚨 Missing vector for ID 3
🚨 Missing vector for ID 4


In [207]:
qdrant_client = QdrantClient(host="host.docker.internal", port=6333)

In [208]:
!docker ps

CONTAINER ID   IMAGE           COMMAND             CREATED        STATUS             PORTS                              NAMES
0b78c463ca42   qdrant/qdrant   "./entrypoint.sh"   18 hours ago   Up About an hour   0.0.0.0:6333-6334->6333-6334/tcp   gifted_goldstine


In [187]:
type(embedding_floats[0])

list

In [18]:
retrieved_points = qdrant_client.scroll(collection_name="daves-rag")

In [197]:
for point in retrieved_points:
    # print(f"ID: {point.id}, Vector: {point.vector}")
    print(point)

[]
None


In [198]:
print(type(embedding_floats))  # Should be <class 'list'>
print(len(embedding_floats))   # Should match the number of vectors
print(type(embedding_floats[0]))  # Should be <class 'list'>
print(len(embedding_floats[0]))  # Should match vector dimension (e.g., 384)


<class 'list'>
50
<class 'list'>
384


In [28]:
stored_vectors, _ = qdrant_client.scroll(
    collection_name="daves-rag",
    with_vectors=True,
    with_payload=True,
    limit=5
) 

for vec in stored_vectors:
    print(f"ID: {vec.id}, Vector: {vec.vector}, Text: {vec.payload}")

ID: 0, Vector: [0.03929654508829117, 0.0678536593914032, -0.08530523627996445, -0.060866933315992355, 0.09488528966903687, -0.03932705521583557, -0.11184870451688766, 0.04735111817717552, 0.08280343562364578, -0.021936504170298576, -0.047961313277482986, -0.02282128483057022, -0.020700858905911446, -0.017192238941788673, -0.06529084593057632, 0.05836513265967369, -0.018595686182379723, -0.03636761009693146, -0.019556744024157524, 0.018260080367326736, 0.086769700050354, 0.051836047321558, 0.001554089947603643, -0.07767780125141144, -0.034750595688819885, 0.006742652505636215, 0.0021680984646081924, 0.10098724067211151, 0.05635149031877518, 0.08323056995868683, -0.07432172447443008, 0.04640531539916992, 0.10952996462583542, 0.023873871192336082, 0.09403102099895477, -0.048968132585287094, -0.040303368121385574, 0.03780156746506691, 0.04509339481592178, 0.027199434116482735, -0.022668737918138504, 0.0065062022767961025, 0.026512963697314262, -0.03594047576189041, 0.01969403773546219, -0.

In [30]:
def retrieve_top_chunks(query:str, collection_name, chunks, n=5):
    # Fetch all stored points
    stored_points = qdrant_client.scroll(collection_name="daves-rag", with_vectors=True, limit=1000)[0]
    query_chunks = query_chunking([query])
    query_embeddings = query_chunks.embeddings.float
    
    # Extract embeddings & IDs
    chunk_embeddings = [point.vector for point in stored_points]
    stored_ids = [point.id for point in stored_points]
    
    def cosine_similarity(a, b):
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    # --- Compute Similarity Scores ---
    similarities = []
    for chunk_embedding in chunk_embeddings:
        subquery_scores = [cosine_similarity(query_embedding, chunk_embedding) for query_embedding in query_embeddings]
        similarities.append(np.mean(subquery_scores))  # Average similarity if multiple subqueries

    print("Similarity scores:", similarities)

    # --- Retrieve Top `n` Chunks ---
    top_indices = np.argsort(similarities)[::-1][:n]  # Sort and get top `n`

    # Retrieve top similar document chunks
    top_chunks_after_retrieval = [chunks[i] for i in top_indices]

    return top_chunks_after_retrieval

In [11]:
import numpy as np
import pandas as pd

In [22]:
def get_llm_output(top_chunks, ch, query):
    preamble = """
    ## Task & Context
    You give answers to user's questions with precision, based on chunked document string you receive.
    You should focus on serving the user's needs as best you can, which can be wide-ranging but always relevant to the document string.
    If you are not sure about the answer, you can ask for clarification or provide a general response saying you are not sure.
    
    ## Style Guide
    Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.
    """

    # retrieved documents
    documents = [
        {"data": {"title": f"chunk {i}", "snippet": top_chunks[i]}} for i in range(len(top_chunks))
    ]

    # get model response
    response = ch.chat(
        model="command-r-08-2024",
        messages=[{"role": "system", "content": preamble},
                  {"role": "user", "content": query}],
        documents=documents,  
        temperature=0.3
    )

    print("Final answer:")
    print(response.message.content[0].text)


In [39]:
query="How do I pay online rent and what are the terms for subsequent months?"
ch = cohere.ClientV2(cohere_key)

top_chunks = retrieve_top_chunks(query=query, 
                                 collection_name="daves-rag", 
                                 chunks=list_chunks,
                                 n=5)
print(get_llm_output(top_chunks, ch, query))


Similarity scores: [np.float64(0.2835472041505702), np.float64(0.28769699382419356), np.float64(0.31855678036821256), np.float64(0.3158629267113816), np.float64(0.3331100340810262), np.float64(0.3596958581090524), np.float64(0.3634299015467861), np.float64(0.3558121061295941), np.float64(0.36527115485174816), np.float64(0.36527115485174816), np.float64(0.36527115485174816), np.float64(0.36527115485174816), np.float64(0.36527115485174816), np.float64(0.36527115485174816), np.float64(0.36527115485174816), np.float64(0.36527115485174816), np.float64(0.36527115485174816)]
Final answer:
You can pay your rent online using an E-Check without incurring any additional fees. Credit card payments, on the other hand, will incur a 2.95% fee. Other payment types, such as money orders, are also accepted without any additional fees.

If your move-in date is between the 25th and 31st, the following month's rent will be due at move-in.
None


https://cloud.qdrant.io/accounts/5afd6788-b042-400e-82cd-baa221f504f0/clusters/b2e7691c-8588-485a-938b-cc899e360d62/overview

https://qdrant.tech/documentation/embeddings/cohere/

https://qdrant.tech/articles/qa-with-cohere-and-qdrant/

https://medium.com/@sanket.ai/building-a-streamlit-application-for-interactive-questioning-with-pdf-using-openai-and-langchain-dc82a0d8d68a



In [118]:
type(embeddings)

list

In [122]:
from langchain.schema import Document

# Create a list of Document objects
documents = [Document(page_content=chunk) for chunk in list_chunks]


In [129]:
?QdrantVectorStore.from_documents

[0;31mSignature:[0m
[0mQdrantVectorStore[0m[0;34m.[0m[0mfrom_documents[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdocuments[0m[0;34m:[0m [0;34m'list[Document]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0membedding[0m[0;34m:[0m [0;34m'Embeddings'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m:[0m [0;34m'Any'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'VST'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return VectorStore initialized from documents and embeddings.

Args:
    documents: List of Documents to add to the vectorstore.
    embedding: Embedding function to use.
    kwargs: Additional keyword arguments.

Returns:
    VectorStore: VectorStore initialized from documents and embeddings.
[0;31mFile:[0m      ~/Desktop/DS_ML/ml_course/capstone/ragenv/lib/python3.12/site-packages/langchain_core/vectorstores/base.py
[0;31mType:[0m      method