In [52]:
import os

import PyPDF2
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings

# Notebook-level handle to the vector store; stays None until the index
# below has been built, which is what the query cell checks for.
vector_db = None

# List of available embedding models
embedding_models = [
    "all-MiniLM-L6-v2",
    "paraphrase-MiniLM-L6-v2",
    "msmarco-distilbert-base-tas-b",
    "paraphrase-xlm-r-multilingual-v1",
    "multi-qa-mpnet-base-dot-v1",
    "stsb-roberta-base-v2",
    "nli-roberta-base-v2",
    "nli-mpnet-base-v2",
]

# Tunables for this cell, gathered in one place.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
COLLECTION_NAME = "rag-collection"
# NOTE(review): 7500-character chunks are probably much longer than the input
# window of the embedding model above, in which case only the start of each
# chunk influences its vector -- confirm against the model card and consider
# a smaller chunk size.
CHUNK_SIZE = 7500
CHUNK_OVERLAP = 100

filepath = "./uploads/Receipt_Notice.pdf"

# Raw text of every page, extracted with PyPDF2. It is not consumed by the
# indexing steps below, which load the PDF through UnstructuredPDFLoader.
with open(filepath, "rb") as file:
    reader = PyPDF2.PdfReader(file)
    # `or ""` keeps the join safe if a page yields no text.
    text = "".join(page.extract_text() or "" for page in reader.pages)

# open() above has already failed for a missing or empty path, so no further
# guard on `filepath` is needed before loading.
loader = UnstructuredPDFLoader(file_path=filepath)
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)
chunks = text_splitter.split_documents(data)

embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Make the cell idempotent: drop whatever an earlier run of this cell left in
# the collection before rebuilding it. Rebinding `vector_db` alone does not
# empty the collection, so re-running would otherwise index the same chunks
# again and similarity_search could return duplicate hits.
Chroma(
    collection_name=COLLECTION_NAME, embedding_function=embeddings
).delete_collection()

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
)

print("-------------done-------------")

-------------done-------------


In [3]:
!pip install -r requirements.txt


Collecting blinker==1.8.2 (from -r requirements.txt (line 11))
  Using cached blinker-1.8.2-py3-none-any.whl.metadata (1.6 kB)
Collecting Flask==3.0.3 (from -r requirements.txt (line 38))
  Using cached flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting itsdangerous==2.2.0 (from -r requirements.txt (line 61))
  Using cached itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting pydantic==2.7.4 (from -r requirements.txt (line 130))
  Using cached pydantic-2.7.4-py3-none-any.whl.metadata (109 kB)
Collecting PyPDF2==3.0.1 (from -r requirements.txt (line 136))
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting tokenizers==0.19.1 (from -r requirements.txt (line 177))
  Using cached tokenizers-0.19.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting transformers==4.42.3 (from -r requirements.txt (line 181))
  Using cached transformers-4.42.3-py3-none-any.whl.metadata (43 kB)
Collecting Werkzeug==3.0.3 (from -r requirements.txt (line 196))
  Using c

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
farm-haystack 1.26.2 requires pydantic<2, but you have pydantic 2.7.4 which is incompatible.
farm-haystack 1.26.2 requires transformers==4.39.3, but you have transformers 4.42.3 which is incompatible.


In [55]:

# NOTE(review): this looks like a typo for "Prasanna" (the applicant name in
# the indexed notice) -- confirm before relying on the search results.
query_text = "[rasanna]"
print("vecdb", vector_db)

# Bind `results` up front: the cells below read it unconditionally and would
# raise NameError if the guard fails and the search never runs.
results = []

# Query the vector database
if vector_db is not None and query_text:
    # Fetch the three chunks closest to the query.
    results = vector_db.similarity_search(query_text, k=3)
    print(results, "res")

    response = str(results) if results else "No relevant results found."
else:
    response = "Vector database not initialized or query is empty."





vecdb <langchain_community.vectorstores.chroma.Chroma object at 0x0000026E5451D9A0>
[Document(page_content='Receipt Number IOE9877747501 Received Date 06/15/2024\n\nPriority Date\n\nCase Type I765 - APPLICATION FOR EMPLOYMENT AUTHORIZATION\n\nApplicant VADDKKEPURAKKAL, PRASANNA RAJENDRA\n\nNotice Date 06/15/2024\n\nPage 1 of 2\n\nVADDKKEPURAKKAL, PRASANNA RAJENDRA c/o PRASANNA RAJENDRA VADDKKEPURAKKAL 420 NORD AVE APT 11 CHICO CA 95926-4770\n\nNotice Type: Receipt Notice Amount received: $470.00 U.S. Class requested: C03B\n\nWe have received the application or petition ("your case") listed above. This notice only shows that your case was filed on the "Received Date" listed above. It does NOT grant you any immigration status or immigration benefit, and it is not evidence that your case is still pending. We will notify you in writing when we make a decision on your case or if we need additional information.\n\nPlease save this and any other notices about your case for your records. You s

In [57]:
# Number of documents returned by the similarity search.
len(results)

3

In [58]:
# Serialized view of the first hit; its text sits under ["kwargs"]["page_content"].
results[0].to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'page_content': 'Receipt Number IOE9877747501 Received Date 06/15/2024\n\nPriority Date\n\nCase Type I765 - APPLICATION FOR EMPLOYMENT AUTHORIZATION\n\nApplicant VADDKKEPURAKKAL, PRASANNA RAJENDRA\n\nNotice Date 06/15/2024\n\nPage 1 of 2\n\nVADDKKEPURAKKAL, PRASANNA RAJENDRA c/o PRASANNA RAJENDRA VADDKKEPURAKKAL 420 NORD AVE APT 11 CHICO CA 95926-4770\n\nNotice Type: Receipt Notice Amount received: $470.00 U.S. Class requested: C03B\n\nWe have received the application or petition ("your case") listed above. This notice only shows that your case was filed on the "Received Date" listed above. It does NOT grant you any immigration status or immigration benefit, and it is not evidence that your case is still pending. We will notify you in writing when we make a decision on your case or if we need additional information.\n\nPlease save this and any other notices about your case for your rec

In [59]:
# Plain text of each hit. Read the attribute directly: serialising every
# Document with to_json() just to index back into ["kwargs"]["page_content"]
# builds a throwaway dict per result.
text_res = [doc.page_content for doc in results]