In [None]:
# Install required libraries with CUDA support
!pip install -q torch

In [None]:
import torch
# Report whether PyTorch can see a CUDA device, and name device 0 when it can.
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

CUDA available: False


In [None]:
# Check CUDA version first
!nvcc --version

# Install llama-cpp-python with CUDA 12.x support
!pip install --no-cache-dir llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu123

# install remaining libraries
!pip install llama-index
!pip install pymupdf
!pip install llama-index-llms-llama-cpp
!pip install llama-index-embeddings-huggingface

/bin/bash: line 1: nvcc: command not found
Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu123
Collecting llama-cpp-python==0.2.90
  Downloading https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu123/llama_cpp_python-0.2.90-cp311-cp311-linux_x86_64.whl (444.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.5/444.5 MB[0m [31m172.4 MB/s[0m eta [36m0:00:00[0m
Collecting diskcache>=5.6.1 (from llama-cpp-python==0.2.90)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diskcache, llama-cpp-python
Successfully installed diskcache-5.6.3 llama-cpp-python-0.2.90
Collecting llama-index
  Downloading llama_index-0.12.28-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-op

In [None]:
from llama_cpp import Llama
import os

# Download Mistral model if not already present
model_path = "/content/mistral.gguf"
if not os.path.exists(model_path):
    !wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf -O {model_path}
    print(f"Model downloaded to {model_path}")

# Verify file exists and check size
if os.path.exists(model_path):
    print(f"Model file exists. Size: {os.path.getsize(model_path) / (1024 * 1024):.2f} MB")
else:
    print("Model file not found!")


--2025-04-03 18:05:54--  https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 18.172.134.4, 18.172.134.24, 18.172.134.124, ...
Connecting to huggingface.co (huggingface.co)|18.172.134.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1743707154&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MzcwNzE1NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJiN

In [None]:
import fitz  # PyMuPDF

# Define document paths. The keys are placeholders only; the real document
# type is assigned later by the classifier.
doc_paths = {
    "Unknown 1": "/content/LenderFeesWorksheetNew.pdf",
    "Unknown 2": "/content/appraisal_report.pdf",
    "Unknown 3": "/content/payslip_sample_image.pdf",
    "Unknown 4": "/content/sample_bank_statement.pdf",
    "Unknown 5": "/content/sample_contract.pdf",
}

# Extract text from all PDFs, keyed by a neutral ID ("Doc-1", "Doc-2", ...)
doc_texts = {}

for doc_number, path in enumerate(doc_paths.values(), start=1):
    # Open each PDF in a context manager so its file handle is released as
    # soon as the text has been read.
    with fitz.open(path) as doc:
        text = "\n".join(page.get_text() for page in doc)
    doc_texts[f"Doc-{doc_number}"] = text
    print(f"Extracted {len(text.split())} words from {path}.")

Extracted 404 words from /content/LenderFeesWorksheetNew.pdf.
Extracted 6470 words from /content/appraisal_report.pdf.
Extracted 82 words from /content/payslip_sample_image.pdf.
Extracted 287 words from /content/sample_bank_statement.pdf.
Extracted 315 words from /content/sample_contract.pdf.


In [None]:
# Show the document IDs that the extraction step stored in doc_texts
doc_texts.keys()

dict_keys(['Doc-1', 'Doc-2', 'Doc-3', 'Doc-4', 'Doc-5'])

In [None]:
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core import Document

# Load the Mistral model. The same `llm` object is used both for classification
# and for answering questions in route_query.
llm = LlamaCPP(
    model_path="/content/mistral.gguf",
    temperature=0.0,  # Zero temperature for deterministic output
    # Classification replies are short, but route_query reuses this model to
    # write answers; a 30-token cap cut those answers off mid-sentence.
    max_new_tokens=256,
    context_window=4096,  # Room for the sampled excerpts / retrieved chunks plus the reply
)


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /content/mistral.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.

In [None]:
def prepare_document_for_classification(text, excerpt_chars=500):
    """Sample the start, middle and end of a document for classification.

    Args:
        text: Full document text.
        excerpt_chars: Maximum number of characters in each excerpt
            (default 500).

    Returns:
        dict with keys "first_part", "middle_part" and "last_part" (the three
        excerpts) and "total_length" (``len(text)``). For documents shorter
        than ``excerpt_chars`` the excerpts overlap or are identical.

    Raises:
        ValueError: If ``excerpt_chars`` is negative.
    """
    if excerpt_chars < 0:
        raise ValueError("excerpt_chars must be non-negative")

    doc_length = len(text)

    # Slicing clamps to the string bounds, so no explicit min() is needed.
    first_part = text[:excerpt_chars]

    # Centre the middle excerpt on the midpoint of the document.
    middle_start = max(0, doc_length // 2 - excerpt_chars // 2)
    middle_part = text[middle_start:middle_start + excerpt_chars]

    # Explicit start index (rather than text[-n:]) so excerpt_chars == 0
    # yields an empty excerpt instead of the whole text.
    last_start = max(0, doc_length - excerpt_chars)
    last_part = text[last_start:]

    return {
        "first_part": first_part,
        "middle_part": middle_part,
        "last_part": last_part,
        "total_length": doc_length,
    }

In [None]:
import re

# Category labels, in the order they are tried when scanning a free-form reply.
_DOC_CATEGORIES = ["Bank Statement", "Pay Slip", "Appraisal Report", "Unknown", "Lender Fees Worksheet", "Contract"]

# Last-resort keyword hints, checked in order. Every keyword is lowercase
# because it is compared against the lowercased reply.
_DOC_KEYWORD_HINTS = [
    ("Bank Statement", ("bank", "statement")),
    ("Pay Slip", ("pay", "slip", "salary")),
    ("Contract", ("contract",)),
    ("Lender Fees Worksheet", ("worksheet", "lender")),
    ("Appraisal Report", ("appraisal", "property")),
]


def _match_doc_category(raw_response):
    """Map a free-form model reply onto one of the known category labels."""
    # 1. The reply is exactly one of the labels.
    if raw_response in _DOC_CATEGORIES:
        return raw_response

    # 2. A label appears somewhere inside the reply (case-insensitive).
    lowered = raw_response.lower()
    for category in _DOC_CATEGORIES:
        if category.lower() in lowered:
            return category

    # 3. Fall back to single-word hints.
    words = set(re.findall(r'\b\w+\b', lowered))
    for category, hints in _DOC_KEYWORD_HINTS:
        if words.intersection(hints):
            return category

    return "Unknown"


def classify_document(text):
    """Classify a document's text into one of the known categories using the LLM.

    Excerpts produced by ``prepare_document_for_classification`` are placed in
    a prompt, the prompt is sent to the module-level ``llm``, and the reply is
    normalised to a category label.

    Args:
        text: Full document text.

    Returns:
        One of "Bank Statement", "Pay Slip", "Appraisal Report", "Contract",
        "Lender Fees Worksheet" or "Unknown".
    """
    doc_info = prepare_document_for_classification(text)

    prompt = f"""You are a document classification expert. Classify this document into one of these categories:
    - Bank Statement
    - Pay Slip
    - Appraisal Report
    - Unknown
    - Contract
    - Lender Fees Worksheet

    Here's information extracted from the document:

    DOCUMENT START EXCERPT:
    {doc_info['first_part']}
    DOCUMENT START EXCERPT END

    DOCUMENT MIDDLE EXCERPT:
    {doc_info['middle_part']}
    DOCUMENT MIDDLE EXCERPT END

    DOCUMENT END EXCERPT:
    {doc_info['last_part']}
    DOCUMENT END EXCERPT END

    Total document length: {doc_info['total_length']} characters

    IMPORTANT INSTRUCTION: Your response must be EXACTLY ONE of these six options:
    Bank Statement
    Pay Slip
    Appraisal Report
    Contract
    Lender Fees Worksheet
    Unknown

    Do not include any explanation, reasoning, or additional text. Respond with ONLY the category name.
    """

    response = llm.complete(prompt)
    raw_response = response.text.strip()

    # The model does not always answer with the bare label, so normalise it.
    return _match_doc_category(raw_response)

In [None]:
# Run the classifier over every extracted document, keeping text and label together
classified_docs = {}
for doc_id, text in doc_texts.items():
    label = classify_document(text)
    classified_docs[doc_id] = dict(text=text, doc_type=label)
    print(f"{doc_id} classified as: {label}")


llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =   21953.93 ms /   892 tokens (   24.61 ms per token,    40.63 tokens per second)
llama_perf_context_print:        eval time =    1243.42 ms /    21 runs   (   59.21 ms per token,    16.89 tokens per second)
llama_perf_context_print:       total time =   23204.81 ms /   913 tokens
Llama.generate: 81 prefix-match hit, remaining 697 prompt tokens to eval


Doc-1 classified as: Lender Fees Worksheet


llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =   17621.64 ms /   697 tokens (   25.28 ms per token,    39.55 tokens per second)
llama_perf_context_print:        eval time =    1081.89 ms /    19 runs   (   56.94 ms per token,    17.56 tokens per second)
llama_perf_context_print:       total time =   18710.83 ms /   716 tokens
Llama.generate: 81 prefix-match hit, remaining 889 prompt tokens to eval


Doc-2 classified as: Appraisal Report


llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =   22829.36 ms /   889 tokens (   25.68 ms per token,    38.94 tokens per second)
llama_perf_context_print:        eval time =    1225.25 ms /    21 runs   (   58.35 ms per token,    17.14 tokens per second)
llama_perf_context_print:       total time =   24062.25 ms /   910 tokens
Llama.generate: 80 prefix-match hit, remaining 1091 prompt tokens to eval


Doc-3 classified as: Pay Slip


llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =   27768.39 ms /  1091 tokens (   25.45 ms per token,    39.29 tokens per second)
llama_perf_context_print:        eval time =    1024.64 ms /    17 runs   (   60.27 ms per token,    16.59 tokens per second)
llama_perf_context_print:       total time =   28799.41 ms /  1108 tokens
Llama.generate: 80 prefix-match hit, remaining 545 prompt tokens to eval


Doc-4 classified as: Bank Statement


llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =   14063.65 ms /   545 tokens (   25.80 ms per token,    38.75 tokens per second)
llama_perf_context_print:        eval time =     936.50 ms /    16 runs   (   58.53 ms per token,    17.08 tokens per second)
llama_perf_context_print:       total time =   15006.17 ms /   561 tokens


Doc-5 classified as: Contract


In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model handed to the vector indexes built further down
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Show which document IDs have a classification result
classified_docs.keys()

dict_keys(['Doc-1', 'Doc-2', 'Doc-3', 'Doc-4', 'Doc-5'])

In [None]:

# Build one vector index per recognised document type
index_map = {}

for doc_id, data in classified_docs.items():
    doc_type = data["doc_type"]

    # Documents labelled "Unknown" are left out of every index
    if doc_type != "Unknown":
        document = Document(text=data["text"], metadata={"doc_type": doc_type})
        existing_index = index_map.get(doc_type)

        if existing_index is None:
            # First document of this type: create its index
            index_map[doc_type] = VectorStoreIndex.from_documents([document], embed_model=embed_model)
        else:
            # Type already indexed: add the document to that index
            existing_index.insert(document)

        print(f"Indexed {doc_id} as {doc_type}.")


Indexed Doc-1 as Lender Fees Worksheet.
Indexed Doc-2 as Appraisal Report.
Indexed Doc-3 as Pay Slip.
Indexed Doc-4 as Bank Statement.
Indexed Doc-5 as Contract.


In [None]:
# Display the mapping from document type to its vector index
index_map

{'Lender Fees Worksheet': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7bfeae2de050>,
 'Appraisal Report': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7bfeae0727d0>,
 'Pay Slip': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7bfe97ca3b90>,
 'Bank Statement': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7bfeae071c10>,
 'Contract': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7bfe97504690>}

In [None]:

def classify_document(text, confidence_threshold=2):
    """Classify a document by counting category keywords found in its text.

    NOTE: this definition reuses the name of the LLM-based ``classify_document``
    defined earlier in the notebook and replaces it from this cell onwards.

    Each category scores one point per keyword that occurs anywhere in the
    lowercased text (substring match, so "pay" also matches "payment").
    The first category with the highest score wins.

    Args:
        text: Full document text.
        confidence_threshold: Minimum score the winning category must reach
            (default 2).

    Returns:
        The winning category name, or "Unknown" when the text is empty, no
        keyword matches, or the best score is below ``confidence_threshold``.
    """
    if not text:
        return "Unknown"

    keywords = {
        "Bank Statement": ["account", "balance", "deposit", "withdrawal", "transaction"],
        "Pay Slip": ["salary", "pay", "net", "gross", "earnings", "deductions"],
        "Appraisal Report": ["property", "appraisal", "valuation", "market value", "real estate"],
        "Contract": ["agreement", "contract", "clause", "parties", "obligations"],
        "Lender Fees Worksheet": ["worksheet", "fees", "lender", "closing costs", "mortgage"],
    }

    # Lowercase once instead of once per keyword.
    lowered = text.lower()
    category_scores = {
        category: sum(1 for keyword in keyword_list if keyword.lower() in lowered)
        for category, keyword_list in keywords.items()
    }

    # max() returns the first key with the top score, i.e. ties resolve in
    # dictionary order.
    predicted_category = max(category_scores, key=category_scores.get)
    max_score = category_scores[predicted_category]

    # The previous fallback returned an undefined name and raised NameError;
    # an unconfident result is reported as "Unknown" instead.
    if max_score == 0 or max_score < confidence_threshold:
        return "Unknown"
    return predicted_category


In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import CompactAndRefine
import re

# Category labels, in the order they are tried when scanning a free-form reply.
_QUERY_CATEGORIES = ["Bank Statement", "Pay Slip", "Appraisal Report", "Unknown", "Lender Fees Worksheet", "Contract"]

# Last-resort keyword hints, checked in order. Every keyword is lowercase
# because it is compared against the lowercased reply.
_QUERY_KEYWORD_HINTS = [
    ("Bank Statement", ("bank", "statement")),
    ("Pay Slip", ("pay", "slip", "salary")),
    ("Appraisal Report", ("appraisal", "property")),
    ("Contract", ("contract",)),
    ("Lender Fees Worksheet", ("worksheet", "lender")),
]


def _match_query_category(raw_response):
    """Map a free-form model reply onto one of the known category labels."""
    # 1. The reply is exactly one of the labels.
    if raw_response in _QUERY_CATEGORIES:
        return raw_response

    # 2. A label appears somewhere inside the reply (case-insensitive).
    lowered = raw_response.lower()
    for category in _QUERY_CATEGORIES:
        if category.lower() in lowered:
            return category

    # 3. Only when no label was found, fall back to single-word hints.
    words = set(re.findall(r'\b\w+\b', lowered))
    for category, hints in _QUERY_KEYWORD_HINTS:
        if words.intersection(hints):
            return category

    return "Unknown"


def route_query(query):
    """Answer a question from the index of the document type it refers to.

    The module-level ``llm`` first classifies the question into a document
    type; the question is then run against that type's entry in ``index_map``.

    Args:
        query: The user's question.

    Returns:
        A formatted string naming the document type and the answer, or
        "Could not determine document type." when the resolved type has no
        index (including "Unknown").
    """
    # Check which document type the query is related to
    prompt = f"""
    Classify the following question into one of these categories:
    - Bank Statement
    - Pay Slip
    - Appraisal Report
    - Unknown
    - Contract
    - Lender Fees Worksheet

    If it does not match any, respond with 'Unknown'.

    IMPORTANT INSTRUCTION: Your response must be EXACTLY ONE of these six options:
    Bank Statement
    Pay Slip
    Appraisal Report
    Contract
    Lender Fees Worksheet
    Unknown

    Do not include any explanation, reasoning, or additional text. Respond with ONLY the category name.

    Query: {query}
    """

    raw_response = llm.complete(prompt).text.strip()

    # Each matching stage returns as soon as it succeeds, so an exact or
    # substring match is never overridden by the looser keyword hints.
    doc_type = _match_query_category(raw_response)

    if doc_type not in index_map:
        return "Could not determine document type."

    # Retrieve from the correct index
    retriever = index_map[doc_type].as_retriever(similarity_top_k=2)

    # Create a response synthesizer with the Mistral model.
    # NOTE(review): answer length is bounded by the max_new_tokens setting of
    # the shared `llm`; confirm it is large enough for full answers.
    response_synthesizer = CompactAndRefine(
        llm=llm,
        verbose=True
    )

    # Create the query engine with our explicit components
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer
    )

    response = query_engine.query(query)
    return f"📄 **Information from:** {doc_type}\n🔍 **Response:** {response}"

In [None]:

# Interactive loop: answer questions until the user types the sentinel "done"
# (case-insensitive), sends EOF, or interrupts the cell.
while True:
    try:
        question = input('Enter your Question (or "Done" to exit): ').strip()
    except (EOFError, KeyboardInterrupt):
        # Leave the loop without a traceback when input is closed or interrupted
        break
    if question.lower() == 'done':
        break
    if not question:
        continue  # Nothing to route; prompt again
    print(route_query(question))


Enter your Question (or "Done" to exit): Sum all of the deposit atm from the bank statements ?


Llama.generate: 1 prefix-match hit, remaining 160 prompt tokens to eval
llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =    3392.25 ms /   160 tokens (   21.20 ms per token,    47.17 tokens per second)
llama_perf_context_print:        eval time =     318.16 ms /     6 runs   (   53.03 ms per token,    18.86 tokens per second)
llama_perf_context_print:       total time =    3712.89 ms /   166 tokens
Llama.generate: 1 prefix-match hit, remaining 1580 prompt tokens to eval
llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =   37702.71 ms /  1580 tokens (   23.86 ms per token,    41.91 tokens per second)
llama_perf_context_print:        eval time =    1860.07 ms /    29 runs   (   64.14 ms per token,    15.59 tokens per second)
llama_perf_context_print:       total time =   39573.17 ms /  1609 tokens


📄 **Information from:** Bank Statement
🔍 **Response:** 
The total amount of deposits made through an ATM during the statement period is 6,700.39 (2,6
Enter your Question (or "Done" to exit): What is the late payment policy from the contract ?


Llama.generate: 1 prefix-match hit, remaining 159 prompt tokens to eval
llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =    3369.17 ms /   159 tokens (   21.19 ms per token,    47.19 tokens per second)
llama_perf_context_print:        eval time =     266.56 ms /     5 runs   (   53.31 ms per token,    18.76 tokens per second)
llama_perf_context_print:       total time =    3638.12 ms /   164 tokens
Llama.generate: 1 prefix-match hit, remaining 557 prompt tokens to eval
llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =   12850.25 ms /   557 tokens (   23.07 ms per token,    43.35 tokens per second)
llama_perf_context_print:        eval time =     836.48 ms /    15 runs   (   55.77 ms per token,    17.93 tokens per second)
llama_perf_context_print:       total time =   13692.17 ms /   572 tokens


📄 **Information from:** Contract
🔍 **Response:** 1.5% per month from the due date until paid in full.
Enter your Question (or "Done" to exit): How much is the appraisal fee from the lender fee worksheet document ?


Llama.generate: 1 prefix-match hit, remaining 166 prompt tokens to eval
llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =    3873.95 ms /   166 tokens (   23.34 ms per token,    42.85 tokens per second)
llama_perf_context_print:        eval time =     572.03 ms /    10 runs   (   57.20 ms per token,    17.48 tokens per second)
llama_perf_context_print:       total time =    4449.83 ms /   176 tokens
Llama.generate: 1 prefix-match hit, remaining 1141 prompt tokens to eval
llama_perf_context_print:        load time =   21954.37 ms
llama_perf_context_print: prompt eval time =   26779.45 ms /  1141 tokens (   23.47 ms per token,    42.61 tokens per second)
llama_perf_context_print:        eval time =    1271.07 ms /    22 runs   (   57.78 ms per token,    17.31 tokens per second)
llama_perf_context_print:       total time =   28058.22 ms /  1163 tokens


📄 **Information from:** Lender Fees Worksheet
🔍 **Response:**  The appraisal fee from the lender fee worksheet document is $525.00.
Enter your Question (or "Done" to exit): done
