In [73]:
!pip install -U sentence-transformers boto3 opensearch-py requests-aws4auth

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [102]:
from sentence_transformers import SentenceTransformer
import boto3
model = SentenceTransformer('saved_model_miniLM_v2')
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('DocumentTextTable')
response = table.get_item(Key={'DocumentName': 'finreport_q1_2025.pdf'})
text = response['Item']['ExtractedText']
print(text[:500])

Sample Financial Report - Q1 2025
Company: FinNova Technologies Pvt. Ltd.
Report Period: January 1 - March 31, 2025
Prepared By: Finance & Accounts Department
Date: April 5, 2025
Summary Highlights:
Total Revenue: ₹15.8 Crores
Cost of Goods Sold (COGS): ₹6.2 Crores
Operating Expenses: ₹3.4 Crores
Net Profit: ₹6.2 Crores
EBITDA Margin: 39.2%
Segment Performance:
Retail Banking:
Revenue - ₹9.1 Cr I Net Profit - ₹3.9 Cr
FinTech Services:
Revenue - ₹6.7 Cr I Net Profit - ₹2.3 Cr
Additional Notes:
To


In [103]:
embedding_vector = model.encode(text)
print("Embedding vector shape:", embedding_vector.shape)
print("Embedding preview:", embedding_vector[:10])

Embedding vector shape: (384,)
Embedding preview: [-0.05999449 -0.02339429 -0.03599763 -0.00625608 -0.00258177 -0.00882858
  0.01203891  0.10468467 -0.00113416  0.02123569]


In [104]:
model.save('saved_model_miniLM_v2')

In [105]:
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
region = 'us-east-1'
host = 'search-doc-engine-domain-qyqfyw22tidthpfu7cgcattlwq.us-east-1.es.amazonaws.com'
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, 'es', session_token=credentials.token)
client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

In [106]:
index_name = 'documents-index'
index_body = {
    "settings": {
        "index": {
            "knn": True
        }
    },
    "mappings": {
        "properties": {
            "fileName": {"type": "keyword"},
            "text": {"type": "text"},
            "embedding": {
                "type": "knn_vector",
                "dimension": 384
            }
        }
    }
}
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)

client.indices.create(index=index_name, body=index_body)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents-index'}

In [107]:
doc_body = {
    "fileName": "finreport_q1_2025.pdf",
    "text": text,
    "embedding": embedding_vector.tolist()
}
client.index(index=index_name, body=doc_body)

{'_index': 'documents-index',
 '_id': '0aOXjZcBSILdzQarFEhT',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [108]:
from sentence_transformers import util
financial_keywords = [
    "net profit", "revenue", "operating expenses", "income", "loss",
    "EBITDA", "cash flow", "gross profit", "earnings", "balance sheet",
    "statement", "financial year", "Q1", "Q2", "Q3", "Q4"
]

def search_financial_info_best_line(query):
    search_query = {
        "size": 1,
        "query": {
            "match": {
                "text": query
            }
        }
    }
    response = client.search(index=index_name, body=search_query)
    hits = response['hits']['hits']

    if not hits:
        return "No relevant documents found."

    retrieved_text = hits[0]['_source']['text']

    filtered_lines = [
        line.strip() for line in retrieved_text.split('\n')
        if any(keyword.lower() in line.lower() for keyword in financial_keywords)
    ]

    if not filtered_lines:
        return "No matching financial lines found."

    query_embedding = model.encode(query, convert_to_tensor=True)
    line_embeddings = model.encode(filtered_lines, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, line_embeddings)[0]
    best_line_index = scores.argmax().item()
    
    return filtered_lines[best_line_index]

In [112]:
question = "What is the net profit?"
answer = search_financial_info_best_line(question)
print("Most Relevant Line:", answer)

Most Relevant Line: Net Profit: ₹6.2 Crores
