In [8]:
from numpy.ma.core import count

from lib.EmbeddingProvider import OpenAiEmbeddingProvider

from lib.DataRepository import DataRepository
from lib.questions import QuestionExtractor

repo = DataRepository(embedding=OpenAiEmbeddingProvider(), db_path="./data/db/open_ai_small_50_10", chunk_size=50, chunk_overlap=10)
# repo.save_by_file()

# Parce question and find metrics in the question, find similar metrics

In [9]:
from lib.questions import QuestionExtractor

question = "According to the annual report, what is the Operating margin (%) for Altech Chemicals Ltd  (within the last period or at the end of the last period)? If data is not available, return 'N/A'."
kind = "names"

extractor = QuestionExtractor()
extract = extractor.extract(question)
print(extract)
close_metrics = extractor.get_synonyms(extract["metric"])
print(close_metrics)

{'original_question': "According to the annual report, what is the Operating margin (%) for Altech Chemicals Ltd  (within the last period or at the end of the last period)? If data is not available, return 'N/A'.", 'metric': 'Operating margin (%)', 'companies': ['Altech Chemicals Ltd'], 'currency': None, 'comparison': None, 'category': 'fin_metric'}
['operating profit margin', 'operational margin', 'profit margin (operating)', 'percentage operating profit']


In [10]:
import json

companiy = extract["companies"][0]
with open("data/r2.0-test/subset.json", 'r') as file:
    subset = json.load(file)
filered = list(filter(lambda x: x["company_name"] == companiy, subset))[0]
filered

{'sha1': '63688d5d0b4f12e9f847c5407439a1ec46047a4a',
 'cur': 'USD',
 'company_name': 'Altech Chemicals Ltd',
 'major_industry': 'Technology',
 'mentions_recent_mergers_and_acquisitions': True,
 'has_leadership_changes': False,
 'has_layoffs': False,
 'has_executive_compensation': True,
 'has_rnd_investment_numbers': True,
 'has_new_product_launches': True,
 'has_capital_expenditures': True,
 'has_financial_performance_indicators': True,
 'has_dividend_policy_changes': False,
 'has_share_buyback_plans': False,
 'has_capital_structure_changes': False,
 'mentions_new_risk_factors': True,
 'has_guidance_updates': False,
 'has_regulatory_or_litigation_issues': False,
 'has_strategic_restructuring': False,
 'has_supply_chain_disruptions': False,
 'has_esg_initiatives': True}

# Find similar metrics in the data base

In [11]:
main_metric = extract["metric"]
print(main_metric)
file_filter = { "source" : f"./data/r2.0-test/pdfs/{filered["sha1"]}.pdf" }
main_results = repo.query(main_metric, k=10, f=file_filter) # start with main metric from the question

main_metric = extract["metric"]
smaller_results = [] # start with main metric from the question
for m in close_metrics:
    smaller_results += repo.query(m, k=5) # find similar metrics
search_results = main_results + smaller_results
print(len(search_results))

Operating margin (%)
30


In [12]:
pages_candidates = {}
for doc, score in search_results:
    page = doc.metadata["page"]
    if page in pages_candidates:
        pages_candidates[page]["count"] += 1
        pages_candidates[page]["score"].append(score)
    else:
        pages_candidates[page] = {
            "count": 1,
            "score": [score]
        }
pages_candidates_filtered = pages_candidates
for p in pages_candidates_filtered:
    pages_candidates_filtered[p]["score"] = sum(pages_candidates_filtered[p]["score"]) / pages_candidates_filtered[p]["count"]

pages_candidates_filtered = sorted(
    pages_candidates.items(),
    key=lambda x: (-x[1]["count"], x[1]["score"])
)
pages_candidates_filtered[0:8]

[(47, {'count': 5, 'score': 1.062661838531494}),
 (12, {'count': 3, 'score': 0.32247887551784515}),
 (141, {'count': 3, 'score': 0.39292486508687335}),
 (25, {'count': 3, 'score': 0.467891405026118}),
 (2, {'count': 2, 'score': 0.4098067879676819}),
 (99, {'count': 2, 'score': 0.419970765709877}),
 (123, {'count': 2, 'score': 0.6258890330791473}),
 (65, {'count': 2, 'score': 1.069601833820343})]

In [13]:
from typing import Tuple
from langchain_community.document_loaders import PyPDFLoader

document_loader = PyPDFLoader(file_filter["source"])
doc = document_loader.load()

pages = [p for p in doc if p.metadata["page"] in [p for p, _ in pages_candidates_filtered[0:8]]]
for p in pages:
    print(p.metadata["page"])
    p.metadata["id"] = p.metadata["page"]

rag = [(p, 0.0) for p in pages]

2
12
25
47
65


In [14]:
from lib.Agent import OpenAIAgent

agent = OpenAIAgent()
agent.query(question, rag, path="./prompt/names_prompt.txt")

'N/A'