In [1]:
from ExperimentPipelineV2 import ExperimentPipelineV2

# Build the pipeline with neither an LLM nor a repository attached;
# later cells assign `dummy.repo` and `dummy.llm` once they are needed.
dummy = ExperimentPipelineV2(name="dummy", llm=None, repo=None)

# Question preprocessing.

In [2]:
# Reference questions used to exercise the pipeline. Every case carries the
# question text, the page recorded as holding the correct answer, and the sha1
# of the report it refers to.
# NOTE(review): the case names come from the original notes ("small chunk
# correct", "big chunk correct", "Both") — presumably they say which chunk size
# retrieves the right page; confirm.
QUESTION_CASES = {
    "small_chunk_correct": {
        "question": "For SThree plc, what was the value of End-of-year total headcount at the end of the period listed in annual report? If data is not available, return 'N/A'.",
        "correct_answer_page": 3,
        "sha1": "67185fca2a09b3c46ee961b2c1ae160dab8b5231",
    },
    "big_chunk_correct": {
        "question": "Which leadership positions changed at Crombie REIT in the reporting period? If data is not available, return 'N/A'. Give me the title of the position.",
        "correct_answer_page": 3,
        "sha1": "14fa568899745270c4ff2c10073f97f2c2e7764b",
    },
    "both_correct": {
        "question": "What was the Capital expenditures (in USD) for Structural Monitoring Systems Plc according to the annual report (within the last period or at the end of the last period)? If data is not available, return 'N/A'.",
        "correct_answer_page": 38,
        "sha1": "3e5ccdb58faf901e75e31f154cb8330869ca5efa",
    },
}

# Switch cases by changing this key instead of commenting blocks in and out.
ACTIVE_CASE = "small_chunk_correct"

question = QUESTION_CASES[ACTIVE_CASE]["question"]
correct_answer_page = QUESTION_CASES[ACTIVE_CASE]["correct_answer_page"]
sha1 = QUESTION_CASES[ACTIVE_CASE]["sha1"]

In [3]:
# Run the extraction step for every question the pipeline knows about, then
# keep the single record whose original question matches `question`.
all_extracts = [dummy.extract(q) for q in dummy.questions]
extracts = next(
    (item for item in all_extracts if item['original_question'] == question),
    None,
)
if extracts is None:
    # Fail with a readable message rather than a bare IndexError from `[0]`.
    raise LookupError(f"No extract found for question: {question!r}")
extracts

{'original_question': "For SThree plc, what was the value of End-of-year total headcount at the end of the period listed in annual report? If data is not available, return 'N/A'.",
 'metric': 'End-of-year total headcount',
 'companies': ['SThree plc'],
 'currency': None,
 'comparison': None,
 'category': 'industry_metric',
 'type': 'number',
 'sha1': '67185fca2a09b3c46ee961b2c1ae160dab8b5231'}

In [4]:
# Load every synonym entry, then keep the one whose metric matches the metric
# extracted from the question.
all_synonyms = dummy.read_synonyms()
synonyms_lookup = next(
    (entry for entry in all_synonyms if entry['metric'] == extracts['metric']),
    None,
)
if synonyms_lookup is None:
    # Fail with a readable message rather than a bare IndexError from `[0]`.
    raise LookupError(f"No synonyms found for metric: {extracts['metric']!r}")
synonyms_lookup

{'metric': 'End-of-year total headcount',
 'synonyms': [{'text': 'Year-end employee count', 'score': 0.95},
  {'text': 'Total staff at year-end', 'score': 0.92},
  {'text': 'Year-end workforce size', 'score': 0.9},
  {'text': 'End-of-year personnel count', 'score': 0.88},
  {'text': 'Year-end headcount total', 'score': 0.85}]}

In [5]:
from lib.DataRepository import DataRepository
from lib.EmbeddingProvider import WatsonEmbeddingProvider, OpenAiEmbeddingProvider

# Repository backed by the "100_10" store.
# NOTE(review): presumably 100/10 are chunk size and overlap — confirm.
repoSmall = DataRepository(
    name="watson_ai_large_100_10_filtered",
    db_path="./data/db/watson_ai_large_100_10_filtered",
    path="./data/r2.0/pdfs",
    embedding=WatsonEmbeddingProvider(),
)

# Attach the repository to the pipeline before searching.
dummy.repo = repoSmall

search = dummy.search_database(
    synonyms_lookup,
    extracts,
    main=10,
    side=5,
)
smallCandidates = dummy.filter_candidates(search, size=10)
smallCandidates

[(119, {'count': 7, 'score': 0.4317506636892046}),
 (69, {'count': 3, 'score': 0.415793498357137}),
 (85, {'count': 3, 'score': 0.4354545772075653}),
 (3, {'count': 3, 'score': 0.44261035323143005}),
 (83, {'count': 3, 'score': 0.4451092878977458}),
 (125, {'count': 2, 'score': 0.3707282245159149}),
 (97, {'count': 2, 'score': 0.4507458955049515}),
 (57, {'count': 2, 'score': 0.46351566910743713}),
 (87, {'count': 1, 'score': 0.41389596462249756}),
 (39, {'count': 1, 'score': 0.41389599442481995})]

In [6]:
# Repository backed by the "1000_100" store.
# NOTE(review): presumably 1000/100 are chunk size and overlap — confirm.
repoBig = DataRepository(
    name="watson_ai_large_1000_100_filtered",
    db_path="./data/db/watson_ai_large_1000_100_filtered",
    path="./data/r2.0/pdfs",
    embedding=WatsonEmbeddingProvider(),
)

# Re-point the pipeline at this repository, then repeat the same search.
dummy.repo = repoBig

search = dummy.search_database(
    synonyms_lookup,
    extracts,
    main=10,
    side=5,
)
bigCandidates = dummy.filter_candidates(search, size=10)
bigCandidates

[(105, {'count': 5, 'score': 0.5703718423843384}),
 (85, {'count': 5, 'score': 0.5749886989593506}),
 (106, {'count': 4, 'score': 0.576771929860115}),
 (97, {'count': 3, 'score': 0.5534367163976034}),
 (83, {'count': 2, 'score': 0.5704290866851807}),
 (104, {'count': 2, 'score': 0.5865854322910309}),
 (87, {'count': 2, 'score': 0.5889977216720581}),
 (81, {'count': 2, 'score': 0.6009736657142639}),
 (86, {'count': 2, 'score': 0.6045263409614563}),
 (96, {'count': 1, 'score': 0.5649453401565552})]

In [7]:
# Concatenate the candidate lists from both repositories and let the pipeline
# merge entries that refer to the same page.
combinedCandidates = [*smallCandidates, *bigCandidates]
mergedCandidates = dummy.merge_data(combinedCandidates)
mergedCandidates

[(85, {'count': 8, 'score': 0.5226634033024311}),
 (119, {'count': 7, 'score': 0.4317506636892046}),
 (83, {'count': 5, 'score': 0.4952372074127197}),
 (97, {'count': 5, 'score': 0.5123603880405426}),
 (105, {'count': 5, 'score': 0.5703718423843384}),
 (106, {'count': 4, 'score': 0.576771929860115}),
 (69, {'count': 3, 'score': 0.415793498357137}),
 (3, {'count': 3, 'score': 0.44261035323143005}),
 (87, {'count': 3, 'score': 0.5306304693222046}),
 (125, {'count': 2, 'score': 0.3707282245159149}),
 (57, {'count': 2, 'score': 0.46351566910743713}),
 (104, {'count': 2, 'score': 0.5865854322910309}),
 (81, {'count': 2, 'score': 0.6009736657142639}),
 (86, {'count': 2, 'score': 0.6045263409614563}),
 (39, {'count': 1, 'score': 0.41389599442481995}),
 (96, {'count': 1, 'score': 0.5649453401565552})]

# Extracting relevant documents.

In [8]:
# Read the report identified by `sha1` for every merged candidate page.
documents = dummy.read_pdf(sha1, mergedCandidates)

# One document is expected per candidate; report both counts on mismatch.
assert len(documents) == len(mergedCandidates), (
    f"read_pdf returned {len(documents)} documents "
    f"for {len(mergedCandidates)} candidate pages"
)

# Debug toggle: uncomment to restrict the run to the first document only.
# documents = documents[0:1]
documents

[(Document(metadata={'producer': 'Adobe PDF Library 16.0.7', 'creator': 'Adobe InDesign 17.3 (Macintosh)', 'creationdate': '2023-03-02T12:01:58+00:00', 'moddate': '2023-03-03T08:40:21+00:00', 'trapped': '/False', 'source': './data/r2.0/pdfs/67185fca2a09b3c46ee961b2c1ae160dab8b5231.pdf', 'total_pages': 129, 'page': 3, 'page_label': '4', 'id': 3, 'sha1': '67185fca2a09b3c46ee961b2c1ae160dab8b5231'}, page_content='Overview\nSThree at a glance\nProviding STEM \ntalent wherever and \nwhenever it’s needed\nEvery sector in every region faces the similar challenges of climate \nchange, decarbonisation, digitalisation and increased demand for \nhealth and pharmaceutical innovation. They find solutions through \nSTEM-skilled expertise.\nEmpower our people\nOur Group employs over 3,000 \npeople worldwide. We build \nan open and inclusive culture \nand offer a flexible working \nenvironment. Our business is full \nof proactive, forward-looking \npeople who bring energy and fun \nto the workplace. T

In [9]:
from lib.Agent import IBMWatsonAgent

dummy.llm = IBMWatsonAgent(model="deepseek/deepseek-r1-distill-llama-70b")

# Only the first MAX_RELEVANCE_DOCS entries of `documents` are scored.
# NOTE(review): in the recorded run this scored pages 3..96 only, while the
# merged candidates also list pages 97, 104, 105, 106, 119 and 125 — confirm
# that cutting those off is intentional.
MAX_RELEVANCE_DOCS = 10

# The instruction is the same for every page, so it is built once.
text = f"Evaluate the context for its relevance to the question: '{question}'."

# Collects one {"page", "relevance"} record per scored document; the relevance
# value is stored exactly as the LLM returned it.
relevance_holder = []
for doc_entry in documents[:MAX_RELEVANCE_DOCS]:
    page_doc = doc_entry[0]
    print(text, page_doc.metadata['page'])
    # Tag the document with the company name before it is sent to the LLM.
    page_doc.metadata['company'] = extracts['companies'][0]
    print(page_doc.metadata['company'])
    relevance_answer = dummy.llm.query(
        text=text,
        data=[doc_entry],
        path="./prompt/relevance_prompt.txt",
        system="You are competent financial analytic.")
    print(relevance_answer)
    relevance_holder.append({ "page": page_doc.metadata['page'], "relevance": relevance_answer })

Evaluate the context for its relevance to the question: 'For SThree plc, what was the value of End-of-year total headcount at the end of the period listed in annual report? If data is not available, return 'N/A'.'. 3
SThree plc
0.90
Evaluate the context for its relevance to the question: 'For SThree plc, what was the value of End-of-year total headcount at the end of the period listed in annual report? If data is not available, return 'N/A'.'. 39
SThree plc
0.75
Evaluate the context for its relevance to the question: 'For SThree plc, what was the value of End-of-year total headcount at the end of the period listed in annual report? If data is not available, return 'N/A'.'. 57
SThree plc
0.0
Evaluate the context for its relevance to the question: 'For SThree plc, what was the value of End-of-year total headcount at the end of the period listed in annual report? If data is not available, return 'N/A'.'. 69
SThree plc
0.0
Evaluate the context for its relevance to the question: 'For SThree

In [10]:
import re

print(relevance_holder)

# First convert every raw record with `dummy.mapper`, then keep only the
# records `dummy.filter_function` accepts. In the recorded run this turned the
# string scores into floats and dropped the pages scored 0.0.
mapped_scores = [dummy.mapper(record) for record in relevance_holder]
filtered_scores = [score for score in mapped_scores if dummy.filter_function(score)]
filtered_scores

[{'page': 3, 'relevance': '0.90'}, {'page': 39, 'relevance': '0.75'}, {'page': 57, 'relevance': '0.0'}, {'page': 69, 'relevance': '0.0'}, {'page': 81, 'relevance': '0.0'}, {'page': 83, 'relevance': '0.0'}, {'page': 85, 'relevance': '0.0'}, {'page': 86, 'relevance': '0.0'}, {'page': 87, 'relevance': '0.0'}, {'page': 96, 'relevance': '0.0'}]


[{'page': 3, 'relevance': 0.9}, {'page': 39, 'relevance': 0.75}]

In [11]:
dummy.llm = IBMWatsonAgent(model="deepseek/deepseek-r1-distill-llama-70b")

# Ask the question once per relevant page so each page's answer can be
# inspected on its own.
for scored_page in filtered_scores:
    print(scored_page)
    # A single (page, 0) pair is passed; the second slot is a fixed 0 here.
    page_markdown = dummy.read_markdown(sha1, [(scored_page['page'], 0)])
    answer = dummy.llm.query(
        text=question,
        data=page_markdown,
        path="./prompt/number_prompt.txt",
        system="You are competent financial analytic.")
    print(answer)

{'page': 3, 'relevance': 0.9}
3119
{'page': 39, 'relevance': 0.75}
2364


In [12]:
# Reduce every score record to a (page, relevance) tuple.
filtered_pages = [(entry['page'], entry['relevance']) for entry in filtered_scores]
filtered_pages

[(3, 0.9), (39, 0.75)]

In [13]:
# Read all relevant pages together and ask the question once over the
# combined context.
markdown_pages = dummy.read_markdown(sha1, filtered_pages)
answer = dummy.llm.query(
    text=question,
    data=markdown_pages,
    path="./prompt/number_prompt.txt",
    system="You are competent financial analytic.",
)
print(answer)

3119
