In [1]:
from pathlib import Path
import logging
from datetime import datetime
import os

from paperqa.agents.main import agent_query
from paperqa.settings import Settings, AgentSettings, IndexSettings, AnswerSettings, ParsingSettings, PromptSettings

from src.build_search_index import build_search_index, process_bibtex_and_pdfs, create_manifest_file
from src.query_answer_index import query_answer_index
from src.utils import pretty_print_text

# Set to True to mirror log records to the console as well as to the log file.
print_logging = False

# Every run of this cell writes to its own timestamped file under ./logs.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging_dir = Path(".") / "logs"
logging_dir.mkdir(exist_ok=True)

# Log file path
log_file_path = logging_dir / f"log_{timestamp}.log"

# Handlers that get installed on the root logger
handlers = []

# File handler. The encoding is pinned to UTF-8 so that non-ASCII log text is
# written the same way regardless of the platform's default encoding.
file_handler = logging.FileHandler(log_file_path, encoding="utf-8")
file_handler.setLevel(logging.INFO)  # Set desired file log level
file_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
file_handler.setFormatter(file_formatter)
handlers.append(file_handler)

# Optional console handler
if print_logging:
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)  # Set desired console log level
    console_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    console_handler.setFormatter(console_formatter)
    handlers.append(console_handler)

# Set up the root logger.
# force=True matters in a notebook: without it basicConfig() does nothing when the
# root logger already has handlers (for example after re-running this cell), so the
# freshly created file handler would never be attached. With it, the previously
# installed root handlers are removed and closed before the new ones are added.
logging.basicConfig(level=logging.INFO, handlers=handlers, force=True)

# NOTE: these two values are the ones to configure
export_directory_name = "MS_EXPORT"
project_dir = Path(".")  # may also be set to an absolute project location

# Default layout, derived from the two values above
data_dir = project_dir.joinpath("data")
data_dir.mkdir(exist_ok=True)
paper_directory = data_dir.joinpath(export_directory_name)
index_directory = data_dir.joinpath(export_directory_name + "_index")
bibtex_file = paper_directory.joinpath(export_directory_name + ".bib")
manifest_file = data_dir.joinpath(export_directory_name + "_manifest.csv")
index_name = "pqa_index_" + export_directory_name

# Create the manifest file from the BibTeX export
processed_df = process_bibtex_and_pdfs(
    bibtex_file=bibtex_file,
    paper_directory=paper_directory,
)
create_manifest_file(manifest_df=processed_df, manifest_file=manifest_file)

# paperQA settings
# One model is used for the agent, for answering and for evidence summaries.
# Alternative: "gpt-4o-mini" (smaller and cheaper than the default, gpt-4o).
default_lmm = "deepseek/deepseek-chat"  # see https://docs.litellm.ai/docs/providers/deepseek

index_settings = IndexSettings(
    name=index_name,
    paper_directory=paper_directory,
    manifest_file=manifest_file,
    index_directory=index_directory,
    use_absolute_paper_directory=False,
    recurse_subdirectories=True,
    # Number of concurrent filesystem reads for indexing (probably not
    # important anymore since S2 is no longer called).
    concurrency=1,
)

agent_settings = AgentSettings(
    agent_llm=default_lmm,
    index=index_settings,
    index_concurrency=index_settings.concurrency,
)

answer_settings = AnswerSettings(
    # number of evidence text chunks to retrieve (default=10)
    evidence_k=30,
    # length of evidence summary (default="about 100 words")
    evidence_summary_length="about 200 words",
    # max number of sources to use for answering (default=5)
    answer_max_sources=15,
    # length of final answer (default="about 200 words, but can be longer")
    answer_length="about 400 words, but can be longer",
)

parse_settings = ParsingSettings(chunk_size=5_000, use_doc_details=True, overlap=250)

prompt_settings = PromptSettings()

settings = Settings(
    agent=agent_settings,
    answer=answer_settings,
    parsing=parse_settings,
    prompts=prompt_settings,
    llm=default_lmm,
    summary_llm=default_lmm,
    embedding="text-embedding-3-small",  # default
    temperature=0.0,  # default
    texts_index_mmr_lambda=0.95,  # Lambda MMR (default=1.0)
    # NOTE(review): the five keywords below repeat values already carried by
    # index_settings (attached through agent_settings.index) - presumably
    # redundant; confirm against the installed paperqa version before removing.
    index_absolute_directory=index_settings.use_absolute_paper_directory,
    index_directory=index_settings.index_directory,
    index_recursively=index_settings.recurse_subdirectories,
    manifest_file=index_settings.manifest_file,
    paper_directory=index_settings.paper_directory,
    verbosity=0,  # (0-3), where 3 is all LLM/Embedding calls are logged
)

# Make sure that I am using the default arguments where it matters
def print_non_default_settings(settings_defined, settings_classs, settings_name):
    """Print every attribute of ``settings_defined`` that differs from the default.

    Args:
        settings_defined: Configured settings object; the attributes stored in
            its instance ``__dict__`` are the ones compared.
        settings_classs: Class (or any zero-argument callable) that builds an
            object holding the default values.
        settings_name: Heading printed above the report.

    Returns:
        None. The report is written to stdout: a ``selected:`` line followed by
        a ``-> default:`` line for each differing attribute.
    """
    print(f"------\n{settings_name}")
    # Build the default object once instead of once per attribute.
    defaults = settings_classs()
    for key, value in settings_defined.__dict__.items():
        # Attributes missing on the default object are compared against None.
        default_value = getattr(defaults, key, None)
        if value != default_value:
            print(f"selected: {key}: {value}")
            print(f"-> default: {key}: {default_value}")

# Print non-default settings for each object
#print_non_default_settings(index_settings, IndexSettings, "index_settings")
#print_non_default_settings(agent_settings, AgentSettings, "agent_settings")
#print_non_default_settings(answer_settings, AnswerSettings, "answer_settings")
#print_non_default_settings(parse_settings, ParsingSettings, "parse_settings")
#print_non_default_settings(prompt_settings, PromptSettings, "prompt_settings")
#print_non_default_settings(settings, Settings, "settings")

# Check that the API keys this notebook expects are present in the environment
API_KEYS = ["DEEPSEEK_API_KEY", "OPENAI_API_KEY"]


def report_api_keys(api_keys, environ=None):
    """Print whether each name in ``api_keys`` is set and return the missing names.

    A variable counts as present only when it is set to a non-empty value.

    Args:
        api_keys: Iterable of environment variable names to look up.
        environ: Mapping to look the names up in; defaults to ``os.environ``.

    Returns:
        List of the names that are unset or empty, in the order given.
    """
    if environ is None:
        environ = os.environ
    missing = []
    for name in api_keys:
        if environ.get(name):
            print(f"{name} found")
        else:
            # Previously a missing key produced no output at all.
            print(f"{name} NOT found")
            missing.append(name)
    return missing


missing_api_keys = report_api_keys(API_KEYS)

* 'fields' has been removed


DEEPSEEK_API_KEY found
OPENAI_API_KEY found


In [6]:
# Display the prompt templates in use. prompt_settings is built above as
# PromptSettings() with no arguments.
prompt_settings

PromptSettings(summary='Summarize the excerpt below to help answer a question.\n\nExcerpt from {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\nDo not directly answer the question, instead summarize to give evidence to help answer the question. Stay detailed; report specific numbers, equations, or direct quotes (marked with quotation marks). Reply "Not applicable" if the excerpt is irrelevant. At the end of your response, provide an integer score from 1-10 on a newline indicating relevance to question. Do not explain your score.\n\nRelevant Information Summary ({summary_length}):', qa='Answer the question below with the context.\n\nContext (with relevance scores):\n\n{context}\n\n----\n\nQuestion: {question}\n\nWrite an answer based on the context. If the context provides insufficient information reply "I cannot answer." For each part of your answer, indicate which sources most support it via citation keys at the end of sentences, like {example_citation}. Only cite from 

# Build Search Index

In [2]:
search_index = await build_search_index(settings=settings, bibtex_file=bibtex_file, manifest_file=manifest_file)
assert search_index.index_name == settings.agent.index.name
print(f"Index Name: {search_index.index_name}")
print(f"Number of Indexed Files: {len((await search_index.index_files).keys())}")

Index Name: pqa_index_MS_EXPORT
Number of Indexed Files: 241


# Perform Queries

In [5]:
answer_response = await agent_query(
    query="What is the role of NPPA in heart failure",
    settings=settings, rebuild_index=False
)
pretty_print_text(answer_response.session.formatted_answer)

[92m16:27:27 - LiteLLM Router:INFO[0m: router.py:611 - Routing strategy: simple-shuffle
[92m16:27:27 - LiteLLM:INFO[0m: utils.py:2820 - 
LiteLLM completion() model= deepseek-chat; provider = deepseek
[92m16:27:30 - LiteLLM Router:INFO[0m: router.py:960 - litellm.acompletion(model=deepseek/deepseek-chat)[32m 200 OK[0m
[92m16:27:30 - LiteLLM:INFO[0m: utils.py:2820 - 
LiteLLM completion() model= deepseek-chat; provider = deepseek
[92m16:27:32 - LiteLLM Router:INFO[0m: router.py:960 - litellm.acompletion(model=deepseek/deepseek-chat)[32m 200 OK[0m
[92m16:27:33 - LiteLLM Router:INFO[0m: router.py:611 - Routing strategy: simple-shuffle
[92m16:27:33 - LiteLLM:INFO[0m: utils.py:2820 - 
LiteLLM completion() model= deepseek-chat; provider = deepseek
[92m16:27:33 - LiteLLM:INFO[0m: utils.py:2820 - 
LiteLLM completion() model= deepseek-chat; provider = deepseek
[92m16:27:33 - LiteLLM:INFO[0m: utils.py:2820 - 
LiteLLM completion() model= deepseek-chat; provider = deepseek
[92

Question: What is the role of NPPA in heart failure


I cannot answer. The provided context does not contain any information about
the role of NPPA (Natriuretic Peptide A, also known as atrial natriuretic
peptide or ANP) in heart failure. The excerpts focus exclusively on the role of
Tumor Necrosis Factor-Alpha (TNF-a) in the central nervous system, particularly
in autoimmune disorders, and do not mention NPPA or its involvement in
cardiovascular conditions. For information on NPPA's role in heart failure,
additional sources specifically addressing this topic would be required.




# Query Previous Question & Answers

In [10]:
query_answer_index_results = await query_answer_index(settings=settings, query="CD163")
query_answer_index_results[0]

Number of Indexed Answers: 22
Number of Answers Matching Query: 9


AnswerResponse(session=PQASession(id=UUID('5c1d9ac9-b3fa-48bd-bc62-1c76e21db6b7'), question='What is the gene CD163? And how is CD163 involved in multiple sclerosis?', answer='**CD163** is a gene encoding a scavenger receptor that binds to the hemoglobin-haptoglobin complex, playing a critical role in iron uptake and metabolism within myeloid cells (MCs), particularly in the context of inflammatory and neurodegenerative diseases such as multiple sclerosis (MS) (Hofmann2023 pages 1-2, Hofmann2023 pages 10-12). CD163 is primarily expressed on perivascular macrophages in the healthy brain but is upregulated on resident microglia and infiltrating macrophages under pathological conditions, including chronic active MS lesions, HIV encephalitis, and subarachnoid hemorrhage (Hofmann2023 pages 10-12). This receptor facilitates the internalization of haptoglobin-bound hemoglobin, which is enzymatically broken down by heme oxygenase (HMOX1) to release ferrous iron (Fe2+). The iron is then either 

# Appendix

In [None]:
from paperqa.utils import parse_string, clean_upbibtex, unsrtalpha, CitationConversionError, Person, Parser, FieldIsMissing


def format_bibtex(
    bibtex: str,
    key: str | None = None,
    clean: bool = True,
    missing_replacements: dict[str, str] | None = None,
) -> str:
    """Transform bibtex entry into a citation, potentially adding missing fields.

    Args:
        bibtex: BibTeX source text.
        key: Citation key of the entry to format. It is matched exactly first,
            then as a prefix of a parsed key. When it is omitted, empty, or
            matches nothing, the first parsed entry is used.
        clean: Whether to pass ``bibtex`` through ``clean_upbibtex`` before parsing.
        missing_replacements: Field name -> value, inserted only when the entry
            lacks that field. ``"author"`` values are parsed into persons.

    Returns:
        The rendered citation text. If formatting fails with ``FieldIsMissing``
        or ``UnicodeDecodeError``, the entry's ``title`` field. If the BibTeX
        cannot be parsed (or yields no entries), ``"Ref <key>"`` with the key
        taken from the raw text.

    Raises:
        CitationConversionError: If the text cannot be parsed and contains no
            ``{`` from which to recover a key, or if formatting fails and the
            entry has no ``title`` field.
    """
    style = unsrtalpha.Style()
    if missing_replacements is None:
        missing_replacements = {}
    try:
        bd = parse_string(clean_upbibtex(bibtex) if clean else bibtex, "bibtex")
        first_key = list(bd.entries.keys())[0]
    except Exception as exc:
        # Best-effort label from the raw text: the part of the first
        # comma-separated chunk that follows the opening brace.
        head_parts = bibtex.split(",")[0].replace("{{", "{").split("{")
        if len(head_parts) < 2:
            # No brace at all: report the domain error instead of an IndexError.
            raise CitationConversionError(
                f"Failed to process{' and clean up' if clean else ''} bibtex {bibtex}"
                " due to a missing citation key."
            ) from exc
        return "Ref " + head_parts[1]

    # Default to the first parsed entry; a caller-supplied key takes precedence
    # when it resolves. (The argument used to be overwritten unconditionally.)
    entry = bd.entries[first_key]
    if key and key != first_key:
        try:
            entry = bd.entries[key]
        except KeyError:
            # Accept the key as a prefix of a parsed key, else keep the first entry.
            entry = next(
                (v for k, v in bd.entries.items() if k.startswith(key)), entry
            )
    try:
        # see if we can insert missing fields
        for field, replacement_value in missing_replacements.items():
            # Deal with special case for author, since it needs to be parsed
            # into Person objects. This reorganizes the names automatically.
            if field == "author" and "author" not in entry.persons:
                tmp_author_bibtex = f"@misc{{tmpkey, author={{{replacement_value}}}}}"
                authors: list[Person] = (
                    Parser()
                    .parse_string(tmp_author_bibtex)
                    .entries["tmpkey"]
                    .persons["author"]
                )
                for a in authors:
                    entry.add_person(a, "author")
            elif field not in entry.fields:
                entry.fields.update({field: replacement_value})
        formatted = style.format_entry(label="1", entry=entry)
        return formatted.text.render_as("text")
    except (FieldIsMissing, UnicodeDecodeError):
        # Formatting failed; fall back to the bare title if there is one.
        try:
            return entry.fields["title"]
        except KeyError as exc:
            raise CitationConversionError(
                f"Failed to process{' and clean up' if clean else ''} bibtex {bibtex}"
                " due to missing a 'title' field."
            ) from exc
    
# Inputs for the step-by-step trace further down in this cell: no replacement
# fields, cleaning enabled, and a sample entry whose citation key itself begins
# with a braced group ("{tortosacarreres}2024...").
# NOTE(review): this is a regular (non-raw) string literal, so each \' is stored
# as a plain ' (the backslash is dropped) while each \\ is stored as a single
# backslash. Confirm that this is the exact text the parser should be tested on.
missing_replacements = None
clean = True
bibtex = """@article{{tortosacarreres}2024predictivepotentialof,
    author = "{Tortosa-Carreres}, Jordi and {Cubas-N{\'u}{\\textasciitilde n}ez}, Laura and {Quiroga-Varela}, Ana and {Castillo-Villalba}, Jessica and {Rami{\'o}-Torrenta}, Llu{\'i}s and Piqueras, M{\'o}nica and {Gasqu{\'e}-Rubio}, Raquel and {Quintanilla-Bordas}, Carlos and Sanz, Maria Teresa and Lucas, Celia and {Huertas-Pons}, Joana Mar{\'i}a and Miguela, Albert and Casanova, Bonaventura and {Laiz-Marro}, Bego{\\textasciitilde n}a and {P{\'e}rez-Miralles}, Francisco Carlos",
    title = "Predictive Potential of Serum and Cerebrospinal Fluid Biomarkers for Disease Activity in Treated Multiple Sclerosis Patients",
    year = "2024",
    journal = "Multiple Sclerosis and Related Disorders",
    doi = "10.1016/j.msard.2024.105734",
    url = "https://doi.org/10.1016/j.msard.2024.105734",
    publisher = "Elsevier",
    issn = "2211-0348, 2211-0356"
}
"""


"""Transform bibtex entry into a citation, potentially adding missing fields."""
# Step-by-step trace of the format_bibtex() logic on the `bibtex` sample defined
# earlier in this cell, printing each intermediate value instead of returning it.
style = unsrtalpha.Style()
if missing_replacements is None:
    missing_replacements = {}
try:
    print(bibtex)
    bd = parse_string(clean_upbibtex(bibtex) if clean else bibtex, "bibtex")
    key = list(bd.entries.keys())[0]
except Exception:
    # Parsing failed: show the fallback label format_bibtex() would return.
    key = bibtex.split(",")[0]
    key = key.replace("{{", "{")
    key = key.split("{")[1]
    print("Ref " + key)
else:
    # Only continue when parsing succeeded. Previously execution fell through
    # after a failure and hit `bd.entries[key]` with `bd` undefined (NameError on
    # a fresh kernel) or left over from an earlier run.
    try:
        entry = bd.entries[key]
        print(entry)
    except KeyError as exc:  # Let's check if key is a non-empty prefix
        try:
            entry = next(
                iter(v for k, v in bd.entries.items() if k.startswith(key) and key)
            )
        except StopIteration:
            raise CitationConversionError(
                f"Failed to process{' and clean up' if clean else ''} bibtex {bibtex}"
                f" due to failed lookup of key {key}."
            ) from exc
    try:
        # see if we can insert missing fields
        for field, replacement_value in missing_replacements.items():
            # Deal with special case for author, since it needs to be parsed
            # into Person objects. This reorganizes the names automatically.
            if field == "author" and "author" not in entry.persons:
                tmp_author_bibtex = f"@misc{{tmpkey, author={{{replacement_value}}}}}"
                authors: list[Person] = (
                    Parser()
                    .parse_string(tmp_author_bibtex)
                    .entries["tmpkey"]
                    .persons["author"]
                )
                for a in authors:
                    entry.add_person(a, "author")
            elif field not in entry.fields:
                entry.fields.update({field: replacement_value})
        entry = style.format_entry(label="1", entry=entry)
        print(entry.text.render_as("text"))
    except (FieldIsMissing, UnicodeDecodeError):
        try:
            print(entry.fields["title"])
        except KeyError as exc:
            raise CitationConversionError(
                f"Failed to process{' and clean up' if clean else ''} bibtex {bibtex}"
                " due to missing a 'title' field."
            ) from exc

# Parse once more outside any try block so that a parse error, if there is one,
# surfaces with its full traceback.
bd = parse_string(clean_upbibtex(bibtex) if clean else bibtex, "bibtex")

In [None]:
# Print the sample entry as Python stores it, i.e. after escape processing.
print(bibtex)

In [None]:
# Variant of the sample entry with a brace-free citation key and the author names
# written with literal Unicode characters; parsed directly, without clean_upbibtex.
test = """@article{tortosacarreres2024predictivepotentialof,
  author = "Tortosa-Carreres, Jordi and Cubas-Núñez, Laura and Quiroga-Varela, Ana and Castillo-Villalba, Jessica and Ramió-Torrenta, Lluís and Piqueras, Mónica and Gasqué-Rubio, Raquel and Quintanilla-Bordas, Carlos and Sanz, Maria Teresa and Lucas, Celia and Huertas-Pons, Joana María and Miguela, Albert and Casanova, Bonaventura and Laiz-Marro, Begoña and Pérez-Miralles, Francisco Carlos",
  title = "Predictive Potential of Serum and Cerebrospinal Fluid Biomarkers for Disease Activity in Treated Multiple Sclerosis Patients",
  year = "2024",
  journal = "Multiple Sclerosis and Related Disorders",
  doi = "10.1016/j.msard.2024.105734",
  url = "https://doi.org/10.1016/j.msard.2024.105734",
  publisher = "Elsevier",
  issn = "2211-0348, 2211-0356"
}"""
parse_string(test, bib_format="bibtex")

In [None]:
# Reduced variant: a single placeholder author and a single ISSN, with the same
# brace-free key - presumably to narrow down which field affects parsing.
# Note that this rebinds `test`, replacing the variant from the previous cell.
test = """@article{tortosacarreres2024predictivepotentialof,
    author = "foo, bar",
    title = "Predictive Potential of Serum and Cerebrospinal Fluid Biomarkers for Disease Activity in Treated Multiple Sclerosis Patients",
    year = "2024",
    journal = "Multiple Sclerosis and Related Disorders",
    doi = "10.1016/j.msard.2024.105734",
    url = "https://doi.org/10.1016/j.msard.2024.105734",
    publisher = "Elsevier",
    issn = "2211-0348"
}"""
parse_string(test, bib_format="bibtex")

In [None]:
# NOTE(review): neither DocDetails nor doc_details_dict is imported or assigned in
# any cell visible in this notebook, so this raises NameError on a fresh kernel.
# TODO: add the import and the dict definition, or delete the cell.
DocDetails(**doc_details_dict)

In [63]:
import pybtex

# The assignment was left unfinished (`bib_file = ` with no right-hand side),
# which is a SyntaxError. It now points at the BibTeX export configured at the top
# of the notebook so the cell runs. TODO confirm this is the intended file.
bib_file = bibtex_file

In [None]:
# Display `entry`, which is assigned only by the step-by-step trace in the first
# Appendix cell - that cell has to run (and parse successfully) before this one.
entry

In [None]:
# Parse the current `test` sample with cleaning switched off and show its first
# citation key. `clean` stays False for any cell run after this one.
clean = False
bibtex_source = clean_upbibtex(test) if clean else test
bd = parse_string(bibtex_source, "bibtex")
parsed_keys = list(bd.entries.keys())
parsed_keys[0]

In [None]:
# Re-parse the original `bibtex` sample and display the parsed database.
# This depends on whatever value `clean` currently holds: the first Appendix cell
# sets it to True, the cell just above sets it to False.
bd = parse_string(clean_upbibtex(bibtex) if clean else bibtex, "bibtex")
bd