In [1]:
import logging
import os
from datetime import datetime
from pathlib import Path

from paperqa.agents.main import agent_query
from paperqa.settings import Settings, AgentSettings, IndexSettings, AnswerSettings, ParsingSettings, PromptSettings

from src.build_search_index import manifest_from_bibtex, build_search_index
from src.query_answer_index import query_answer_index
from src.utils import pretty_print_text

# Configure logging: WARNING and above on the console, INFO and above in a
# timestamped file under ./logs.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# The root logger has to let INFO records through; with the root at WARNING they
# are dropped before the file handler ever sees them. The level is also set
# explicitly because basicConfig() does nothing when the root logger already has
# handlers (for example when this cell is run a second time).
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# Keep the console as quiet as before by restricting plain stream handlers to
# WARNING. The exact type check leaves file handlers and any other handler
# subclasses untouched.
for handler in root_logger.handlers:
    if type(handler) is logging.StreamHandler:
        handler.setLevel(logging.WARNING)

# Generate a timestamp for the log file name
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging_dir = Path(".") / "logs"
logging_dir.mkdir(exist_ok=True)
file_handler = logging.FileHandler(str(logging_dir / f"log_{timestamp}.log"))  # log file name
file_handler.setLevel(logging.INFO)  # desired log level for the file
formatter = logging.Formatter(LOG_FORMAT)
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)

# configure paths
# The project root defaults to the original local checkout, but can be redirected
# by setting the MS_EXPERT_PROJECT_DIR environment variable so the notebook is not
# tied to one machine's directory layout. An unset or empty variable falls back
# to the default.
export_directory_name = "TEST_EXPORT"
default_project_dir = Path("/") / "Users" / "pschafer" / "Projects" / "MS_expert"
project_dir = Path(os.environ.get("MS_EXPERT_PROJECT_DIR") or default_project_dir)

# Everything else is derived from the project root and the export name.
paper_directory = project_dir / export_directory_name  # exported papers
index_directory = project_dir / f"{export_directory_name}_index"  # index location
bibtex_file = paper_directory / f"{export_directory_name}.bib"  # BibTeX file inside the export
manifest_file = project_dir / f"{export_directory_name}_manifest.csv"  # manifest CSV path
index_name = f"pqa_index_{export_directory_name}"

# create manifest file from bibtex
manifest_from_bibtex(bibtex_file=bibtex_file,
                     paper_directory=paper_directory,
                     manifest_file=manifest_file)

# set paperQA settings
# Assembled bottom-up: index -> agent -> answer -> parsing/prompts -> top-level Settings.

# One model is used for the agent, the answer and the summaries:
# smaller than default (bc cheaper)
CHEAP_LLM = "gpt-4o-mini"

# Where papers, manifest and index live, and how the directory is scanned.
index_settings = IndexSettings(
    name=index_name,
    paper_directory=paper_directory,
    manifest_file=manifest_file,
    index_directory=index_directory,
    use_absolute_paper_directory=False,
    recurse_subdirectories=True,
    # number of concurrent filesystem reads for indexing
    # (probably not important anymore since I avoid calling S2)
    concurrency=1,
)

# The agent reuses the index settings and mirrors their concurrency value.
agent_settings = AgentSettings(
    agent_llm=CHEAP_LLM,
    index=index_settings,
    index_concurrency=index_settings.concurrency,
)

# Evidence retrieval and answer shape; the noted defaults are the author's notes.
answer_settings = AnswerSettings(
    # number of evidence text chunks to retrieve (default=10)
    evidence_k=10,
    # length of evidence summary (default="about 100 words")
    evidence_summary_length="about 100 words",
    # max number of sources to use for answering (default=5)
    answer_max_sources=5,
    # length of final answer (default="about 200 words, but can be longer")
    answer_length="about 200 words, but can be longer",
)

# Parsing and prompts are constructed without arguments.
parse_settings = ParsingSettings()
prompt_settings = PromptSettings()

# NOTE(review): the index_* / manifest_file / paper_directory arguments below
# repeat values already held by index_settings - confirm whether the installed
# paperqa version still needs them at this level.
settings = Settings(
    agent=agent_settings,
    answer=answer_settings,
    parsing=parse_settings,
    prompts=prompt_settings,
    llm=CHEAP_LLM,
    summary_llm=CHEAP_LLM,
    # default
    embedding="text-embedding-3-small",
    # default
    temperature=0.0,
    # Lambda MMR (default)
    texts_index_mmr_lambda=1.0,
    index_absolute_directory=index_settings.use_absolute_paper_directory,
    index_directory=index_settings.index_directory,
    index_recursively=index_settings.recurse_subdirectories,
    manifest_file=index_settings.manifest_file,
    paper_directory=index_settings.paper_directory,
    # (0-3), where 3 is all LLM/Embedding calls are logged
    verbosity=0,
)

# Make sure that I am using the default arguments where it matters
def print_non_default_settings(settings_defined, settings_classs, settings_name):
    """Print every attribute of ``settings_defined`` that differs from the class default.

    Args:
        settings_defined: Settings instance to inspect; its attributes are read
            from ``__dict__``.
        settings_classs: Class that is called with no arguments to obtain the
            default values to compare against.
        settings_name: Heading printed above the list of differences.

    Returns:
        None. For each differing attribute, the selected value and the default
        value are printed on two consecutive lines.
    """
    print(f"------\n{settings_name}")
    # Build the default instance once instead of once per attribute.
    default_settings = settings_classs()
    for key, value in settings_defined.__dict__.items():
        # Attributes missing on the default instance are compared against None.
        default_value = getattr(default_settings, key, None)
        if value != default_value:
            print(f"selected: {key}: {value}")
            print(f"-> default: {key}: {default_value}")

# Print non-default settings for each object
#print_non_default_settings(index_settings, IndexSettings, "index_settings")
#print_non_default_settings(agent_settings, AgentSettings, "agent_settings")
#print_non_default_settings(answer_settings, AnswerSettings, "answer_settings")
#print_non_default_settings(parse_settings, ParsingSettings, "parse_settings")
#print_non_default_settings(prompt_settings, PromptSettings, "prompt_settings")
#print_non_default_settings(settings, Settings, "settings")

In [2]:
# Display the resolved manifest path as a quick sanity check.
manifest_file

PosixPath('/Users/pschafer/Projects/MS_expert/TEST_EXPORT_manifest.csv')

# Build Search Index

In [3]:
search_index = await build_search_index(settings=settings, bibtex_file=bibtex_file)
assert search_index.index_name == settings.agent.index.name
print(f"Index Name: {search_index.index_name}")
print(f"Number of Indexed Files: {len((await search_index.index_files).keys())}")

Index Name: pqa_index_TEST_EXPORT
Number of Indexed Files: 10


# Perform Queries

In [4]:
answer_response = await agent_query(
    query="What do you know about the perivascular niche/space in multiple sclerosis", 
    settings=settings, rebuild_index=False
)
pretty_print_text(answer_response.session.formatted_answer)

Question: What do you know about the perivascular niche/space in multiple
sclerosis


The perivascular niche in multiple sclerosis (MS) is a critical area for
understanding the disease's pathology, particularly regarding immune cell
infiltration and its consequences. This niche is where immune cells, including
CD4+ and CD8+ T cells, B cells, and myeloid cells, infiltrate the central
nervous system (CNS) during relapses, primarily around post-capillary venules
of the blood-brain barrier (BBB) (Filippi2021 pages 8-9). The migration of
these immune cells across the BBB is essential for CNS inflammation, with
cytokines such as IL-1beta facilitating their access (Filippi2021 pages 24-24).


The perivascular space is also implicated in the injury of oligodendrocytes,
leading to demyelination and neuro-axonal damage through both direct cell
contact and the secretion of soluble factors (Filippi2021 pages 8-9).
Furthermore, it plays a role in remyelination processes, as it is a critical
area fo

# Query Previous Questions & Answers

In [5]:
query_answer_index_results = await query_answer_index(settings=settings, query="role of perivascular niche in MS")

Number of Indexed Answers: 10
Number of Answers Matching Query: 10


# Appendix