In [1]:
from pathlib import Path
import logging
import os

from paperqa.agents.main import agent_query
from paperqa.settings import Settings, AgentSettings, IndexSettings, AnswerSettings, ParsingSettings, PromptSettings

from src.build_search_index import process_bibtex_and_pdfs, create_manifest_file, build_search_index
from src.query_answer_index import query_answer_index
from src.utils import pretty_print_text

# Logging setup: emit warnings and above, each prefixed with timestamp and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.WARNING,
)

# NOTE: user-configurable inputs -- adjust these two values for a different export
project_dir = Path(".")
export_directory_name = "TEST_EXPORT"

# Derived locations: everything lives under <project_dir>/data and is named
# after the export directory.
data_dir = project_dir.joinpath("data")
data_dir.mkdir(exist_ok=True)  # no-op when the directory already exists

paper_directory = data_dir.joinpath(export_directory_name)
bibtex_file = paper_directory.joinpath(export_directory_name + ".bib")
index_directory = data_dir.joinpath(export_directory_name + "_index")
manifest_file = data_dir.joinpath(export_directory_name + "_manifest.csv")
index_name = "pqa_index_" + export_directory_name

# Build the manifest: process the BibTeX export together with the paper
# directory, then hand the result to the manifest writer.
processed_df = process_bibtex_and_pdfs(
    bibtex_file=bibtex_file,
    paper_directory=paper_directory,
)
create_manifest_file(
    manifest_df=processed_df,
    manifest_file=manifest_file,
)

# ---- paperQA settings ----
# One LiteLLM model id is shared by the agent, answer and summary LLMs; it was
# chosen because it is smaller/cheaper than the library default.
# See https://docs.litellm.ai/docs/providers/deepseek
# NOTE(review): "lmm" is presumably a typo for "llm"; the name is kept because
# other cells may reference it.
default_lmm = "deepseek/deepseek-chat"

index_settings = IndexSettings(
    name=index_name,
    paper_directory=paper_directory,
    manifest_file=manifest_file,
    index_directory=index_directory,
    use_absolute_paper_directory=False,
    recurse_subdirectories=True,
    # number of concurrent filesystem reads for indexing
    # (probably not important anymore since calling S2 is avoided)
    concurrency=1,
)

# NOTE(review): index_concurrency simply mirrors index_settings.concurrency --
# confirm whether the installed paperqa version still needs it set separately.
agent_settings = AgentSettings(
    agent_llm=default_lmm,
    index=index_settings,
    index_concurrency=index_settings.concurrency,
)

answer_settings = AnswerSettings(
    # number of evidence text chunks to retrieve (default=10)
    evidence_k=10,
    # length of each evidence summary (default="about 100 words")
    evidence_summary_length="about 100 words",
    # max number of sources used for the answer (default=5)
    answer_max_sources=5,
    # length of the final answer (default="about 200 words, but can be longer")
    answer_length="about 200 words, but can be longer",
)

# Parsing and prompts are left entirely at their defaults.
prompt_settings = PromptSettings()
parse_settings = ParsingSettings()

settings = Settings(
    # nested settings objects built above
    agent=agent_settings,
    answer=answer_settings,
    parsing=parse_settings,
    prompts=prompt_settings,
    # models: the LLMs use the cheaper shared model, the embedding is the default
    llm=default_lmm,
    summary_llm=default_lmm,
    embedding="text-embedding-3-small",
    temperature=0.0,  # default
    texts_index_mmr_lambda=1.0,  # Lambda MMR (default)
    # NOTE(review): the five fields below repeat values already stored on
    # index_settings -- confirm whether this duplication is still required.
    index_absolute_directory=index_settings.use_absolute_paper_directory,
    index_directory=index_settings.index_directory,
    index_recursively=index_settings.recurse_subdirectories,
    manifest_file=index_settings.manifest_file,
    paper_directory=index_settings.paper_directory,
    # (0-3), where 3 means all LLM/Embedding calls are logged
    verbosity=0,
)

# Debug helper (currently disabled): prints every setting whose value differs from its
# class default, to make sure the default arguments are used where it matters.
#def print_non_default_settings(settings_defined, settings_classs, settings_name):
#    print(f"------\n{settings_name}")
#    for key, value in settings_defined.__dict__.items():
#        default_value = getattr(settings_classs(), key, None)
#        if value != default_value:
#            print(f"selected: {key}: {value}")
#            print(f"-> default: {key}: {default_value}")

# Print non-default settings for each object
#print_non_default_settings(index_settings, IndexSettings, "index_settings")
#print_non_default_settings(agent_settings, AgentSettings, "agent_settings")
#print_non_default_settings(answer_settings, AnswerSettings, "answer_settings")
#print_non_default_settings(parse_settings, ParsingSettings, "parse_settings")
#print_non_default_settings(prompt_settings, PromptSettings, "prompt_settings")
#print_non_default_settings(settings, Settings, "settings")

# Check that every required API key is present in the environment.
# A key that is unset or empty is reported with a warning instead of being
# skipped silently, so a missing credential is visible before any query runs.
API_KEYS = ["DEEPSEEK_API_KEY", "OPENAI_API_KEY"]
missing_api_keys = []  # names (never values) of the keys that were not found
for api_key in API_KEYS:
    # Only presence is tested; the secret value is deliberately not bound to a
    # notebook variable, so it cannot be displayed or saved by accident.
    if os.getenv(api_key):
        print(f"{api_key} found")
    else:
        missing_api_keys.append(api_key)
        logging.warning("%s is not set or is empty", api_key)

* 'fields' has been removed


------
index_settings
selected: name: pqa_index_TEST_EXPORT
-> default: name: None
selected: paper_directory: data/TEST_EXPORT
-> default: paper_directory: /Users/pschafer/Projects/MS_expert
selected: manifest_file: data/TEST_EXPORT_manifest.csv
-> default: manifest_file: None
selected: index_directory: data/TEST_EXPORT_index
-> default: index_directory: /Users/pschafer/.pqa/indexes
selected: concurrency: 1
-> default: concurrency: 5
------
agent_settings
selected: agent_llm: deepseek/deepseek-chat
-> default: agent_llm: gpt-4o-2024-11-20
selected: index_concurrency: 1
-> default: index_concurrency: 5
selected: index: name='pqa_index_TEST_EXPORT' paper_directory=PosixPath('data/TEST_EXPORT') manifest_file=PosixPath('data/TEST_EXPORT_manifest.csv') index_directory=PosixPath('data/TEST_EXPORT_index') use_absolute_paper_directory=False recurse_subdirectories=True concurrency=1 sync_with_paper_directory=True
-> default: index: name=None paper_directory=PosixPath('/Users/pschafer/Projects/M

# Build Search Index

In [3]:
search_index = await build_search_index(settings=settings, bibtex_file=bibtex_file, manifest_file=manifest_file)
assert search_index.index_name == settings.agent.index.name
print(f"Index Name: {search_index.index_name}")
print(f"Number of Indexed Files: {len((await search_index.index_files).keys())}")

Index Name: pqa_index_TEST_EXPORT
Number of Indexed Files: 10


# Perform Queries

In [4]:
answer_response = await agent_query(
    query="What is the role of the perivascular niche in multiple sclerosis?", 
    settings=settings, rebuild_index=False
)
pretty_print_text(answer_response.session.formatted_answer)

Question: What is the role of the perivascular niche in multiple sclerosis?


The perivascular niche plays a critical role in the pathogenesis of multiple
sclerosis (MS) by facilitating immune cell trafficking and inflammation in the
central nervous system (CNS). Immune cells, including T cells (CD4+ and CD8+),
B cells, and monocytes, migrate across the blood-brain barrier (BBB) at
post-capillary venules, entering the CNS parenchyma and contributing to
perivascular lesion formation (Filippi2021 pages 6-7, Filippi2021 pages 8-9).
This migration is mediated by molecules such as junctional adhesion
molecule-like (JAML) and MUC18, which enable immune cells to cross the BBB
(Filippi2021 pages 6-7). Additionally, immune cells may access the CNS through
the subarachnoid space and the blood-CSF barrier (Filippi2021 pages 8-9).


Once in the CNS, these immune cells, along with activated microglia and
astrocytes, drive oligodendrocyte injury, demyelination, and neuro-axonal
damage through cell c

# Query Previous Questions & Answers

In [5]:
query_answer_index_results = await query_answer_index(settings=settings, query="role of perivascular niche in MS")

Number of Indexed Answers: 14
Number of Answers Matching Query: 10
