In [None]:
from pathlib import Path
import logging
from datetime import datetime

from paperqa.agents.main import agent_query
from paperqa.settings import Settings, AgentSettings, IndexSettings, AnswerSettings, ParsingSettings, PromptSettings

from src.build_search_index import build_search_index, process_bibtex_and_pdfs, create_manifest_file
from src.query_answer_index import query_answer_index
from src.utils import pretty_print_text

# Configure logging: basicConfig for the root logger plus a per-session log file.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
# Generate a timestamp for the log file name
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging_dir = Path(".") / "logs"
logging_dir.mkdir(exist_ok=True)

# Re-running this cell would otherwise stack one more FileHandler on the root
# logger each time (every record then lands in several files and the old file
# handles stay open), so detach and close handlers left over from earlier runs.
root_logger = logging.getLogger()
for stale_handler in list(root_logger.handlers):
    if (
        isinstance(stale_handler, logging.FileHandler)
        and Path(stale_handler.baseFilename).parent == logging_dir.absolute()
    ):
        root_logger.removeHandler(stale_handler)
        stale_handler.close()

file_handler = logging.FileHandler(str(logging_dir / f"log_{timestamp}.log"))  # log file name
file_handler.setLevel(logging.INFO)  # the file only records INFO and above
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)

# NOTE: these are the paths that should be configured
export_directory_name = "MS_EXPORT"
project_dir = Path(".")
#project_dir = Path("/") / "Users" / "pschafer" / "Projects" / "MS_expert"

# default paths (all derived from project_dir and export_directory_name)
data_dir = project_dir / "data"
# parents=True: project_dir may be reconfigured to a directory that does not exist yet
data_dir.mkdir(parents=True, exist_ok=True)
paper_directory = data_dir / export_directory_name
index_directory = data_dir / f"{export_directory_name}_index"
bibtex_file = paper_directory / f"{export_directory_name}.bib"
manifest_file = data_dir / f"{export_directory_name}_manifest.csv"
index_name = f"pqa_index_{export_directory_name}"

# create manifest file from bibtex
processed_df = process_bibtex_and_pdfs(bibtex_file=bibtex_file, paper_directory=paper_directory)
create_manifest_file(manifest_df=processed_df, manifest_file=manifest_file)

#manifest_from_bibtex(bibtex_file=bibtex_file, 
#                     paper_directory=paper_directory, 
#                     manifest_file=manifest_file)

# paperQA configuration: index location, agent, answer, parsing and prompt settings
index_settings = IndexSettings(
    name=index_name,
    paper_directory=paper_directory,
    manifest_file=manifest_file,
    index_directory=index_directory,
    use_absolute_paper_directory=False,
    recurse_subdirectories=True,
    # number of concurrent filesystem reads for indexing
    # (probably not important anymore since I avoid calling S2)
    concurrency=1,
)

agent_settings = AgentSettings(
    agent_llm="gpt-4o-mini",  # smaller than default (bc cheaper)
    index=index_settings,
    index_concurrency=index_settings.concurrency,
)

answer_settings = AnswerSettings(
    # number of evidence text chunks to retrieve (default=10)
    evidence_k=10,
    # length of evidence summary (default="about 100 words")
    evidence_summary_length="about 100 words",
    # max number of sources to use for answering (default=5)
    answer_max_sources=5,
    # length of final answer (default="about 200 words, but can be longer")
    answer_length="about 200 words, but can be longer",
)

# Parsing and prompts are left at the library defaults.
parse_settings = ParsingSettings()
prompt_settings = PromptSettings()

settings = Settings(
    agent=agent_settings,
    answer=answer_settings,
    parsing=parse_settings,
    prompts=prompt_settings,
    llm="gpt-4o-mini",  # smaller than default (bc cheaper)
    summary_llm="gpt-4o-mini",  # smaller than default (bc cheaper)
    embedding="text-embedding-3-small",  # default
    temperature=0.0,  # default
    texts_index_mmr_lambda=1.0,  # Lambda MMR (default)
    # The top-level index/paper options below mirror index_settings.
    index_absolute_directory=index_settings.use_absolute_paper_directory,
    index_directory=index_settings.index_directory,
    index_recursively=index_settings.recurse_subdirectories,
    manifest_file=index_settings.manifest_file,
    paper_directory=index_settings.paper_directory,
    verbosity=0,  # (0-3), where 3 is all LLM/Embedding calls are logged
)

# Make sure that I am using the default arguments where it matters
def print_non_default_settings(settings_defined, settings_classs, settings_name):
    """Print every attribute of a settings object that differs from the class default.

    Args:
        settings_defined: The configured settings instance to inspect; its
            ``__dict__`` entries are compared one by one.
        settings_classs: Class called with no arguments to obtain the defaults.
        settings_name: Label printed as a header above the differences.

    Returns:
        None. For each differing attribute two lines are printed: the selected
        value and the default value. Attributes missing on the default instance
        are compared against ``None``.
    """
    print(f"------\n{settings_name}")
    # Build the reference instance once; constructing it per attribute was wasted work.
    default_settings = settings_classs()
    for key, value in settings_defined.__dict__.items():
        default_value = getattr(default_settings, key, None)
        if value != default_value:
            print(f"selected: {key}: {value}")
            print(f"-> default: {key}: {default_value}")

# Print non-default settings for each object
#print_non_default_settings(index_settings, IndexSettings, "index_settings")
#print_non_default_settings(agent_settings, AgentSettings, "agent_settings")
#print_non_default_settings(answer_settings, AnswerSettings, "answer_settings")
#print_non_default_settings(parse_settings, ParsingSettings, "parse_settings")
#print_non_default_settings(prompt_settings, PromptSettings, "prompt_settings")
#print_non_default_settings(settings, Settings, "settings")

# Build Search Index

In [None]:
search_index = await build_search_index(settings=settings, bibtex_file=bibtex_file, manifest_file=manifest_file)
assert search_index.index_name == settings.agent.index.name
print(f"Index Name: {search_index.index_name}")
print(f"Number of Indexed Files: {len((await search_index.index_files).keys())}")

In [3]:

import os
import anyio
import re
from pathlib import Path
from datetime import datetime
import logging
import warnings

import numpy as np
import pandas as pd
import bibtexparser
import pymupdf

from paperqa.settings import Settings
from paperqa.agents.search import SearchIndex
from paperqa.utils import maybe_is_text, md5sum
from paperqa.docs import Docs, Doc, DocDetails, read_doc        

In [None]:
# Reload the manifest CSV from disk (presumably the file written by
# create_manifest_file() in the setup cell) and display it for inspection.
manifest_df = pd.read_csv(manifest_file)
manifest_df

In [None]:
# Locate the manifest row(s) of one paper by a case-sensitive title substring
# and show the stored file location of the first hit.
title_fragment = "Predictive Potential of Serum and Cerebrospinal Fluid Biomarkers"
# na=False: rows with a missing title count as "no match" instead of raising
# TypeError, which the previous `fragment in title` check did on NaN.
title_mask = manifest_df["title"].str.contains(title_fragment, regex=False, na=False)
subset = manifest_df.loc[title_mask, :]
# Fail with a readable message rather than a bare IndexError when nothing matches.
assert not subset.empty, f"No manifest entry with a title containing {title_fragment!r}"
subset["file_location"].iloc[0]

In [20]:
# Parse the BibTeX export and index its entries by the attached PDF path.
# Explicit encoding so the result does not depend on the platform default
# (NOTE(review): assumes the export is UTF-8 — confirm for your exporter).
with open(bibtex_file, "r", encoding="utf-8") as bibfile:
    bib_database = bibtexparser.load(bibfile)
bib_list = bib_database.entries
# NOTE: We only use PDF for now!
# Entries without a `file` field, or whose attachments contain no ".pdf",
# are skipped instead of aborting the whole cell with KeyError/AttributeError.
bib_dict = {}
for entry in bib_list:
    pdf_match = re.search(r'[^;]*\.pdf', entry.get("file", ""))
    if pdf_match is None:
        continue
    bib_dict[pdf_match.group(0)] = entry

In [None]:

import os
import anyio
import re
from pathlib import Path
from datetime import datetime
import logging
import warnings

import numpy as np
import pandas as pd
import bibtexparser
import pymupdf

from paperqa.settings import Settings
from paperqa.agents.search import SearchIndex
from paperqa.utils import maybe_is_text, md5sum
from paperqa.docs import Docs, Doc, DocDetails, read_doc

def _format_authors(bibtex_authors):
    # Split authors by "and"
    authors = [author.strip() for author in bibtex_authors.split(" and ")]
    
    # Reformat each author from "Last, First" to "First Last"
    formatted_authors = []
    for author in authors:
        if ',' in author:
            last, first = map(str.strip, author.split(',', 1))
            formatted_authors.append(f"{first} {last}")
        else:
            formatted_authors.append(author)  # In case no comma, keep as is
    
    return formatted_authors

# Manual, step-by-step walk-through of adding ONE document: read the PDF, ask the
# LLM for a citation, derive a docname, then build DocDetails from the BibTeX entry.
# Path of the PDF relative to the paper directory; also used as the key into bib_dict.
rel_file_path = "files/27191/Tortosa-Carreres et al. - 2024 - Predictive potential of serum and cerebrospinal fluid biomarkers for disease activity in treated mul.pdf"

docs_for_single_doc = Docs() # otherwise creating Docs object for single doc confuses me

abs_file_path = settings.paper_directory / rel_file_path

# Look up the BibTeX entry for this PDF (raises KeyError if the path is not in bib_dict).
bib_dict_doc = bib_dict[str(rel_file_path)]
# NOTE: this adds an "authors" list to the entry stored in bib_dict (in-place mutation).
if "author" in bib_dict_doc.keys():
    bib_dict_doc["authors"] = _format_authors(bib_dict_doc["author"])

parse_config = settings.parsing
# The document key is the md5 checksum computed from the file path's target.
dockey = md5sum(abs_file_path)

llm_model = settings.get_llm()

# Read the document into text chunks using the configured chunking parameters.
texts = read_doc(
        abs_file_path,
        Doc(docname="", citation="", dockey=dockey),  # Fake doc
        chunk_chars=parse_config.chunk_size,
        overlap=parse_config.overlap,
        page_size_limit=parse_config.page_size_limit,
    )

if not texts:
    raise ValueError(f"Could not read document {abs_file_path}. Is it empty?")

# Ask the LLM to produce a citation from the first text chunk only.
result = await llm_model.run_prompt(
    prompt=parse_config.citation_prompt,
    data={"text": texts[0].text},
    system_prompt=None,  # skip system because it's too hesitant to answer
)
citation = result.text

# Replace unusable LLM output (too short, or containing "Unknown"/"insufficient")
# with a placeholder built from the file name and the current year.
if (
    len(citation) < 3  # noqa: PLR2004
    or "Unknown" in citation
    or "insufficient" in citation
):
    citation = f"Unknown, {os.path.basename(abs_file_path)}, {datetime.now().year}"

# get first name and year from citation
# (first capitalised word = author, first run of four digits = year)
match = re.search(r"([A-Z][a-z]+)", citation)
if match is not None:
    author = match.group(1)
else:
    # panicking - no word??
    raise ValueError(
        f"Could not parse docname from citation {citation}. "
        "Consider just passing key explicitly - e.g. docs.py "
        "(path, citation, key='mykey')"
    )
year = ""
match = re.search(r"(\d{4})", citation)
if match is not None:
    year = match.group(1)
docname = f"{author}{year}"
# NOTE(review): relies on the private Docs._get_unique_name helper; presumably it
# de-duplicates docnames within docs_for_single_doc — confirm against paperqa.
docname = docs_for_single_doc._get_unique_name(docname)

doc = Doc(docname=docname, citation=citation, dockey=dockey)

# see also CROSSREF_API_MAPPING, SEMANTIC_SCHOLAR_API_MAPPING

# Build DocDetails from whichever of the listed BibTeX fields the entry provides.
doc_details = DocDetails(**{k: bib_dict_doc[k] for k in ["doi", "authors", "title", "year", "publisher", "issn", "volume", "pages", "journal"] if k in bib_dict_doc.keys()})

In [None]:
# BibTeX fields that are forwarded to DocDetails when the entry provides them.
BIBTEX_ATTR = ["doi", "authors", "title", "year", "publisher", "issn", "volume", "pages", "journal"]
doc_details_dict = {k: bib_dict_doc[k] for k in BIBTEX_ATTR if k in bib_dict_doc}
# "authors" is filtered like every other field above, so it may be absent;
# only strip the BibTeX grouping braces from the names when it is present.
if "authors" in doc_details_dict:
    doc_details_dict["authors"] = [auth.replace("{", "").replace("}", "") for auth in doc_details_dict["authors"]]
#del doc_details_dict["authors"]
DocDetails(**doc_details_dict)

In [None]:
# Inspect the author list that will be passed to DocDetails.
doc_details_dict["authors"]

In [None]:
# Preview the author names with BibTeX grouping braces removed (does not modify the dict).
[a.replace("{", "").replace("}", "") for a in doc_details_dict["authors"]]

In [None]:
# Create an instance of the DocDetails class
doc_details = DocDetails(
    title="My Awesome Document",
    authors=["John Doe", "Jane Doe"],
    year=2023,
    # ... other attributes
)
doc_details

In [None]:
# Run DocDetails' individual clean-up hooks by hand, one after another, so the
# intermediate dict can be inspected between stages.
from copy import deepcopy  # local import keeps this scratch cell self-contained

# The starting value was missing (`data = ` alone is a SyntaxError).
# NOTE(review): assumes the BibTeX-derived kwargs are the intended input — confirm.
# A deep copy is used so the hooks cannot modify doc_details_dict itself.
data = deepcopy(doc_details_dict)
data = DocDetails.lowercase_doi_and_populate_doc_id(data)
data = DocDetails.remove_invalid_authors(data)
data = DocDetails.misc_string_cleaning(data)
data = DocDetails.inject_clean_doi_url_into_data(data)
data = DocDetails.add_preprint_journal_from_doi_if_missing(data)
data = DocDetails.populate_bibtex_key_citation(data)

In [None]:
# The hook takes the data dict as its argument (see how it is called in the
# validator-chain cell); calling it with no argument raises TypeError.
# NOTE(review): assumes the BibTeX-derived kwargs are the intended input — confirm.
# A shallow copy is passed so new top-level keys do not land in doc_details_dict.
DocDetails.populate_bibtex_key_citation(dict(doc_details_dict))

In [None]:
# Inspect the raw BibTeX entry of the selected document.
bib_dict_doc

# Perform Queries

In [None]:
answer_response = await agent_query(
    query="What do you know about the perivascular niche/space in multiple sclerosis", 
    settings=settings, rebuild_index=False
)
pretty_print_text(answer_response.session.formatted_answer)

# Query Previous Question & Answers

In [None]:
# Search previously stored questions/answers for a related query.
# NOTE(review): the result is only assigned here, not displayed.
query_answer_index_results = await query_answer_index(settings=settings, query="role of perivascular niche in MS")

# Appendix

In [None]:
from paperqa.utils import parse_string, clean_upbibtex, unsrtalpha, CitationConversionError, Person, Parser, FieldIsMissing


def format_bibtex(
    bibtex: str,
    key: str | None = None,
    clean: bool = True,
    missing_replacements: dict[str, str] | None = None,
) -> str:
    """Transform bibtex entry into a citation, potentially adding missing fields.

    Args:
        bibtex: Raw BibTeX source containing the entry to render.
        key: Citation key of the entry to format. When omitted, the first
            parsed entry is used. A non-empty prefix of an existing key is
            accepted as well.
        clean: Whether to pass ``bibtex`` through ``clean_upbibtex`` before parsing.
        missing_replacements: Mapping of field name to value, applied only to
            fields the entry does not already have.

    Returns:
        The rendered citation text. If the BibTeX cannot be parsed,
        ``"Ref <key>"`` with the key taken from the raw text. If rendering
        fails on a missing field or a decoding error, the entry's title.

    Raises:
        CitationConversionError: If ``key`` matches no entry (exactly or as a
            prefix), or if rendering fails and the entry has no title.
    """
    style = unsrtalpha.Style()
    if missing_replacements is None:
        missing_replacements = {}
    try:
        bd = parse_string(clean_upbibtex(bibtex) if clean else bibtex, "bibtex")
        if key is None:
            # Default to the first entry only when the caller named none;
            # previously an explicit `key` was silently overwritten here.
            key = list(bd.entries.keys())[0]
    except Exception:
        # Unparseable input (or no entries at all): best-effort key taken
        # from the text before the first comma.
        raw_key = bibtex.split(",")[0]
        raw_key = raw_key.replace("{{", "{")
        raw_key = raw_key.split("{")[1]
        return "Ref " + raw_key
    try:
        entry = bd.entries[key]
    except KeyError as exc:  # Let's check if key is a non-empty prefix
        try:
            entry = next(
                iter(v for k, v in bd.entries.items() if k.startswith(key) and key)
            )
        except StopIteration:
            raise CitationConversionError(
                f"Failed to process{' and clean up' if clean else ''} bibtex {bibtex}"
                f" due to failed lookup of key {key}."
            ) from exc
    try:
        # see if we can insert missing fields
        for field, replacement_value in missing_replacements.items():
            # Deal with special case for author, since it needs to be parsed
            # into Person objects. This reorganizes the names automatically.
            if field == "author" and "author" not in entry.persons:
                tmp_author_bibtex = f"@misc{{tmpkey, author={{{replacement_value}}}}}"
                authors: list[Person] = (
                    Parser()
                    .parse_string(tmp_author_bibtex)
                    .entries["tmpkey"]
                    .persons["author"]
                )
                for a in authors:
                    entry.add_person(a, "author")
            elif field not in entry.fields:
                entry.fields.update({field: replacement_value})
        entry = style.format_entry(label="1", entry=entry)
        return entry.text.render_as("text")
    except (FieldIsMissing, UnicodeDecodeError):
        # Rendering failed: fall back to the bare title if there is one.
        try:
            return entry.fields["title"]
        except KeyError as exc:
            raise CitationConversionError(
                f"Failed to process{' and clean up' if clean else ''} bibtex {bibtex}"
                " due to missing a 'title' field."
            ) from exc
    
# Module-level stand-ins for format_bibtex()'s parameters, used by the inline
# walk-through below.
missing_replacements = None
clean = True
# Problem entry under investigation: note that the first part of the citation key
# is itself wrapped in braces ("{tortosacarreres}2024..."), as are several surnames.
bibtex = """@article{{tortosacarreres}2024predictivepotentialof,
    author = "{Tortosa-Carreres}, Jordi and {Cubas-N{\'u}{\\textasciitilde n}ez}, Laura and {Quiroga-Varela}, Ana and {Castillo-Villalba}, Jessica and {Rami{\'o}-Torrenta}, Llu{\'i}s and Piqueras, M{\'o}nica and {Gasqu{\'e}-Rubio}, Raquel and {Quintanilla-Bordas}, Carlos and Sanz, Maria Teresa and Lucas, Celia and {Huertas-Pons}, Joana Mar{\'i}a and Miguela, Albert and Casanova, Bonaventura and {Laiz-Marro}, Bego{\\textasciitilde n}a and {P{\'e}rez-Miralles}, Francisco Carlos",
    title = "Predictive Potential of Serum and Cerebrospinal Fluid Biomarkers for Disease Activity in Treated Multiple Sclerosis Patients",
    year = "2024",
    journal = "Multiple Sclerosis and Related Disorders",
    doi = "10.1016/j.msard.2024.105734",
    url = "https://doi.org/10.1016/j.msard.2024.105734",
    publisher = "Elsevier",
    issn = "2211-0348, 2211-0356"
}
"""


# Inline, print-instrumented copy of format_bibtex()'s body: transform the bibtex
# entry into a citation, printing each value the function would return.
style = unsrtalpha.Style()
if missing_replacements is None:
    missing_replacements = {}
try:
    print(bibtex)
    bd = parse_string(clean_upbibtex(bibtex) if clean else bibtex, "bibtex")
    key = list(bd.entries.keys())[0]
except Exception:
    key = bibtex.split(",")[0]
    key = key.replace("{{", "{")
    key = key.split("{")[1]
    print("Ref " + key)
else:
    # The function returns in the branch above. Without this `else`, a failed
    # parse fell through to `bd.entries[key]` with `bd` unbound (NameError) or,
    # worse, left over from an earlier run of the cell.
    try:
        entry = bd.entries[key]
        print(entry)
    except KeyError as exc:  # Let's check if key is a non-empty prefix
        try:
            entry = next(
                iter(v for k, v in bd.entries.items() if k.startswith(key) and key)
            )
        except StopIteration:
            raise CitationConversionError(
                f"Failed to process{' and clean up' if clean else ''} bibtex {bibtex}"
                f" due to failed lookup of key {key}."
            ) from exc
    try:
        # see if we can insert missing fields
        for field, replacement_value in missing_replacements.items():
            # Deal with special case for author, since it needs to be parsed
            # into Person objects. This reorganizes the names automatically.
            if field == "author" and "author" not in entry.persons:
                tmp_author_bibtex = f"@misc{{tmpkey, author={{{replacement_value}}}}}"
                authors: list[Person] = (
                    Parser()
                    .parse_string(tmp_author_bibtex)
                    .entries["tmpkey"]
                    .persons["author"]
                )
                for a in authors:
                    entry.add_person(a, "author")
            elif field not in entry.fields:
                entry.fields.update({field: replacement_value})
        entry = style.format_entry(label="1", entry=entry)
        print(entry.text.render_as("text"))
    except (FieldIsMissing, UnicodeDecodeError):
        # Rendering failed: fall back to the bare title if there is one.
        try:
            print(entry.fields["title"])
        except KeyError as exc:
            raise CitationConversionError(
                f"Failed to process{' and clean up' if clean else ''} bibtex {bibtex}"
                " due to missing a 'title' field."
            ) from exc


#test = """@article{{tortosacarreres}2024predictivepotentialof,
#    author = "{Tortosa-Carreres}, Jordi and {Cubas-N{\'u}{\\textasciitilde n}ez}, Laura and {Quiroga-Varela}, Ana and {Castillo-Villalba}, Jessica and {Rami{\'o}-Torrenta}, Llu{\'i}s and Piqueras, M{\'o}nica and {Gasqu{\'e}-Rubio}, Raquel and {Quintanilla-Bordas}, Carlos and Sanz, Maria Teresa and Lucas, Celia and {Huertas-Pons}, Joana Mar{\'i}a and Miguela, Albert and Casanova, Bonaventura and {Laiz-Marro}, Bego{\\textasciitilde n}a and {P{\'e}rez-Miralles}, Francisco Carlos",
#    title = "Predictive Potential of Serum and Cerebrospinal Fluid Biomarkers for Disease Activity in Treated Multiple Sclerosis Patients",
#    year = "2024",
#    journal = "Multiple Sclerosis and Related Disorders",
#    doi = "10.1016/j.msard.2024.105734",
#    url = "https://doi.org/10.1016/j.msard.2024.105734",
#    publisher = "Elsevier",
#    issn = "2211-0348, 2211-0356"
#"""
#format_bibtex(test)
#parse_string(clean_upbibtex(test), "bibtex")

# Parse the entry once more on its own, outside any try/except, so a parser
# error surfaces directly.
bd = parse_string(clean_upbibtex(bibtex) if clean else bibtex, "bibtex")

In [None]:
# Show the entry as the parser receives it (after Python has processed the escapes).
print(bibtex)

In [None]:
# Variant of the problem entry with a brace-free citation key and plain Unicode
# author names (no LaTeX escapes or grouping braces), parsed without clean_upbibtex.
test = """@article{tortosacarreres2024predictivepotentialof,
  author = "Tortosa-Carreres, Jordi and Cubas-Núñez, Laura and Quiroga-Varela, Ana and Castillo-Villalba, Jessica and Ramió-Torrenta, Lluís and Piqueras, Mónica and Gasqué-Rubio, Raquel and Quintanilla-Bordas, Carlos and Sanz, Maria Teresa and Lucas, Celia and Huertas-Pons, Joana María and Miguela, Albert and Casanova, Bonaventura and Laiz-Marro, Begoña and Pérez-Miralles, Francisco Carlos",
  title = "Predictive Potential of Serum and Cerebrospinal Fluid Biomarkers for Disease Activity in Treated Multiple Sclerosis Patients",
  year = "2024",
  journal = "Multiple Sclerosis and Related Disorders",
  doi = "10.1016/j.msard.2024.105734",
  url = "https://doi.org/10.1016/j.msard.2024.105734",
  publisher = "Elsevier",
  issn = "2211-0348, 2211-0356"
}"""
parse_string(test, bib_format="bibtex")

In [None]:
# Minimal variant: a single placeholder author and a single ISSN, to narrow down
# which field of the original entry the parser stumbles over.
# NOTE: rebinds `test`, replacing the value from the previous cell.
test = """@article{tortosacarreres2024predictivepotentialof,
    author = "foo, bar",
    title = "Predictive Potential of Serum and Cerebrospinal Fluid Biomarkers for Disease Activity in Treated Multiple Sclerosis Patients",
    year = "2024",
    journal = "Multiple Sclerosis and Related Disorders",
    doi = "10.1016/j.msard.2024.105734",
    url = "https://doi.org/10.1016/j.msard.2024.105734",
    publisher = "Elsevier",
    issn = "2211-0348"
}"""
parse_string(test, bib_format="bibtex")

In [None]:
# Re-run DocDetails construction from the BibTeX-derived kwargs and display the result.
DocDetails(**doc_details_dict)

In [63]:
import pybtex

# The assignment had no right-hand side (a SyntaxError).
# NOTE(review): assumes the configured BibTeX export is the intended file — confirm.
bib_file = bibtex_file

In [None]:
# Inspect `entry` as left behind by the inline walk-through cell (relies on that cell having run).
entry

In [None]:
# Parse the current `test` string without the clean_upbibtex pre-pass and show
# the key of its first entry.
# NOTE: this rebinds the module-level `clean` flag that other cells also read.
clean = False
bd = parse_string(clean_upbibtex(test) if clean else test, "bibtex")
list(bd.entries.keys())[0]

In [None]:
# Re-parse the original `bibtex` string and display the parsed database.
# Whether clean_upbibtex runs depends on the current value of the global `clean`.
bd = parse_string(clean_upbibtex(bibtex) if clean else bibtex, "bibtex")
bd