In [None]:
import os
# Move the working directory up one level before importing from `src`.
# NOTE(review): presumably this notebook lives in a subdirectory of the repo
# and `src` sits at the repo root - confirm against the repository layout.
os.chdir("..")

from src.db import Database
from src.scraper import Scraper
from src.tags import Tags

# Typical workflow

This notebook describes the typical use case and workflow between the three primary classes in this repo: `Scraper`, `Database`, and `Tags`. The particularities of each of these classes are described in more detail in their respective example notebooks.

A typical workflow will consist of the following steps:

1. Scrape new papers from online archives.
2. Load the papers into the database.
3. Review the resulting papers and remove the unwanted ones.
4. Create a new tag file (or modify an existing tag file) and add tags to papers in the database.
5. Attach the tags to the database.
6. Save the database.
7. Run the app to view the database.

In theory, once the scraped and tagged papers are up to date, one would repeat the workflow periodically (e.g., weekly) to keep the database current. In code, this corresponds to the following steps.

**Note**: Running the app assumes that Docker is already installed on your system.

In [None]:
# TODO: Check this workflow properly and see that it works end to end.
#
# End-to-end example of the workflow: scrape -> load -> remove -> tag ->
# attach tags -> save. Each step uses objects created by earlier steps, so the
# statements are meant to be run top to bottom.
#
# The triple-quoted strings below are bare string expressions used as section
# headers.
# NOTE(review): the step-7 header is the last expression in this cell, so
# Jupyter would presumably echo it as the cell output - confirm if unwanted.

""" 0. Set globals """
# Placeholders to fill in before running:
#   TABLE_DIR is passed to `scraper.run` as `outpath` and to `database.create`
#   as `table_dir`.
#   TAG_DIR is passed to `tags.save` as `outpath` and to
#   `database.attach_tags` as `tag_path`.
TABLE_DIR = ""
TAG_DIR = ""

""" 1. Scrape papers """
scraper = Scraper(
    keywords=["active inference"],   # List[Union[str, List[str]]]
    start_date="2025-01-01",         # str in YYYY-MM-DD format
    end_date="2025-02-01",           # str in YYYY-MM-DD format
    archives=["pubmed", "arxiv"],    # List[str]
)

# Results are not returned here (`return_results=False`); TABLE_DIR is given
# as the output path so that step 2 can read from the same location.
scraper.run(return_results=False, outpath=TABLE_DIR)

""" 2. Load scraped papers into database """
database = Database()
database.create(table_dir=TABLE_DIR)

""" 3. Remove unwanted papers """
# List the DOIs of the papers to drop here; the list is left empty in this
# example and is passed to `database.remove` as `doi_list`.
papers_to_remove = []
database.remove(doi_list=papers_to_remove)

""" 4. Create a new tag file """
tags = Tags()
tags.create()

# Add tag category and new tags to tag list
tags.set_tag_list_category("Bayesian mechanics")
tags.add_to_tag_list(["information geometry", "Markov blankets"])

# Add tags by DOI: each entry pairs a DOI with the list of tags to link to it
tags_to_add = [
    {"doi": "10.1098/rsta.2019.0159", "tags": ["information geometry"]},
    {"doi": "10.1098/rspa.2021.0518", "tags": ["Markov blankets"]},
]

tags.link_tags_to_doi(tag_dict_list=tags_to_add)

# Save tag file
tags.save(outpath=TAG_DIR)

""" 5. Attach tags to database """
# The same TAG_DIR that the tag file was saved to in step 4 is used here.
# NOTE(review): `overwrite=False` presumably keeps tags already attached to
# the database - confirm against `Database.attach_tags`.
database.attach_tags(tag_path=TAG_DIR, overwrite=False)

""" 6. Save database """
# If no path is provided the database is serialized to data/databases with a timestamp
database.save(database_description="Active inference papers")

"""7. Run the app to view the database """
# This step is run from a shell, not from this notebook:
# Build the container if you have not already: `sh bin/build.sh`
# Run the container: `sh bin/run.sh`
# View the app in your browser at: http://0.0.0.0:5000

# Note that running the container uses the most up-to-date version of the database by default, as shown in `bin/run.sh` and `bin/build_and_run.sh`. If you wish to point the container to a different database, run `sh bin/run.sh data/<database_name.pkl>`.