# Remaining papers to cite

Collecting the remaining papers to look over and add to the book if necessary.

In [1]:
import bibtexparser
import logging
import numpy as np
import re

import os
# Step up to the parent directory so that the relative paths used below
# (e.g. "data/...", "examples/...") resolve from there.
# NOTE: re-running this cell moves up one more level each time.
os.chdir(os.pardir)

import pandas as pd

from src.db import Database
from src.tags import Tags

# Configure root logging at INFO level, then create the logger for this module
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

## PubMed

In [2]:
def load_metadata(db, filter=True):
    """Merge the paper database with the tagged-paper metadata and log progress.

    Loads the YAML tag file ``data/tag_files/tags.yaml`` through ``Tags``,
    builds a dataframe from its ``"tagged_papers"`` entry, and outer-merges it
    with ``db`` on the ``"id"`` column. The number of merged rows with a
    non-null ``"tag"`` value is logged as the tagging progress.

    Args:
        db: Table of papers exposing ``merge`` (presumably a pandas
            DataFrame -- confirm against ``Database.db``); it must contain
            an ``"id"`` column.
        filter: Accepted for backward compatibility; it is not used by the
            function body.

    Returns:
        The merged dataframe after ``reset_index()``, so the previous index
        is kept as an additional column.
    """
    # Load the tag file and turn its tagged papers into a dataframe
    metadata = Tags()
    metadata.load("data/tag_files/tags.yaml")
    md = pd.DataFrame(metadata.tags["tagged_papers"])

    # Outer merge keeps rows from both sides, whether or not they are tagged
    mddb = db.merge(md, on="id", how="outer")

    # Current progress out of the total
    total_papers = mddb.shape[0]
    tagged_papers = int(mddb["tag"].notna().sum())
    if total_papers:
        percent = np.round((tagged_papers / total_papers) * 100, 2)
    else:
        # An empty merge result would otherwise raise ZeroDivisionError
        percent = 0.0

    LOGGER.info(
        "Currently %s/%s (%s%%) papers tagged.",
        tagged_papers,
        total_papers,
        percent,
    )
    LOGGER.info("%s papers remaining.", total_papers - tagged_papers)

    mddb = mddb.reset_index()

    return mddb

In [3]:
# Names of the database tables to load
tables = [
    "active_inference",
    "bayesian_mechanics",
    "free_energy",
    "friston",
    "karl_friston",
    "predictive_coding",
    "predictive_processing",
]

# Load the selected tables and keep a handle on the loaded data
database = Database()
database.load(tables=tables)
db = database.db

# Merge the loaded data with the tag metadata (also logs tagging progress)
pubmed_db = load_metadata(db, filter=False)

INFO:src.db:Checking tables...
INFO:src.db:Loading tables...
INFO:src.db:Tables downloaded from PubMed on Thursday, Sept. 14, 2023.
INFO:src.tags:YAML tag file successfully loaded from data/tag_files/tags.yaml.
INFO:__main__:Currently 3585/3990 (89.85%) papers tagged.
INFO:__main__:405 papers remaining.


In [9]:
# Parse the creation dates into datetimes so they can be compared with timestamps
create_dates = pd.to_datetime(pubmed_db["create_date"])
pubmed_db["create_date"] = create_dates

In [13]:
# Keep only the papers created strictly after 1 September 2023 (midnight)
cutoff = pd.Timestamp(year=2023, month=9, day=1)
created_after_cutoff = pubmed_db["create_date"] > cutoff
pubmed_db = pubmed_db[created_after_cutoff]

## PsyArXiv, bioRxiv, and arXiv

In [4]:
# Parse the BibTeX export and tabulate its entries, one row per entry
bibtex_path = "examples/bioarxiv_aif_citations.bib"
with open(bibtex_path) as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

bioarxiv_db = pd.DataFrame(data=bib_database.entries)

In [5]:
# Read the combined preprint listing, then split it by its "Archive" column
arxiv_psyarxiv_db = pd.read_csv("data/arxiv_papers.csv")
archive = arxiv_psyarxiv_db["Archive"]
arxiv_db = arxiv_psyarxiv_db[archive == "ArXiv"]
psyarxiv_db = arxiv_psyarxiv_db[archive == "PsyArXiv"]

In [21]:
# Export only the titles of the filtered PubMed papers, without the index
pubmed_titles = pubmed_db["title"]
pubmed_titles.to_csv("pubmed_after_sept.csv", index=False)

In [23]:
# Export only the titles of the BibTeX entries, without the index
bioarxiv_titles = bioarxiv_db["title"]
bioarxiv_titles.to_csv("bioarxiv_titles.csv", index=False)