Just some useful snippets for working with RIS files.

In [24]:
import rispy
import pandas as pd
import numpy as np  # for set ops
# NOTE(review): `np` is not referenced in any of the cells visible here.
from dotenv import load_dotenv
# Load variables from a .env file into the process environment, so that the
# os.getenv() lookups for the RIS file paths further down can find them.
load_dotenv()

True

In [25]:
# Work on a private deep copy of rispy.TAG_KEY_MAPPING (rispy's default mapping).
# rispy uses this dictionary to translate RIS tag abbreviations (like 'TY',
# 'AU', 'PY') into more descriptive Python keys (like 'type', 'authors', 'year').
# Because it is a deep copy, changing `mapping` later never modifies the
# dictionary that belongs to the library itself.
# More about this (including the complete mapping): https://github.com/MrTango/rispy
from copy import deepcopy
mapping = deepcopy(rispy.TAG_KEY_MAPPING)

In [26]:
# Read the two input RIS paths and the output RIS path from the environment
import os
f1_path, f2_path, fo_path = (
    os.getenv(env_name)
    for env_name in ('INPUT_RIS_1ST_PATH', 'INPUT_RIS_2ND_PATH', 'OUTPUT_RIS_PATH')
)
# Human-readable caption for each file, keyed by its path (used in the reports)
dfs_labels = dict(zip(
    (f1_path, f2_path, fo_path),
    ("1st input RIS", "2nd input RIS", "output RIS"),
))

In [27]:
def do_stuff_with_ris(df1: pd.DataFrame, df2: pd.DataFrame, dfs_labels={}) -> pd.DataFrame:
    """Just implement some pandas stuff to do with RIS files."""
    try:
        # For example, remove included from total screened to get excluded (or vice versa)
        output_df = df1[~df1.index.isin(df2.index)]
    except:
        output_df = pd.DataFrame()
    return output_df

In [28]:
# Parse both input RIS files into DataFrames
dfs = {}  # file path -> DataFrame holding that file's records
for input_path in (f1_path, f2_path):
    # EndNote exports RIS as UTF-8 with a BOM; "utf-8-sig" strips the BOM on read
    with open(input_path, 'r', encoding="utf-8-sig") as ris_file:
        records = rispy.load(ris_file, encoding='utf-8', mapping=mapping)
        # Build the frame while the file is still open
        dfs[input_path] = pd.DataFrame(records)
def expected_n(file_path: str) -> "int | pd.api.typing.NAType":
    """Read the expected number of records from a file name.

    Assumes file names follow the ``title_123.ris`` convention, where 123 is
    the number of records in the file. A single trailing suffix is tolerated
    as well (``title_123_suffix.ris``).

    Args:
        file_path: Path to the RIS file; only its base name is inspected.

    Returns:
        The record count encoded in the name, or ``pd.NA`` when the name does
        not follow the convention (or ``file_path`` is not a usable path).
    """
    try:
        name_parts = os.path.basename(file_path).split('_')
    except Exception:  # e.g. file_path is None because the env var is unset
        return pd.NA
    # Try the last "_"-separated part first, then the one before it
    # (perhaps the name has a suffix after the count).
    for position in (-1, -2):
        try:
            return int(name_parts[position].split('.')[0])
        except (IndexError, ValueError):
            continue
    return pd.NA
# Report, per input file, the parsed record count next to the count its name promises
print(
    "Loaded input files:",
    *(
        f"observed {len(dfs[p])}, expected {expected_n(p)}  # {dfs_labels[p]}"
        for p in (f1_path, f2_path)
    ),
    sep="\n",
)

Loaded input files:
observed 3463, expected 3463  # 1st input RIS
observed 392, expected 392  # 2nd input RIS


In [29]:
# Process and dump the output RIS file
dfs[fo_path] = do_stuff_with_ris(dfs[f1_path], dfs[f2_path])
# Build the records BEFORE opening the output file: opening with 'w' truncates
# it, so a failure in the conversion would otherwise leave an empty file behind.
# Note that NaN values are set to empty strings
entries_to_dump = dfs[fo_path].replace({pd.NA: ''}).to_dict(orient='records')
# Written as UTF-8 with BOM, the same encoding the input files are read with
with open(fo_path, 'w', encoding='utf-8-sig') as bibliography_file:
    # Use the same tag mapping as for loading, so that any customisation of
    # `mapping` round-trips instead of silently falling back to rispy's default
    rispy.dump(entries_to_dump, bibliography_file, mapping=mapping)
def calc_expected_n(n1: int, n2: int) -> int | pd.api.typing.NAType:
    """Function to override as needed."""
    return n1 - n2
# Report the written record count next to the count derived from the input file names
print("\n".join([
    "Dumped output file:",
    f"observed {len(dfs[fo_path])}, expected {calc_expected_n(expected_n(f1_path), expected_n(f2_path))}  # {dfs_labels[fo_path]}",
]))

Dumped output file:
observed 3071, expected 3071  # output RIS
