# Pubmed ID to YAML

Quick script to convert the paper identifiers in the YAML tag file, currently listed as PubMed IDs (PMIDs), into their respective DOIs.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import yaml

# Move the working directory up one level. The path is relative, so re-running
# this cell moves up one more level each time.
# NOTE(review): presumably done so the notebook runs from the repository root —
# confirm. Both paths below are absolute and do not depend on it.
os.chdir("..")

# Directory whose files are loaded as CSV tables by the conversion cell.
TABLE_DIR = "/home/sanjeev/Documents/fundamentals_of_aif/backup_db_archive"
# YAML tag file; the cells below read it and then overwrite it in place.
TAG_PATH = "/home/sanjeev/Documents/git_repos/aif-fep-db/data/tags/tags.yaml"

In [42]:
# Build an {id: DOI} lookup from the CSV tables in TABLE_DIR, then replace the
# "id" of every tagged paper in the YAML tag file with its DOI when one is known.
# The tag file is rewritten in place.

# Only pick up CSV files (a stray checkpoint directory or hidden file would make
# read_csv fail) and sort them so the result does not depend on the arbitrary
# order returned by os.listdir.
table_names = sorted(
    table_name
    for table_name in os.listdir(TABLE_DIR)
    if table_name.lower().endswith(".csv")
)
table_paths = [os.path.join(TABLE_DIR, table_name) for table_name in table_names]

# The first CSV column becomes the index of each table.
# NOTE(review): presumably this is the PMID column — confirm against the exports.
table_list = [pd.read_csv(table_path, index_col=0) for table_path in table_paths]

# Keep only the DOI column instead of dropping a hard-coded list of the other
# columns: this does not break when an export lacks one of them, and no
# unexpected extra column can leak into the mapping.
# Rows without a DOI are removed so that an id is never replaced by NaN; those
# entries simply keep their original id in the tag file.
mapping_list = [table["DOI"].dropna() for table in table_list]

# Merge all tables and keep one DOI per index value. When the same index value
# appears more than once, the last occurrence wins, which is also what building
# a dict from the rows would do.
doi_by_id = pd.concat(mapping_list)
doi_by_id = doi_by_id[~doi_by_id.index.duplicated(keep="last")]
id_mapping = doi_by_id.to_dict()

# Import YAML tag file
with open(TAG_PATH, "r", encoding="utf-8") as file:
    tag_file = yaml.safe_load(file)

tagged_papers = tag_file["tagged_papers"]

# Apply id mapping; ids without a known DOI are left untouched.
for entry in tagged_papers:
    entry["id"] = id_mapping.get(entry["id"], entry["id"])

# Write back to the YAML file
with open(TAG_PATH, "w", encoding="utf-8") as file:
    yaml.dump(tag_file, file, default_flow_style=False)

In [41]:
# Collect the "id" of every tagged paper, in file order.
id_list = [paper["id"] for paper in tag_file["tagged_papers"]]

# Keep only the ids whose type is exactly int (bools and other int subclasses
# are not counted).
# NOTE(review): presumably these are the ids that were not replaced by a DOI
# string — confirm.
entry_int_list = [paper_id for paper_id in id_list if type(paper_id) is int]

In [None]:
# Bare list expression: it is not assigned to any name, so evaluating the cell
# only displays it. The entries are DOI strings mixed with "no_doi_N"
# placeholders.
# NOTE(review): presumably the identifiers picked by hand for the papers whose
# id was still an integer after the conversion — confirm.
['10.1177/1059712319862774',
 '10.48550/arXiv.2308.08029',
 'no_doi_1',
 'no_doi_2',
 'no_doi_3',
 '10.1006/nimg.2002.1153',
 'no_doi_4',
 'no_doi_5',
 'no_doi_6',
 '10.1016/S0893-6080(00)00066-6',
 '10.1088/0954-898X/14/4/305',
 'no_doi_7',
 'no_doi_8',
 'no_doi_9',
 'no_doi_10',
 'no_doi_11',
 'no_doi_12',
 '10.1006/nimg.2002.1156',
 'no_doi_13',
 '10.48550/arXiv.2109.08063',
 '10.48550/arXiv.2201.13180',
 'no_doi_14',
 'no_doi_15',
 '10.48550/arXiv.2305.19654']

In [18]:
# Rename the "id" key of every tagged paper to "doi" and rewrite the tag file
# in place.

# Import YAML tag file
with open(TAG_PATH, "r", encoding="utf-8") as file:
    tag_file = yaml.safe_load(file)

tagged_papers = tag_file["tagged_papers"]

for entry in tagged_papers:
    # Entries that no longer have an "id" key are skipped, so re-running this
    # cell on an already converted file is a no-op instead of a KeyError.
    if "id" in entry:
        entry["doi"] = entry.pop("id")

# Write back to the YAML file
with open(TAG_PATH, "w", encoding="utf-8") as file:
    yaml.dump(tag_file, file, default_flow_style=False)

In [None]:
# import yaml

# # Load the mapping dictionary
# id_mapping = {
#     "some_integer": "doi"  # Replace with actual mappings
# }

# # Read YAML file
# with open("input.yaml", "r") as file:
#     data = yaml.safe_load(file)

# # Update IDs based on the mapping
# for entry in data:
#     if str(entry["id"]) in id_mapping:  # Convert to str to match dictionary keys
#         entry["id"] = id_mapping[str(entry["id"])]

# # Write back to YAML file
# with open("output.yaml", "w") as file:
#     yaml.dump(data, file, default_flow_style=False)

# print("Updated YAML file saved as output.yaml")