# Get Author info from papers
https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_get_paper_authors

https://api.semanticscholar.org/api-docs/graph#tag/Author-Data/operation/post_graph_get_authors

See also these references on unique author identifiers in PubMed:
https://pubmed.ncbi.nlm.nih.gov/34180522/
https://pubmed.ncbi.nlm.nih.gov/28960025/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5639053/
https://github.com/Daniel-Mietchen/ideas/issues/1260
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5530597/

In [1]:
import os
import requests
import json
import pandas as pd
import publication_query
import utils

# Render every column and every row when a DataFrame is displayed (None removes the limit).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Uncomment to also show full cell contents instead of truncating long strings with "...".
#pd.set_option('display.max_colwidth', None)

In [2]:
# Root folder for the knowledge-graph CSV files; the "nodes" and "relationships" files are written beneath it.
KG_PATH = "../kg/data"
# Folder holding intermediate tables (radx_investigators.csv is read from here).
DERIVED_DATA_PATH = "../derived_data"

In [3]:
# Earliest publication year kept when the raw publications are filtered.
START_PUBLICATION_YEAR = 2021 # Grants started at the end of 2020.

## Get raw primary Publication data
These publications result from a grant number query in PubMed.

In [4]:
# Load the PubMed article table. Every column is read as text and empty cells
# stay "" rather than becoming NaN.
raw_publications = pd.read_csv(
    "https://raw.githubusercontent.com/christian-horgan/pubmed-search/main/data/radx-rad/article_data_current.csv",
    dtype=str,
    keep_default_na=False,
)

In [5]:
# Load the manually maintained publications that are not indexed in PubMed,
# using the same text-only / no-NaN settings as the PubMed table.
non_pubmed_publications = pd.read_csv(
    "../data/Publications_non_pubmed.csv",
    dtype=str,
    keep_default_na=False,
)
non_pubmed_publications.head()

Unnamed: 0,pm_id,title,pmc_id,authors,doi,keywords,mesh_ids,mesh_terms,abstract,project_serial_num,journal_name,publication_year
0,,IMI-CDE: an interactive interface for collabor...,,"Tao S,Chou WC,Li J,Du J,Ram PM,Abeysinghe R,Xu...",doi:10.1109/ICHI54592.2022.00070,"COVID-19,Common Data Element,CDE,Mapping,Data ...",,,The National Institute of Health (NIH) launche...,LM013755,2022 IEEE 10th International Conference on Hea...,2022


In [6]:
# Append the non-PubMed records to the PubMed records.
# ignore_index=True renumbers the rows 0..n-1; without it each input keeps its own
# 0-based labels and the combined frame contains duplicate index values.
raw_publications = pd.concat([raw_publications, non_pubmed_publications], ignore_index=True)
# Any NaN introduced by the concat (e.g. a column missing from one input) becomes an empty string.
raw_publications.fillna("", inplace=True)

In [7]:
# Restrict publications to 2021+ since RADx projects started at the end of 2020.
raw_publications["publication_year"] = raw_publications["publication_year"].astype(int)
raw_publications.query(f"publication_year >= {START_PUBLICATION_YEAR}", inplace=True)
# DOI is required as the primary key.
# The input tables are read with keep_default_na=False and NaN is replaced by "",
# so a missing DOI is an empty string, not NaN; dropna() alone would never remove it.
# Filter on both NaN and blank strings instead.
has_doi = raw_publications["doi"].notna() & (raw_publications["doi"].astype(str).str.strip() != "")
raw_publications = raw_publications[has_doi].copy()
# convert property name to Neo4j convention
raw_publications.rename(columns={"pm_id": "pmId", "pmc_id": "pmcId", "project_serial_num": "projectSerialNum"}, inplace=True)

In [8]:
# Preview the first rows of the filtered, renamed publication table.
raw_publications.head()

Unnamed: 0,pmId,title,pmcId,authors,doi,keywords,mesh_ids,mesh_terms,abstract,projectSerialNum,journal_name,publication_year
0,37727605,Repurposing a SARS-CoV-2 surveillance program ...,PMC10505707,"King KL,Ham R,Smothers A,Lee I,Bowie T,Teetsel...",doi:10.3389/fpubh.2023.1168551,"SARS-CoV-2,flu,influenza,saliva,surveillance t...","D006801,D000086402,D014495,D000086382,D007251,...","Humans,SARS-CoV-2,Universities,COVID-19,Influe...",Standard multiplex RT-qPCR diagnostic tests us...,AA029328,Frontiers in public health,2023
1,37590297,A capacitive laser-induced graphene based apta...,PMC10434860,"Moreira G,Qian H,Datta SPA,Bliznyuk N,Carpente...",doi:10.1371/journal.pone.0290256,,"D006801,D000086402,D000086382,D006108,D007251,...","Humans,SARS-CoV-2,COVID-19,Graphite,Influenza,...",SARS-CoV-2 virus induced CoVID-19 pandemic has...,AA029328,PloS one,2023
2,36595104,Ion-selective electrodes based on laser-induce...,,"Soares RRA,Hjort RG,Pola CC,Jing D,Cecon VS,Cl...",doi:10.1007/s00604-022-05615-9,"Electrochemical Sensors,Food additives,Food sa...","D006801,D017736,D006108,D009573,D014867,D007834","Humans,Ion-Selective Electrodes,Graphite,Nitri...",Nitrite is an important food additive for cure...,AA029328,Mikrochimica acta,2023
3,36354449,An Experimental Framework for Developing Point...,PMC9688365,"Ullah SF,Moreira G,Datta SPA,McLamore E,Vanegas D",doi:10.3390/bios12110938,"SARS-CoV-2,analytical sensing,binding kinetics...","D006801,D058266,D000086402,D000086382,D007368,...","Humans,Dielectric Spectroscopy,SARS-CoV-2,COVI...",Biolayer interferometry (BLI) is a well-establ...,AA029328,Biosensors,2022
4,35992634,Development of a Biosensor Based on Angiotensi...,PMC9386735,"Moreira G,Casso-Hartmann L,Datta SPA,Dean D,Mc...",doi:10.3389/fsens.2022.917380,"LIG electrodes,attenuated virus,betacoronaviru...",,,Severe acute respiratory syndrome coronavirus ...,AA029328,Frontiers in sensors,2022


In [9]:
# Preview the last rows; the appended non-PubMed records appear at the end.
raw_publications.tail()

Unnamed: 0,pmId,title,pmcId,authors,doi,keywords,mesh_ids,mesh_terms,abstract,projectSerialNum,journal_name,publication_year
389,35644340.0,Effectiveness and Safety of Biologic Therapy i...,PMC9701245,"Nguyen NH,Luo J,Paul P,Kim J,Syal G,Ha C,Rudra...",doi:10.1016/j.cgh.2022.05.008,"Crohn’s Disease,Disparities,Ethnic Minorities,...","D000328,D005260,D006801,D008297,D001688,D00169...","Adult,Female,Humans,Male,Biological Products,B...",There are limited data on outcomes of biologic...,LM013755,Clinical gastroenterology and hepatology : the...,2023
390,35532905.0,Aptamer Sandwich Lateral Flow Assay (AptaFlow)...,PMC9112978,"Yang LF,Kacherovsky N,Panpradist N,Wan R,Liang...",doi:10.1021/acs.analchem.2c00554,,"D000914,D052157,D000086382,D000086663,D006801,...","Antibodies,Viral,Aptamers,Nucleotide,COVID-19,...",The COVID-19 pandemic is among the greatest he...,LM013755,Analytical chemistry,2022
391,34923447.0,Detecting model misconducts in decentralized h...,PMC10017272,"Kuo TT,Pham A",doi:10.1016/j.ijmedinf.2021.104658,"Blockchain Distributed Ledger Technology,Elect...",,,To accelerate healthcare/genomic medicine rese...,LM013755,International journal of medical informatics,2021
392,34328683.0,Discovery and Characterization of Spike N-Term...,PMC8426805,"Kacherovsky N,Yang LF,Dang HV,Cheng EL,Cardle ...",doi:10.1002/anie.202107730,"SARS-CoV-2,aptamers,coronavirus,cryo-EM,immuno...","D052157,D000086382,D004797,D006801,D008958,D00...","Aptamers,Nucleotide,COVID-19,Enzyme-Linked Imm...",The coronavirus disease 2019 (COVID-19) pandem...,LM013755,Angewandte Chemie (International ed. in English),2021
0,,IMI-CDE: an interactive interface for collabor...,,"Tao S,Chou WC,Li J,Du J,Ram PM,Abeysinghe R,Xu...",doi:10.1109/ICHI54592.2022.00070,"COVID-19,Common Data Element,CDE,Mapping,Data ...",,,The National Institute of Health (NIH) launche...,LM013755,2022 IEEE 10th International Conference on Hea...,2022


## Get author information for the primary citations

In [10]:
# Collect each distinct DOI once (first-seen order), then ask the Semantic Scholar API,
# through publication_query, for the authors of those papers.
# NOTE(review): the recorded run of this cell ended with a 504 Gateway Timeout followed by
# KeyError: 'aliases'. Later cells select an "aliases" column, so confirm that
# get_author_ids still returns that field before relying on it.
raw_dois = raw_publications["doi"].drop_duplicates().tolist()
authors = publication_query.get_author_ids(raw_dois)
authors.head()

ERROR: Semantic Scholar HTTP error: 504 Server Error: Gateway Timeout for url: https://api.semanticscholar.org/graph/v1/paper/batch?fields=authors.authorId%2Cauthors.name%2Cauthors.aliases%2Cauthors.affiliations%2Cauthors.paperCount%2Cauthors.citationCount%2Cauthors.hIndex%2Cauthors.externalIds


KeyError: 'aliases'

In [None]:
# Spot-check the authors returned for a single paper (here the non-PubMed ICHI conference record).
# authors.query("doi == 'doi:10.1097/PCC.0000000000002976'") # Authors with title in names
authors.query("doi == 'doi:10.1109/ICHI54592.2022.00070'")

In [None]:
# Expand the "name" column via publication_query (return value is not used here;
# presumably it adds the name-part columns in place — verify against the helper).
publication_query.expand_name_column(authors, "name")
# Rename to the column names used in the rest of the notebook in a single pass.
authors.rename(
    columns={
        "name": "author",
        "externalIds.ORCID": "orcid",
    },
    inplace=True,
)
# Attach the publication metadata to every author row by DOI.
authors = raw_publications.merge(authors, on="doi")

In [None]:
# Normalise the merged author table: remove exact duplicate rows, blank out NaN and the
# literal string "nan", and make sure the author-name column is called "author".
authors = (
    authors.drop_duplicates()
    .fillna("")
    .replace("nan", "")
    .rename(columns={"name": "author"})
)

## Match publication authors with RADx-rad investigators
In this step, we eliminate non-RADx publications.

In [None]:
# Load the RADx-rad investigator table from the derived-data folder.
radx_investigators = pd.read_csv(os.path.join(DERIVED_DATA_PATH, "radx_investigators.csv"))

In [None]:
# Preview the investigator table; its "name" and "projectSerialNum" columns are used for matching below.
radx_investigators.head()

#### Fuzzy merge of publication authors with RADx investigators

In [None]:
# Fuzzy-match publication authors against RADx-rad investigators; projectSerialNum is
# passed as the left/right join key and the names as the fuzzy columns.
# The threshold was tuned by hand by checking the matches in the table below. The lowest
# score among the proper matches is 0.905882 (author Vanegas D), hence 0.905.
author_columns = ["author", "authorId", "aliases", "orcid", "projectSerialNum", "doi"]
investigator_columns = ["name", "projectSerialNum"]
radx_authors = utils.fuzzy_merge(
    authors[author_columns],
    radx_investigators[investigator_columns],
    left_fuzzy_on="author",
    right_fuzzy_on="name",
    left_on="projectSerialNum",
    right_on="projectSerialNum",
    how="inner",
    threshold=0.905,
)

In [None]:
# Summarise the fuzzy-match result, then show up to 300 rows for manual inspection.
matched_rows = radx_authors.shape[0]
matched_unique_authors = radx_authors["author"].nunique()
matched_unique_dois = radx_authors["doi"].nunique()
print(f"Matched authors       : {matched_rows}")
print(f"Matched unique authors: {matched_unique_authors}")
print(f"Number of unique DOIs : {matched_unique_dois}")
radx_authors.head(300)

In [None]:
# Spot-check the investigator matches found for the non-PubMed ICHI conference paper.
radx_authors.query("doi == 'doi:10.1109/ICHI54592.2022.00070'")

#### If there are multiple name matches for a publication, use the match with the highest score.

In [None]:
# Order rows from highest to lowest score so the best candidate for each (doi, match)
# pair comes first; drop_duplicates keeps that first row and discards the lower-scoring ones.
radx_authors = radx_authors.sort_values(by=["score", "doi", "match"], ascending=False)
radx_authors = radx_authors.drop_duplicates(subset=["doi", "match"])
remaining_rows = radx_authors.shape[0]
remaining_unique_authors = radx_authors["author"].nunique()
remaining_unique_dois = radx_authors["doi"].nunique()
print(f"Matched authors       : {remaining_rows}")
print(f"Matched unique authors: {remaining_unique_authors}")
print(f"Number of unique DOIs : {remaining_unique_dois}")
radx_authors.head()

#### Filter publications (RADx investigators only)

In [None]:
# A publication is "primary" when at least one of its authors matched a RADx-rad investigator.
primary_dois = radx_authors["doi"].drop_duplicates().tolist()
is_primary = raw_publications["doi"].isin(set(primary_dois))
# Work on an independent copy so later column assignments do not touch raw_publications.
primary_publications = raw_publications.loc[is_primary].copy().fillna("")
primary_publications.head()

## Create primary ```Publication``` node file for KG

In [None]:
# The DOI (kept in its "doi:..." form) is the node id.
primary_publications["id"] = primary_publications["doi"]
# DOIs in this table carry a "doi:" prefix (e.g. "doi:10.3390/bios12110938"). Strip it
# so the link has the canonical https://doi.org/10.xxxx/... form rather than
# https://doi.org/doi:10.xxxx/...
primary_publications["url"] = "https://doi.org/" + primary_publications["doi"].str.replace(r"^doi:", "", regex=True)
primary_publications["type"] = "primary"
# One node per DOI: a paper linked to several projects appears once in the node file.
primary_publications.drop_duplicates("id", inplace=True)

In [None]:
# Source column -> output column mapping handed to utils.rename_and_reorder_columns
# for the primary Publication node file.
publication_map = {
    "id": "id",
    "title": "name",
    "abstract": "abstract",
    "journal_name": "journal",
    "publication_year": "year",
    "type": "type",
    "doi": "doi",
    "pmId": "pmId",
    "pmcId": "pmcId",
    "url": "url",
}
primary_publications_kg = utils.rename_and_reorder_columns(primary_publications, publication_map)

In [None]:
# Write the primary Publication nodes to <KG_PATH>/nodes/Publication_primary.csv without the DataFrame index.
primary_publications_kg.to_csv(os.path.join(KG_PATH, "nodes", "Publication_primary.csv"), index=False)

### Create primary publication Author nodes

In [None]:
# All authors (investigators or not) of the primary publications.
primary_authors_all = authors[authors["doi"].isin(set(primary_dois))].copy()
# Remove periods from names (e.g. after initials). regex=False is required: when the
# pattern is treated as a regular expression (the default before pandas 2.0), "." matches
# every character and the whole name would be erased.
primary_authors_all["fullName"] = primary_authors_all["fullName"].str.replace(".", "", regex=False)
primary_authors_all.head()

In [None]:
#primary_authors[["authorId", "author", "fullName", "firstName", "middleName", "lastName", "doi"]].to_csv(os.path.join(DERIVED_DATA_PATH, "primary_authors_all.csv"), index=False)

## Create ```Publication-CITES-Publication``` relationship file for KG

In [None]:
# Fetch the papers citing each primary DOI and orient the edge as (citing) from -> to (cited).
citations = publication_query.get_citations(primary_dois)
citations.rename(columns={"doiCite": "from", "doi": "to"}, inplace=True)
# TODO where does the following inconsistency originate?
# WARNING: Error messages from data import:
# doi:10.47464/METROCIENCIA/VOL29/3/2021/5-10 (Publication-ID)-[CITES]->doi:10.1002/art.41616 (Publication-ID) referring to missing node doi:10.47464/METROCIENCIA/VOL29/3/2021/5-10
# Drop exactly that one edge. Testing `from != X` AND `to != Y` would instead remove every
# edge from X and every edge to Y, discarding all other citations of doi:10.1002/art.41616.
# NOTE(review): if the citing DOI turns out to cite other primary papers too, those edges
# would hit the same missing node; confirm against the import log.
is_broken_edge = (citations["from"] == "doi:10.47464/METROCIENCIA/VOL29/3/2021/5-10") & (citations["to"] == "doi:10.1002/art.41616")
citations = citations[~is_broken_edge]
citations.to_csv(os.path.join(KG_PATH, "relationships", "Publication-CITES-Publication.csv"), index=False)

## Get Secondary publications
Secondary publications are publications that cite a primary publication.

In [None]:
# Citing DOIs, minus the primary DOIs themselves (a primary publication may cite
# another primary publication and must not be listed as secondary).
citing_dois = set(citations["from"].unique())
secondary_dois = list(citing_dois.difference(primary_dois))
secondary_publications = publication_query.get_publication_info(secondary_dois)

In [None]:
# The DOI is the node id.
secondary_publications["id"] = secondary_publications["doi"]
# Build the canonical https://doi.org/10.xxxx/... link. The citation edges use
# "doi:"-prefixed identifiers, so these DOIs presumably carry the same prefix; strip it
# if present (the replacement is a no-op when there is no prefix).
secondary_publications["url"] = "https://doi.org/" + secondary_publications["doi"].str.replace(r"^doi:", "", regex=True)
secondary_publications["type"] = "secondary"
# One node per DOI.
secondary_publications.drop_duplicates("id", inplace=True)

In [None]:
# Source column -> output column mapping handed to utils.rename_and_reorder_columns
# for the secondary Publication node file.
citation_map = {
    "id": "id",
    "title": "name",
    "abstract": "abstract",
    "journal": "journal",
    "year": "year",
    "type": "type",
    "doi": "doi",
    "pmId": "pmId",
    "pmcId": "pmcId",
    "url": "url",
}
secondary_publications = utils.rename_and_reorder_columns(secondary_publications, citation_map)

In [None]:
# Report how many secondary (citing) publications were retrieved and preview them.
print("Number of secondary publications:", secondary_publications.shape[0])
secondary_publications.head()

In [None]:
# Write the secondary Publication nodes to <KG_PATH>/nodes/Publication_secondary.csv without the DataFrame index.
secondary_publications.to_csv(os.path.join(KG_PATH, "nodes", "Publication_secondary.csv"), index=False)

### Create list of primary Authors

In [None]:
# Inspect up to 100 of the de-duplicated investigator matches before building the author list.
radx_authors.head(100)

In [None]:
# Keep only the identifying columns of the matched authors; copy so later in-place edits leave radx_authors untouched.
primary_authors = radx_authors[["authorId", "author", "aliases", "projectSerialNum", "doi"]].copy()

In [None]:
# Remove duplicate rows and rows with any missing value, order by author name,
# then report the row count and display the table.
primary_authors = (
    primary_authors.drop_duplicates()
    .dropna()
    .sort_values("author")
)
print(primary_authors.shape[0])
primary_authors

In [None]:
# Save the matched investigator-authors. Uses the shared DERIVED_DATA_PATH constant
# (the same folder radx_investigators.csv is read from) instead of a second hard-coded copy of the path.
primary_authors.to_csv(os.path.join(DERIVED_DATA_PATH, "primary_authors.csv"), index=False)

## Create a list of primary authors who are not PIs or Study Investigators

In [None]:
# Distinct authorIds of the authors that matched a RADx-rad investigator.
primary_authors_list = list(primary_authors["authorId"].unique())

In [None]:
# authorIds that appear on primary publications but did not match any investigator.
primary_authors_all_list = primary_authors_all["authorId"].drop_duplicates().tolist()
primary_authors_other_list = set(primary_authors_all_list).difference(primary_authors_list)

In [None]:
# Rows of the non-investigator authors, reduced to the name/id columns, without
# duplicates or incomplete rows, ordered by author name.
other_author_columns = ["authorId", "author", "fullName", "firstName", "middleName", "lastName", "doi"]
is_other_author = primary_authors_all["authorId"].isin(primary_authors_other_list)
primary_authors_other = (
    primary_authors_all.loc[is_other_author, other_author_columns]
    .drop_duplicates()
    .dropna()
    .sort_values("author")
)

In [None]:
# Save the non-investigator authors. Uses the shared DERIVED_DATA_PATH constant
# instead of a second hard-coded copy of the derived-data folder.
primary_authors_other.to_csv(os.path.join(DERIVED_DATA_PATH, "primary_authors_other.csv"), index=False)
print(f"Number of other primary paper authors: {primary_authors_other.shape[0]}")
primary_authors_other

### Create list of secondary authors

In [None]:
# secondary_authors = publication_query.get_author_ids(secondary_dois)
# author_map = {"authorId": "authorId", "title": "name", "abstract": "abstract", "journal": "journal", "year": "year", "type": "type", "doi": "doi", "pmId": "pmId", "pmcId": "pmcId", "url": "url"}
# secondary_authors = utils.rename_and_reorder_columns(secondary_authors, author_map)
# secondary_authors.rename(columns={"name": "author"})
# secondary_authors.head()
#secondary_authors = authors[["authorId", "author", "aliases", "orcid", "projectSerialNum"]].copy()

In [None]:
#secondary_authors.to_csv(os.path.join("../derived_data/", "secondary_authors.csv"), index=False)