# Get Author info from papers
https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_get_paper_authors

https://api.semanticscholar.org/api-docs/graph#tag/Author-Data/operation/post_graph_get_authors

See also about PubMed unique author id:
https://pubmed.ncbi.nlm.nih.gov/34180522/
https://pubmed.ncbi.nlm.nih.gov/28960025/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5639053/
https://github.com/Daniel-Mietchen/ideas/issues/1260
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5530597/

In [1]:
import os
import requests
import json
import pandas as pd
import publication_query
import utils

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None)

In [2]:
KG_PATH = "../kg/data"
DERIVED_DATA_PATH = "../derived_data"

In [3]:
START_PUBLICATION_YEAR = 2021 # Grants started at the end of 2020.

## Get raw primary Publication data
These publications result from a grant number query in PubMed.

In [4]:
raw_publications = pd.read_csv("https://raw.githubusercontent.com/christian-horgan/pubmed-search/main/data/radx-rad/article_data_current.csv", dtype=str, keep_default_na=False)

In [5]:
non_pubmed_publications = pd.read_csv("../data/Publications_non_pubmed.csv", dtype=str, keep_default_na=False)
non_pubmed_publications.head()

Unnamed: 0,pm_id,title,pmc_id,authors,doi,keywords,mesh_ids,mesh_terms,abstract,project_serial_num,journal_name,publication_year
0,,IMI-CDE: an interactive interface for collabor...,,"Tao S,Chou WC,Li J,Du J,Ram PM,Abeysinghe R,Xu...",doi:10.1109/ICHI54592.2022.00070,"COVID-19,Common Data Element,CDE,Mapping,Data ...",,,The National Institute of Health (NIH) launche...,LM013755,2022 IEEE 10th International Conference on Hea...,2022


In [6]:
raw_publications = pd.concat([raw_publications, non_pubmed_publications])
raw_publications.fillna("", inplace=True)

In [7]:
# Restrict publications to 2021+ since RADx projects started at the end of 2020.
raw_publications["publication_year"] = raw_publications["publication_year"].astype(int)
raw_publications.query(f"publication_year >= {START_PUBLICATION_YEAR}", inplace=True)
# DOI is required as the primary key
raw_publications.dropna(subset="doi", inplace=True)
# convert property name to Neo4j convention
raw_publications.rename(columns={"pm_id": "pmId", "pmc_id": "pmcId", "project_serial_num": "projectSerialNum"}, inplace=True)

In [8]:
raw_publications.head()

Unnamed: 0,pmId,title,pmcId,authors,doi,keywords,mesh_ids,mesh_terms,abstract,projectSerialNum,journal_name,publication_year
0,37727605,Repurposing a SARS-CoV-2 surveillance program ...,PMC10505707,"['King KL', 'Ham R', 'Smothers A', 'Lee I', 'B...",doi:10.3389/fpubh.2023.1168551,"['SARS-CoV-2', 'flu', 'influenza', 'saliva', '...","['D006801', 'D000086402', 'D014495', 'D0000863...","['Humans', 'SARS-CoV-2', 'Universities', 'COVI...",Standard multiplex RT-qPCR diagnostic tests us...,AA029328,Frontiers in public health,2023
1,36595104,Ion-selective electrodes based on laser-induce...,,"['Soares RRA', 'Hjort RG', 'Pola CC', 'Jing D'...",doi:10.1007/s00604-022-05615-9,"['Electrochemical Sensors', 'Food additives', ...","['D006801', 'D017736', 'D006108', 'D009573', '...","['Humans', 'Ion-Selective Electrodes', 'Graphi...",Nitrite is an important food additive for cure...,AA029328,Mikrochimica acta,2023
2,36354449,An Experimental Framework for Developing Point...,PMC9688365,"['Ullah SF', 'Moreira G', 'Datta SPA', 'McLamo...",doi:10.3390/bios12110938,"['SARS-CoV-2', 'analytical sensing', 'binding ...","['D006801', 'D058266', 'D000086402', 'D0000863...","['Humans', 'Dielectric Spectroscopy', 'SARS-Co...",Biolayer interferometry (BLI) is a well-establ...,AA029328,Biosensors,2022
3,35992634,Development of a Biosensor Based on Angiotensi...,PMC9386735,"['Moreira G', 'Casso-Hartmann L', 'Datta SPA',...",doi:10.3389/fsens.2022.917380,"['LIG electrodes', 'attenuated virus', 'betaco...",[],[],Severe acute respiratory syndrome coronavirus ...,AA029328,Frontiers in sensors,2022
4,35785019,Aerosol-jet-printed graphene electrochemical i...,PMC9245948,"['Pola CC', 'Rangnekar SV', 'Sheets R', 'Szydl...",doi:10.1088/2053-1583/ac7339,"['COVID-19', 'aerosol jet printing', 'biosenso...",[],[],"Rapid, inexpensive, and easy-to-use coronaviru...",AA029328,2d materials,2022


In [9]:
raw_publications.tail()

Unnamed: 0,pmId,title,pmcId,authors,doi,keywords,mesh_ids,mesh_terms,abstract,projectSerialNum,journal_name,publication_year
400,35532905.0,Aptamer Sandwich Lateral Flow Assay (AptaFlow)...,PMC9112978,"['Yang LF', 'Kacherovsky N', 'Panpradist N', '...",doi:10.1021/acs.analchem.2c00554,[],"['D000914', 'D052157', 'D000086382', 'D0000866...","['Antibodies, Viral', 'Aptamers, Nucleotide', ...",The COVID-19 pandemic is among the greatest he...,LM013755,Analytical chemistry,2022
401,35132411.0,Outbreak.info Research Library: A standardized...,PMC8820656,"['Tsueng G', 'Mullen JL', 'Alkuzweny M', 'Cano...",doi:10.1101/2022.01.20.477133,[],[],[],"To combat the ongoing COVID-19 pandemic, scien...",LM013755,bioRxiv : the preprint server for biology,2022
402,34923447.0,Detecting model misconducts in decentralized h...,PMC10017272,"['Kuo TT', 'Pham A']",doi:10.1016/j.ijmedinf.2021.104658,"['Blockchain Distributed Ledger Technology', '...",[],[],To accelerate healthcare/genomic medicine rese...,LM013755,International journal of medical informatics,2021
403,34328683.0,Discovery and Characterization of Spike N-Term...,PMC8426805,"['Kacherovsky N', 'Yang LF', 'Dang HV', 'Cheng...",doi:10.1002/anie.202107730,"['SARS-CoV-2', 'aptamers', 'coronavirus', 'cry...","['D052157', 'D000086382', 'D004797', 'D006801'...","['Aptamers, Nucleotide', 'COVID-19', 'Enzyme-L...",The coronavirus disease 2019 (COVID-19) pandem...,LM013755,Angewandte Chemie (International ed. in English),2021
0,,IMI-CDE: an interactive interface for collabor...,,"Tao S,Chou WC,Li J,Du J,Ram PM,Abeysinghe R,Xu...",doi:10.1109/ICHI54592.2022.00070,"COVID-19,Common Data Element,CDE,Mapping,Data ...",,,The National Institute of Health (NIH) launche...,LM013755,2022 IEEE 10th International Conference on Hea...,2022


## Get author information for the primary citations

In [10]:
# Query the Semantic Scholar API to get author information for a list of DOIs
raw_dois = list(raw_publications["doi"].unique())
authors = publication_query.get_author_ids(raw_dois)
authors.head()

Number of mismatches: 13


Unnamed: 0,authorId,name,aliases,affiliations,paperCount,citationCount,hIndex,externalIds.DBLP,externalIds.ORCID,paperId,names,pmId,pmcId,doi
0,2154342066,Kylie L King,Kylie L King,[],5,44,4,,,8a26272b10779c0d0cedc6eb4e2630d906429658,"Kylie L King,nan,Kylie L King",37727605,10505707,doi:10.3389/fpubh.2023.1168551
1,2043300888,Rachel E. Ham,"Rachel E Ham, Rachel Ham",[],6,22,3,,,8a26272b10779c0d0cedc6eb4e2630d906429658,"Rachel E. Ham,nan,Rachel E Ham, Rachel Ham",37727605,10505707,doi:10.3389/fpubh.2023.1168551
2,2154336577,Austin R. Smothers,"Austin R Smothers, Austin Smothers",[],6,17,2,,,8a26272b10779c0d0cedc6eb4e2630d906429658,"Austin R. Smothers,nan,Austin R Smothers, Aust...",37727605,10505707,doi:10.3389/fpubh.2023.1168551
3,2237391763,Isaac Lee,,[],1,0,0,,,8a26272b10779c0d0cedc6eb4e2630d906429658,"Isaac Lee,nan,",37727605,10505707,doi:10.3389/fpubh.2023.1168551
4,2237321471,Tyler Bowie,,[],1,0,0,,,8a26272b10779c0d0cedc6eb4e2630d906429658,"Tyler Bowie,nan,",37727605,10505707,doi:10.3389/fpubh.2023.1168551


In [11]:
# authors.query("doi == 'doi:10.1097/PCC.0000000000002976'") # Authors with title in names
authors.query("doi == 'doi:10.1109/ICHI54592.2022.00070'")

Unnamed: 0,authorId,name,aliases,affiliations,paperCount,citationCount,hIndex,externalIds.DBLP,externalIds.ORCID,paperId,names,pmId,pmcId,doi
5860,2463102,Shiqiang Tao,,[],46,771,11,",Shiqiang Tao,",,f06d21eacb91120147bb8347fdeec956de1b6baf,"Shiqiang Tao,Shiqiang Tao,",,,doi:10.1109/ICHI54592.2022.00070
5861,2184490951,Wei–Chun Chou,,[],1,0,0,,,f06d21eacb91120147bb8347fdeec956de1b6baf,"Wei–Chun Chou,nan,",,,doi:10.1109/ICHI54592.2022.00070
5862,2117984733,Jianfu Li,,[],16,163,8,",Jianfu Li,",,f06d21eacb91120147bb8347fdeec956de1b6baf,"Jianfu Li,Jianfu Li,",,,doi:10.1109/ICHI54592.2022.00070
5863,1728324,Jingcheng Du,Jing-cheng Du,['The University of Texas Health Science Cente...,74,1447,18,",Jing-Cheng Du, Jingcheng Du,",0000-0002-0322-4566,f06d21eacb91120147bb8347fdeec956de1b6baf,"Jingcheng Du,Jing-Cheng Du, Jingcheng Du,Jing-...",,,doi:10.1109/ICHI54592.2022.00070
5864,2184494110,Pritham M Ram,Pritham Ram,[],2,1,1,,,f06d21eacb91120147bb8347fdeec956de1b6baf,"Pritham M Ram,nan,Pritham Ram",,,doi:10.1109/ICHI54592.2022.00070
5865,39601718,Rashmie Abeysinghe,,[],30,119,7,",Rashmie Abeysinghe,",,f06d21eacb91120147bb8347fdeec956de1b6baf,"Rashmie Abeysinghe,Rashmie Abeysinghe,",,,doi:10.1109/ICHI54592.2022.00070
5866,2219752599,Hua Xu,Hua Xu,[],236,10035,51,",Hua Xu, Hua Xu 0001,",,f06d21eacb91120147bb8347fdeec956de1b6baf,"Hua Xu,Hua Xu, Hua Xu 0001,Hua Xu",,,doi:10.1109/ICHI54592.2022.00070
5867,121147517,Xiaoqian Jiang,,[],89,878,16,",Xiaoqian Jiang,",,f06d21eacb91120147bb8347fdeec956de1b6baf,"Xiaoqian Jiang,Xiaoqian Jiang,",,,doi:10.1109/ICHI54592.2022.00070
5868,2184494479,Peter W Rose,Peter Rose,[],4,19,2,,,f06d21eacb91120147bb8347fdeec956de1b6baf,"Peter W Rose,nan,Peter Rose",,,doi:10.1109/ICHI54592.2022.00070
5869,1397958221,L. Ohno-Machado,"L Ohno-machado, L Ohno-machado, Lucila Ohno‐ma...",[],545,19167,61,",Lucila Ohno-Machado,",,f06d21eacb91120147bb8347fdeec956de1b6baf,"L. Ohno-Machado,Lucila Ohno-Machado,L Ohno-mac...",,,doi:10.1109/ICHI54592.2022.00070


In [12]:
publication_query.expand_name_column(authors, "name")
authors.rename(columns={"name": "author"}, inplace=True)
authors.rename(columns={"externalIds.ORCID": "orcid"}, inplace=True)
authors = raw_publications.merge(authors, on="doi")

In [13]:
authors.drop_duplicates(inplace=True)
authors.fillna("", inplace=True)
authors.replace("nan", "", inplace=True)
authors.rename(columns={"name": "author"}, inplace=True)

## Match publication authors with RADx-rad investigators
In this step, we eliminate non-RADx publications.

In [14]:
radx_investigators = pd.read_csv(os.path.join(DERIVED_DATA_PATH, "radx_investigators.csv"))

In [15]:
radx_investigators.head()

Unnamed: 0,dbgapAccession,coreProjectNum,studyInvestigator,profileId,projectSerialNum,isContactPi,fiscalYear,grantPi,fullName,firstName,middleName,lastName,isPi,isDepositor,name
0,phs002964.v1.p1,U01DC019579,Albers M,6625336.0,DC019579,True,2022.0,Albers MW,Mark W Albers,Mark,W,Albers,True,False,Albers MW
1,,R61HD105593,,2563052.0,HD105593,False,2022.0,Allen CE,Carl E Allen,Carl,E,Allen,True,False,Allen CE
2,phs002585.v1.p1,R61HD105593,Annapragada A,7039414.0,HD105593,True,2022.0,Annapragada AV,Ananth V Annapragada,Ananth,V,Annapragada,True,False,Annapragada AV
3,,U24LM013755,,10450719.0,LM013755,False,2023.0,Aronoff-Spencer ES,Eliah S Aronoff-Spencer,Eliah,S,Aronoff-Spencer,True,False,Aronoff-Spencer ES
4,,R33HD105594,,10320851.0,HD105594,False,2023.0,Bassiri H,Hamid Bassiri,Hamid,,Bassiri,True,False,Bassiri H


#### Fuzzy merge of publication authors with RADx investigators

In [16]:
# The threshold was manually adjusted by checking for proper matches in the table below. The lowest threshold is 0.905882 for author Vanegas D.
radx_authors = utils.fuzzy_merge(authors[["author", "authorId", "aliases", "orcid", "projectSerialNum", "doi"]], 
                                    radx_investigators[["name", "projectSerialNum"]], 
                                    left_fuzzy_on="author", right_fuzzy_on="name", 
                                    left_on="projectSerialNum", right_on="projectSerialNum", 
                                    how="inner", threshold=0.905)

In [17]:
print("Matched authors       :", radx_authors.shape[0])
print("Matched unique authors:", radx_authors["author"].nunique())
print("Number of unique DOIs :", radx_authors["doi"].nunique())
radx_authors.head(300)

Matched authors       : 303
Matched unique authors: 63
Number of unique DOIs : 205


Unnamed: 0,author,authorId,aliases,orcid,projectSerialNum,doi,match,score,name
0,Vanegas D,30717980,"Diana C Vanegas, D C Vanegas, Diana C Vanegas,...",,AA029328,doi:10.3390/bios12110938,Vanegas-Gamboa DC,0.905882,Vanegas-Gamboa DC
1,Vanegas D,30717980,"Diana C Vanegas, D C Vanegas, Diana C Vanegas,...",,AA029328,doi:10.3389/fsens.2022.917380,Vanegas-Gamboa DC,0.905882,Vanegas-Gamboa DC
2,Vanegas D,30717980,"Diana C Vanegas, D C Vanegas, Diana C Vanegas,...",,AA029328,doi:10.3390/bios12020101,Vanegas-Gamboa DC,0.905882,Vanegas-Gamboa DC
3,Cirrito J,5588350,"J R Cirrito, John R Cirrito, John Cirrito, Joh...",,AA029331,doi:10.1021/acssensors.3c00512,Cirrito JR,0.98,Cirrito JR
4,Cirrito J,5588350,"J R Cirrito, John R Cirrito, John Cirrito, Joh...",,AA029331,doi:10.1038/s41467-023-39419-z,Cirrito JR,0.98,Cirrito JR
5,Salaita K,49570843,"Khalid S Salaita, K Salaita, Khalid S Salaita,...",,AA029345,doi:10.1101/2023.02.27.530294,Salaita KS,0.98,Salaita KS
6,Salaita K,49570843,"Khalid S Salaita, K Salaita, Khalid S Salaita,...",,AA029345,doi:10.1038/s41565-022-01080-w,Salaita KS,0.98,Salaita KS
7,Salaita K,49570843,"Khalid S Salaita, K Salaita, Khalid S Salaita,...",,AA029345,doi:10.1002/adma.202006600,Salaita KS,0.98,Salaita KS
8,Salaita K,49570843,"Khalid S Salaita, K Salaita, Khalid S Salaita,...",,AA029345,doi:10.1002/anie.202107660,Salaita KS,0.98,Salaita KS
9,Pun S,2635677,"S\u2009h Pun, S H Pun, Suzie H Pun, Suzie Pun,...",,AA029316,doi:10.1039/d3sc00439b,Pun SH,0.966667,Pun SH


In [18]:
radx_authors.query("doi == 'doi:10.1109/ICHI54592.2022.00070'")

Unnamed: 0,author,authorId,aliases,orcid,projectSerialNum,doi,match,score,name
301,Xu H,2219752599,Hua Xu,,LM013755,doi:10.1109/ICHI54592.2022.00070,Xu H,1.0,Xu H
302,Ohno-Machado L,1397958221,"L Ohno-machado, L Ohno-machado, Lucila Ohno‐ma...",,LM013755,doi:10.1109/ICHI54592.2022.00070,Ohno-Machado L,1.0,Ohno-Machado L


#### If there are multiple name matches for a publication, use the match with the highest score.

In [19]:
# sort descending so the match with the highest score comes first.
radx_authors.sort_values(by=["score", "doi", "match"], ascending=False, inplace=True)
# keep only the first instance for each match, thus eliminating the lower-scoring matches.
radx_authors.drop_duplicates(subset=["doi", "match"], inplace=True)
print("Matched authors       :", radx_authors.shape[0])
print("Matched unique authors:", radx_authors["author"].nunique())
print("Number of unique DOIs :", radx_authors["doi"].nunique())
radx_authors.head()

Matched authors       : 303
Matched unique authors: 63
Number of unique DOIs : 205


Unnamed: 0,author,authorId,aliases,orcid,projectSerialNum,doi,match,score,name
232,Solo-Gabriele H,1398411589,"Helena Solo‐gabriele, Helena Maria Solo-gabrie...",,DA053941,doi:10.7171/jbt.21-3203-019,Solo-Gabriele H,1.0,Solo-Gabriele H
230,Solo-Gabriele H,1398411589,"Helena Solo‐gabriele, Helena Maria Solo-gabrie...",,DA053941,doi:10.7171/jbt.21-3203-017,Solo-Gabriele H,1.0,Solo-Gabriele H
224,Solo-Gabriele H,1398411589,"Helena Solo‐gabriele, Helena Maria Solo-gabrie...",,DA053941,doi:10.3390/w14081187,Solo-Gabriele H,1.0,Solo-Gabriele H
281,Varsani A,4462013,"A U Varsani, A Varsani, Arvind Varsani",,LM013129,doi:10.3390/v13091803,Varsani A,1.0,Varsani A
283,Scotch M,2161990,"M Scotch, Matthew L Scotch, Matthew Scotch",,LM013129,doi:10.3390/v13091803,Scotch M,1.0,Scotch M


#### Filter publications (RADx investigators only)

In [20]:
primary_dois = list(radx_authors["doi"].unique())
primary_publications = raw_publications[raw_publications["doi"].isin(set(primary_dois))].copy()
primary_publications.fillna("", inplace=True)
primary_publications.head()

Unnamed: 0,pmId,title,pmcId,authors,doi,keywords,mesh_ids,mesh_terms,abstract,projectSerialNum,journal_name,publication_year
2,36354449,An Experimental Framework for Developing Point...,PMC9688365,"['Ullah SF', 'Moreira G', 'Datta SPA', 'McLamo...",doi:10.3390/bios12110938,"['SARS-CoV-2', 'analytical sensing', 'binding ...","['D006801', 'D058266', 'D000086402', 'D0000863...","['Humans', 'Dielectric Spectroscopy', 'SARS-Co...",Biolayer interferometry (BLI) is a well-establ...,AA029328,Biosensors,2022
3,35992634,Development of a Biosensor Based on Angiotensi...,PMC9386735,"['Moreira G', 'Casso-Hartmann L', 'Datta SPA',...",doi:10.3389/fsens.2022.917380,"['LIG electrodes', 'attenuated virus', 'betaco...",[],[],Severe acute respiratory syndrome coronavirus ...,AA029328,Frontiers in sensors,2022
5,35200361,Context-Aware Diagnostic Specificity (CADS).,PMC8869940,"['McLamore ES', 'Moreira G', 'Vanegas DC', 'Da...",doi:10.3390/bios12020101,[],"['D006801', 'D015233', 'D011506', 'D012680']","['Humans', 'Models, Statistical', 'Proteins', ...",Rapid detection of proteins is critical in a v...,AA029328,Biosensors,2022
6,37498298,Rapid Direct Detection of SARS-CoV-2 Aerosols ...,PMC10463275,"['Ghumra DP', 'Shetty N', 'McBrearty KR', 'Put...",doi:10.1021/acssensors.3c00512,"['SARS-CoV-2', 'aerosol science', 'biosensors'...","['D006801', 'D000086402', 'D000086382', 'D0190...","['Humans', 'SARS-CoV-2', 'COVID-19', 'Point-of...",Airborne transmission via virus-laden aerosols...,AA029331,ACS sensors,2023
7,37429842,Real-time environmental surveillance of SARS-C...,PMC10333287,"['Puthussery JV', 'Ghumra DP', 'McBrearty KR',...",doi:10.1038/s41467-023-39419-z,[],"['D006801', 'D000086402', 'D000086382', 'D0588...","['Humans', 'SARS-CoV-2', 'COVID-19', 'Pandemic...",Real-time surveillance of airborne SARS-CoV-2 ...,AA029331,Nature communications,2023


## Create primary ```Publication``` node file for KG

In [21]:
primary_publications["id"] = primary_publications["doi"]
primary_publications["url"] = "https://doi.org/" + primary_publications["doi"]
primary_publications["type"] = "primary"
primary_publications.drop_duplicates("id", inplace=True)

In [22]:
publication_map = {"id": "id", "title": "name", "abstract": "abstract", "journal_name": "journal", "publication_year": "year", "type": "type", "doi": "doi", "pmId": "pmId", "pmcId": "pmcId", "url": "url"}
primary_publications_kg = utils.rename_and_reorder_columns(primary_publications, publication_map)

In [23]:
primary_publications_kg.to_csv(os.path.join(KG_PATH, "nodes", "Publication_primary.csv"), index=False)

### Create primary publication Author nodes

In [24]:
primary_authors_all = authors[authors["doi"].isin(set(primary_dois))].copy()
primary_authors_all["fullName"] = primary_authors_all["fullName"].str.replace(".", "")
primary_authors_all.head()

Unnamed: 0,pmId_x,title,pmcId_x,authors,doi,keywords,mesh_ids,mesh_terms,abstract,projectSerialNum,journal_name,publication_year,authorId,author,aliases,affiliations,paperCount,citationCount,hIndex,externalIds.DBLP,orcid,paperId,names,pmId_y,pmcId_y,fullName,firstName,middleName,lastName
15,36354449,An Experimental Framework for Developing Point...,PMC9688365,"['Ullah SF', 'Moreira G', 'Datta SPA', 'McLamo...",doi:10.3390/bios12110938,"['SARS-CoV-2', 'analytical sensing', 'binding ...","['D006801', 'D058266', 'D000086402', 'D0000863...","['Humans', 'Dielectric Spectroscopy', 'SARS-Co...",Biolayer interferometry (BLI) is a well-establ...,AA029328,Biosensors,2022,66615195,Ullah SF,"Sadia Fida Ullah, Sadia F Ullah",[],17,164,7,,,7100d14b3fd28ec25980f0fa3253d86fd33cfef9,"S. F. Ullah,nan,Sadia Fida Ullah, Sadia F Ullah",36354449,9688365,S F Ullah,S,F,Ullah
16,36354449,An Experimental Framework for Developing Point...,PMC9688365,"['Ullah SF', 'Moreira G', 'Datta SPA', 'McLamo...",doi:10.3390/bios12110938,"['SARS-CoV-2', 'analytical sensing', 'binding ...","['D006801', 'D058266', 'D000086402', 'D0000863...","['Humans', 'Dielectric Spectroscopy', 'SARS-Co...",Biolayer interferometry (BLI) is a well-establ...,AA029328,Biosensors,2022,14596380,Moreira G,"Geisianny Moreira, Geisianny Augusta Monteiro ...",[],13,66,4,,,7100d14b3fd28ec25980f0fa3253d86fd33cfef9,"G. Moreira,nan,Geisianny Moreira, Geisianny Au...",36354449,9688365,G Moreira,G,,Moreira
17,36354449,An Experimental Framework for Developing Point...,PMC9688365,"['Ullah SF', 'Moreira G', 'Datta SPA', 'McLamo...",doi:10.3390/bios12110938,"['SARS-CoV-2', 'analytical sensing', 'binding ...","['D006801', 'D058266', 'D000086402', 'D0000863...","['Humans', 'Dielectric Spectroscopy', 'SARS-Co...",Biolayer interferometry (BLI) is a well-establ...,AA029328,Biosensors,2022,5237978,Datta S,"Shoumen Datta, Shoumen P A Datta, Shoumen Pa D...",[],84,349,10,",Shoumen Datta, Shoumen Palit Austin Datta,",,7100d14b3fd28ec25980f0fa3253d86fd33cfef9,"S. Datta,Shoumen Datta, Shoumen Palit Austin D...",36354449,9688365,S Datta,S,,Datta
18,36354449,An Experimental Framework for Developing Point...,PMC9688365,"['Ullah SF', 'Moreira G', 'Datta SPA', 'McLamo...",doi:10.3390/bios12110938,"['SARS-CoV-2', 'analytical sensing', 'binding ...","['D006801', 'D058266', 'D000086402', 'D0000863...","['Humans', 'Dielectric Spectroscopy', 'SARS-Co...",Biolayer interferometry (BLI) is a well-establ...,AA029328,Biosensors,2022,5655264,McLamore E,"E S Mclamore, E S Mclamore, Ericz Mclamore, Er...",[],164,3088,30,",Eric S McLamore,",,7100d14b3fd28ec25980f0fa3253d86fd33cfef9,"E. McLamore,Eric S McLamore,E S Mclamore, E S ...",36354449,9688365,E McLamore,E,,McLamore
19,36354449,An Experimental Framework for Developing Point...,PMC9688365,"['Ullah SF', 'Moreira G', 'Datta SPA', 'McLamo...",doi:10.3390/bios12110938,"['SARS-CoV-2', 'analytical sensing', 'binding ...","['D006801', 'D058266', 'D000086402', 'D0000863...","['Humans', 'Dielectric Spectroscopy', 'SARS-Co...",Biolayer interferometry (BLI) is a well-establ...,AA029328,Biosensors,2022,30717980,Vanegas D,"Diana C Vanegas, D C Vanegas, Diana C Vanegas,...",[],41,660,14,",Diana Vanegas,",,7100d14b3fd28ec25980f0fa3253d86fd33cfef9,"D. Vanegas,Diana Vanegas,Diana C Vanegas, D C ...",36354449,9688365,D Vanegas,D,,Vanegas


In [25]:
#primary_authors[["authorId", "author", "fullName", "firstName", "middleName", "lastName", "doi"]].to_csv(os.path.join(DERIVED_DATA_PATH, "primary_authors_all.csv"), index=False)

## Create ```Publication-CITES-Publication``` relationship file for KG

In [26]:
citations = publication_query.get_citations(primary_dois)
citations.rename(columns={"doiCite": "from", "doi": "to"}, inplace=True)
# TODO where does the following inconsistency originate?
# WARNING: Error messages from data import:
# doi:10.47464/METROCIENCIA/VOL29/3/2021/5-10 (Publication-ID)-[CITES]->doi:10.1002/art.41616 (Publication-ID) referring to missing node doi:10.47464/METROCIENCIA/VOL29/3/2021/5-10
citations = citations[(citations["from"] != "doi:10.47464/METROCIENCIA/VOL29/3/2021/5-10") & (citations["to"] != "doi:10.1002/art.41616")]
citations.to_csv(os.path.join(KG_PATH, "relationships", "Publication-CITES-Publication.csv"), index=False)

## Get Secondary publications
Secondary publications are publications that cite the primary publication

In [27]:
secondary_dois = list(citations["from"].unique())
# remove any primary DOIs, e.g., a primary publication cites another primary publication.
secondary_dois = list(set(secondary_dois) - set(primary_dois))
secondary_publications = publication_query.get_publication_info(secondary_dois)

In [28]:
secondary_publications["id"] = secondary_publications["doi"]
secondary_publications["url"] = "https://doi.org/" + secondary_publications["doi"]
secondary_publications["type"] = "secondary"
secondary_publications.drop_duplicates("id", inplace=True)

In [29]:
citation_map = {"id": "id", "title": "name", "abstract": "abstract", "journal": "journal", "year": "year", "type": "type", "doi": "doi", "pmId": "pmId", "pmcId": "pmcId", "url": "url"}
secondary_publications = utils.rename_and_reorder_columns(secondary_publications, citation_map)

In [30]:
print("Number of secondary publications:", secondary_publications.shape[0])
secondary_publications.head()

Number of secondary publications: 1792


Unnamed: 0,id,name,abstract,journal,year,type,doi,pmId,pmcId,url
0,doi:10.1016/j.bioelechem.2022.108358,CdTe QDs-sensitized TiO2 nanocomposite for mag...,,"Bioelectrochemistry (Amsterdam, Netherlands)",2022,secondary,doi:10.1016/j.bioelechem.2022.108358,36580690.0,9783190.0,https://doi.org/doi:10.1016/j.bioelechem.2022....
1,doi:10.3201/eid2803.211972,Spatiotemporal Analyses of 2 Co-Circulating SA...,The emergence of novel severe acute respirator...,Emerging Infectious Diseases,2022,secondary,doi:10.3201/eid2803.211972,35133957.0,8888210.0,https://doi.org/doi:10.3201/eid2803.211972
2,doi:10.2196/44401,Thermometer-based fever surveillance and COVID...,,JMIR Public Health and Surveillance,2022,secondary,doi:10.2196/44401,,,https://doi.org/doi:10.2196/44401
3,doi:10.1101/2021.12.21.21268077,Quantitative detection of SARS-CoV-2 Omicron v...,"On November 26, 2021, the B.1.1.529 COVID-19 v...",,2021,secondary,doi:10.1101/2021.12.21.21268077,,,https://doi.org/doi:10.1101/2021.12.21.21268077
4,doi:10.1002/advs.202302816,Next-Generation Vitrimers Design through Theor...,Vitrimers are an innovative class of polymers ...,Advanced science,2023,secondary,doi:10.1002/advs.202302816,38058273.0,,https://doi.org/doi:10.1002/advs.202302816


In [31]:
secondary_publications.to_csv(os.path.join(KG_PATH, "nodes", "Publication_secondary.csv"), index=False)

### Create list of primary Authors

In [32]:
radx_authors.head(100)

Unnamed: 0,author,authorId,aliases,orcid,projectSerialNum,doi,match,score,name
232,Solo-Gabriele H,1398411589,"Helena Solo‐gabriele, Helena Maria Solo-gabrie...",,DA053941,doi:10.7171/jbt.21-3203-019,Solo-Gabriele H,1.0,Solo-Gabriele H
230,Solo-Gabriele H,1398411589,"Helena Solo‐gabriele, Helena Maria Solo-gabrie...",,DA053941,doi:10.7171/jbt.21-3203-017,Solo-Gabriele H,1.0,Solo-Gabriele H
224,Solo-Gabriele H,1398411589,"Helena Solo‐gabriele, Helena Maria Solo-gabrie...",,DA053941,doi:10.3390/w14081187,Solo-Gabriele H,1.0,Solo-Gabriele H
281,Varsani A,4462013,"A U Varsani, A Varsani, Arvind Varsani",,LM013129,doi:10.3390/v13091803,Varsani A,1.0,Varsani A
283,Scotch M,2161990,"M Scotch, Matthew L Scotch, Matthew Scotch",,LM013129,doi:10.3390/v13091803,Scotch M,1.0,Scotch M
247,Wenzel J,40530973,"J Wenzel, J Wenzel, Jeffrey D Wenzel, Jeff Wenzel",,DA053893,doi:10.3390/v13081647,Wenzel J,1.0,Wenzel J
290,Varsani A,4462013,"A U Varsani, A Varsani, Arvind Varsani",,LM013129,doi:10.3390/v13010074,Varsani A,1.0,Varsani A
292,Scotch M,2161990,"M Scotch, Matthew L Scotch, Matthew Scotch",,LM013129,doi:10.3390/v13010074,Scotch M,1.0,Scotch M
104,Devaraj S,2012755,"S Niranjali Devaraj, S N Devaraj, S Devaraj, S...",,HD105593,doi:10.3390/jcm12175435,Devaraj S,1.0,Devaraj S
16,Wang X,2144804604,,,AA029348,doi:10.3390/bios13020298,Wang X,1.0,Wang X


In [33]:
primary_authors = radx_authors[["authorId", "author", "aliases", "projectSerialNum", "doi"]].copy()

In [34]:
primary_authors.drop_duplicates(inplace=True)
primary_authors.dropna(inplace=True)
primary_authors.sort_values("author", inplace=True)
print(primary_authors.shape[0])
primary_authors

303


Unnamed: 0,authorId,author,aliases,projectSerialNum,doi
102,48292006,Allen C,"C Allen, Carl E Allen, Carl Allen, Carl E Alle...",HD105593,doi:10.3390/jcm12175435
103,3473746,Annapragada A,"Ananth V Annapragada, A Annapragada, A V Annap...",HD105593,doi:10.3390/jcm12175435
105,3473746,Annapragada A,"Ananth V Annapragada, A Annapragada, A V Annap...",HD105593,doi:10.1097/INF.0000000000003888
107,3473746,Annapragada A,"Ananth V Annapragada, A Annapragada, A V Annap...",HD105593,doi:10.1038/s41390-022-02108-6
113,3473746,Annapragada A,"Ananth V Annapragada, A Annapragada, A V Annap...",HD105593,doi:10.1016/j.jbi.2021.103818
117,10999669,Bassiri H,"H A Bassiri, Ha Bassiri, Hamid Bassiri",HD105594,doi:10.1161/JAHA.121.021428
115,10999669,Bassiri H,"H A Bassiri, Ha Bassiri, Hamid Bassiri",HD105594,doi:10.1002/art.42062
120,10999669,Bassiri H,"H A Bassiri, Ha Bassiri, Hamid Bassiri",HD105594,doi:10.1002/art.41616
116,10999669,Bassiri H,"H A Bassiri, Ha Bassiri, Hamid Bassiri",HD105594,doi:10.1038/s41467-021-27544-6
121,10999669,Bassiri H,"H A Bassiri, Ha Bassiri, Hamid Bassiri",HD105594,doi:10.1093/jpids/piaa161


In [35]:
primary_authors.to_csv(os.path.join("../derived_data/", "primary_authors.csv"), index=False)

## Create a list of primary authors who are not PIs or Study Investigators

In [36]:
primary_authors_list = list(primary_authors["authorId"].unique())

In [37]:
primary_authors_all_list = list(primary_authors_all["authorId"].unique())
primary_authors_other_list = set(primary_authors_all_list) - set(primary_authors_list)

In [38]:
primary_authors_other = primary_authors_all[primary_authors_all["authorId"].isin(primary_authors_other_list)]
primary_authors_other = primary_authors_other[["authorId", "author", "fullName", "firstName", "middleName", "lastName", "doi"]].copy()
primary_authors_other.drop_duplicates(inplace=True)
primary_authors_other.dropna(inplace=True)
primary_authors_other.sort_values("author", inplace=True)

In [39]:
primary_authors_other.to_csv(os.path.join("../derived_data/", "primary_authors_other.csv"), index=False)
print(f"Number of other primary paper authors: {primary_authors_other.shape[0]}")
primary_authors_other

Number of other primary paper authors: 3186


Unnamed: 0,authorId,author,fullName,firstName,middleName,lastName,doi
2464,,ONJEBAREMEABCFK,Olivia Neha Jordan E Brandi Anita Rachel Evera...,Olivia,NehaJordanEBrandiAnitaRachelEverardoMaryEAlmen...,,doi:10.1038/s41586-023-05949-1
4651,80827919.0,Abdullah N,Natasha Abdullah,Natasha,,Abdullah,doi:10.1016/j.cell.2021.05.002
3441,2186155180.0,Abe N,Naomi Abe,Naomi,,Abe,doi:10.1038/s41467-022-30357-w
3614,2186155180.0,Abe N,Naomi Abe,Naomi,,Abe,doi:10.1016/S2589-7500(22)00149-2
4230,6865020.0,Abedalthagafi M,M Abedalthagafi,M,,Abedalthagafi,doi:10.1038/s41592-022-01444-z
3702,2106780379.0,Abella BS,Benjamin S Abella,Benjamin,S,Abella,doi:10.3390/diagnostics13040707
4037,2106799969.0,Abelson S,S Abelson,S,,Abelson,doi:10.1101/2023.07.12.23292570
4071,2106799969.0,Abelson S,S Abelson,S,,Abelson,doi:10.1016/j.scitotenv.2023.164289
4095,2106799969.0,Abelson S,S Abelson,S,,Abelson,doi:10.1016/j.scitotenv.2023.161423
4164,2106799969.0,Abelson S,S Abelson,S,,Abelson,doi:10.1016/j.scitotenv.2022.159188


### Create list of secondary authors

In [40]:
# secondary_authors = publication_query.get_author_ids(secondary_dois)
# author_map = {"authorId": "authorId", "title": "name", "abstract": "abstract", "journal": "journal", "year": "year", "type": "type", "doi": "doi", "pmId": "pmId", "pmcId": "pmcId", "url": "url"}
# secondary_authors = utils.rename_and_reorder_columns(secondary_authors, author_map)
# secondary_authors.rename(columns={"name": "author"})
# secondary_authors.head()
#secondary_authors = authors[["authorId", "author", "aliases", "orcid", "projectSerialNum"]].copy()

In [41]:
#secondary_authors.to_csv(os.path.join("../derived_data/", "secondary_authors.csv"), index=False)