# get_researcher_info
This notebook creates researcher-related node and relationship files for the RADx-KG:
* Researcher_investigators.csv
* Researcher_primary_coauthors.csv
* Researcher-IS_INVESTIGATOR_OF-Grant.csv

In [1]:
import os
import pandas as pd
import shutil
import grant_query
import publication_query
import utils
# Show every column and every row when a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
#pd.set_option('display.max_colwidth', None)

In [2]:
# Root directory for the knowledge-graph CSV files; this notebook writes into
# its "nodes" and "relationships" subdirectories.
KG_PATH = "../kg/data"
# Directory of intermediate CSV files that this notebook reads and writes.
DERIVED_DATA_PATH = "../derived_data"

In [3]:
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
START_FISCAL_YEAR = 2021 # starts Oct. 1, 2020

In [4]:
# Load the project table with every column as a string (empty cells stay ""
# rather than NaN) and keep only the RADx-rad initiative.
projects_csv = os.path.join(DERIVED_DATA_PATH, "radx-projects.csv")
projects = (
    pd.read_csv(projects_csv, dtype=str, keep_default_na=False)
    .query("researchInitiative == 'RADx-rad'")
    .copy()
)

## Prepare fields for Study Investigator
The study investigator is the investigator assigned to a dbGaP study.

In [5]:
# Expand "studyInvestigator" into name parts. The helper's return value is not
# used; the columns it adds to `projects` (name, fullName, firstName,
# middleName, lastName) are consumed below.
publication_query.expand_name_column(projects, "studyInvestigator")

# Only rows with a dbGaP accession ("phs...") keep a study investigator; all
# other rows get an empty string. Vectorized instead of a row-wise apply.
has_dbgap_accession = projects["dbgapAccession"].str.startswith("phs")
projects["studyInvestigator"] = projects["name"].where(has_dbgap_accession, "")
projects.drop("name", axis=1, inplace=True)

# Prefix the name parts with "study" so they do not collide with the grant PI
# name columns that are merged in later.
projects.rename(
    columns={
        "fullName": "studyFullName",
        "firstName": "studyFirstName",
        "middleName": "studyMiddleName",
        "lastName": "studyLastName",
    },
    inplace=True,
)

In [6]:
# Report how many RADx-rad projects were loaded, then display the table.
n_projects = len(projects)
print(f"Number of projects {n_projects}")
projects

Number of projects 50


Unnamed: 0,researchInitiative,dbgapAccession,projectNum,coreProjectNum,projectSerialNum,studyInvestigator,studyTitle,subProject,studyFullName,studyFirstName,studyMiddleName,studyLastName
0,RADx-rad,phs002744.v1.p1,U01AA029345,U01AA029345,AA029345,Salaita K,Rapid Acceleration of Diagnostics - Radical (R...,Automatic Detection & Tracing,Khalid Salaita,Khalid,,Salaita
12,RADx-rad,phs002778.v1.p1,1U18TR003793-01,U18TR003793,TR003793,Stott S,Rapid Acceleration of Diagnostics - Radical (R...,Exosome,Shannon Stott,Shannon,,Stott
26,RADx-rad,phs002604.v1.p1,1U01DA053949-01,U01DA053949,DA053949,Uhlemann A,Rapid Acceleration of Diagnostics - RADICAL (R...,Wastewater,Anne-Catrin Uhlemann,Anne-Catrin,,Uhlemann
27,RADx-rad,phs002583.v1.p1,5U01HL150852-02,U01HL150852,HL150852,DeMauro EP,Rapid Acceleration of Diagnostics - Radical (...,Novel Biosensing and VOC,Edward P DeMauro,Edward,P,DeMauro
28,RADx-rad,phs002603.v1.p1,3R33HD105594-03S1,R33HD105594,HD105594,Odom-John A,Rapid Acceleration of Diagnostics - Radical (...,PreVAIL kIds,Audrey Odom-John,Audrey,,Odom-John
29,RADx-rad,phs002642.v1.p1,R01DC016112,R01DC016112,DC016112,Travers S,Rapid Acceleration of Diagnostics - Radical (R...,Chemosensory Testing,Susan Travers,Susan,,Travers
30,RADx-rad,phs002702.v1.p1,1R44DE030842-01,R44DE030842,DE030842,Ly J,Rapid Acceleration of Diagnostics - Radical (R...,Novel Biosensing and VOC,Jeffrey Ly,Jeffrey,,Ly
31,RADx-rad,phs002685.v1.p1,1R44DE030852-01,R44DE030852,DE030852,Yao X,Rapid Acceleration of Diagnostics - Radical (R...,Novel Biosensing and VOC,Xiaohu Yao,Xiaohu,,Yao
32,RADx-rad,phs003124.v1.p1,1U18TR003780-01,U18TR003780,TR003780,Das S,Rapid Acceleration of Diagnostics - Radical (R...,Exosome,Samarjit Das,Samarjit,,Das
33,RADx-rad,phs002729.v1.p1,1U01DA053899-01,U01DA053899,DA053899,Noble R,Rapid Acceleration of Diagnostics - Radical (R...,Wastewater,Rachel Noble,Rachel,,Noble


## Get Principal Investigators for Grants
Grant PIs are the investigators that are assigned to a grant in NIH Reporter. One of the investigators is designated as the contact PI.

In [7]:
# Look up the principal investigators for every distinct core project number.
project_num = list(projects["coreProjectNum"].unique())
grant_pis = grant_query.get_principal_investigators(project_num)

# Sort the newest fiscal year first so that de-duplication keeps each PI's most
# recent entry.
# NOTE(review): de-duplicating on profileId alone keeps a single row per PI, so
# a PI listed on several core projects retains only one of them — confirm this
# is intended.
grant_pis = (
    grant_pis
    .sort_values("fiscalYear", ascending=False)
    .drop_duplicates("profileId")
    .rename(columns={"name": "grantPi"})
)
print("Number of PIs:", grant_pis["grantPi"].nunique())
print(grant_pis.shape[0])
grant_pis.head()

Number of PIs: 100
100


Unnamed: 0,profileId,coreProjectNum,projectSerialNum,isContactPi,fiscalYear,grantPi,fullName,firstName,middleName,lastName
0,1877373,U01DC019578,DC019578,True,2023,Dalton PH,Pamela Helen Dalton,Pamela,Helen,Dalton
12,9942278,R01DK130067,DK130067,False,2023,Kotanko P,Peter Kotanko,Peter,,Kotanko
1,16392635,U01DC019578,DC019578,False,2023,Parma V,Valentina Parma,Valentina,,Parma
22,7086603,U01HL152410,HL152410,False,2023,Grant SA,SHEILA Ann GRANT,Sheila,Ann,Grant
21,1897028,U01HL152410,HL152410,True,2023,Fay WP,William P Fay,William,P,Fay


In [8]:
# Merge dbGaP with Grant data
# Keep only projects that have a study investigator
# (Otakuye Conroy-Ben and Lucila Ohno-Machado are not study investigators).
projects.query("studyInvestigator != ''", inplace=True)

study_columns = [
    "dbgapAccession",
    "coreProjectNum",
    "studyInvestigator",
    "studyFirstName",
    "studyMiddleName",
    "studyLastName",
]
# Exact match on coreProjectNum combined with a fuzzy match of the investigator names.
dbgap_to_grant = utils.fuzzy_merge(
    projects[study_columns],
    grant_pis,
    left_fuzzy_on="studyInvestigator",
    right_fuzzy_on="grantPi",
    left_on="coreProjectNum",
    right_on="coreProjectNum",
    how="outer",
    threshold=0.9,
)

# Discard rows that carry neither a study investigator nor a grant PI.
has_study_investigator = dbgap_to_grant["studyInvestigator"] != ""
has_grant_pi = dbgap_to_grant["grantPi"] != ""
dbgap_to_grant = dbgap_to_grant[has_study_investigator | has_grant_pi]
dbgap_to_grant.head()

Unnamed: 0,dbgapAccession,coreProjectNum,studyInvestigator,studyFirstName,studyMiddleName,studyLastName,match,score,profileId,projectSerialNum,isContactPi,fiscalYear,grantPi,fullName,firstName,middleName,lastName
0,phs002572.v1.p1,R42DE030832,Gordon T,Timothy,,Gordon,,0.0,,,,,,,,,
1,phs002542.v1.p1,U01DA053903,Keck JW,James,W,Keck,,0.0,,,,,,,,,
2,phs002583.v1.p1,U01HL150852,DeMauro EP,Edward,P,DeMauro,,0.0,,,,,,,,,
3,phs002924.v1.p1,U01HL152401,MacKenzie D,Devin,,MacKenzie,,0.0,,,,,,,,,
4,phs002522.v1.p1,U01HL152410,Huang J,Jie,,Huang,,0.0,,,,,,,,,


## Find NIH profileIds for investigators that are not grant PIs on the grants above

In [9]:
# Rows with an empty profileId are study investigators that were not matched
# to a grant PI in the merge above.
dbgap_to_grant_no_profile_id = dbgap_to_grant[dbgap_to_grant["profileId"] == ""].copy()

study_first = dbgap_to_grant_no_profile_id["studyFirstName"]
study_middle = dbgap_to_grant_no_profile_id["studyMiddleName"]
study_last = dbgap_to_grant_no_profile_id["studyLastName"]

# "Last, First Middle": key for the fuzzy match against the name-search results.
dbgap_to_grant_no_profile_id["studyMatchName"] = (
    study_last + ", " + study_first + " " + study_middle
).str.strip()
# "First Middle Last": name passed to the name search; an empty middle name
# leaves a double space, which is collapsed.
dbgap_to_grant_no_profile_id["queryName"] = (
    study_first + " " + study_middle + " " + study_last
).str.replace("  ", " ")
dbgap_to_grant_no_profile_id

Unnamed: 0,dbgapAccession,coreProjectNum,studyInvestigator,studyFirstName,studyMiddleName,studyLastName,match,score,profileId,projectSerialNum,isContactPi,fiscalYear,grantPi,fullName,firstName,middleName,lastName,studyMatchName,queryName
0,phs002572.v1.p1,R42DE030832,Gordon T,Timothy,,Gordon,,0.0,,,,,,,,,,"Gordon, Timothy",Timothy Gordon
1,phs002542.v1.p1,U01DA053903,Keck JW,James,W,Keck,,0.0,,,,,,,,,,"Keck, James W",James W Keck
2,phs002583.v1.p1,U01HL150852,DeMauro EP,Edward,P,DeMauro,,0.0,,,,,,,,,,"DeMauro, Edward P",Edward P DeMauro
3,phs002924.v1.p1,U01HL152401,MacKenzie D,Devin,,MacKenzie,,0.0,,,,,,,,,,"MacKenzie, Devin",Devin MacKenzie
4,phs002522.v1.p1,U01HL152410,Huang J,Jie,,Huang,,0.0,,,,,,,,,,"Huang, Jie",Jie Huang
5,phs002561.v1.p1,U54HL119145,Shafiee H,Hadi,,Shafiee,,0.0,,,,,,,,,,"Shafiee, Hadi",Hadi Shafiee
6,phs002602.v1.p1,U54HL119145,Unlu S,Selim,,Unlu,,0.0,,,,,,,,,,"Unlu, Selim",Selim Unlu


In [10]:
# Search for PI records by full name for the investigators without a profileId.
investigators_no_profile_id = list(dbgap_to_grant_no_profile_id["queryName"])
print(investigators_no_profile_id)
investigators_profile = grant_query.get_principal_investigators_by_name(
    investigators_no_profile_id
)

['Timothy Gordon', 'James W Keck', 'Edward P DeMauro', 'Devin MacKenzie', 'Jie Huang', 'Hadi Shafiee', 'Selim Unlu']


In [11]:
# Devin MacKenzie: https://www.semanticscholar.org/author/Devin-MacKenzie/2183276053
# Edward P DeMauro: https://www.semanticscholar.org/author/E.-P.-DeMauro/93475563

In [12]:
# Preview the PI records returned by the name search.
investigators_profile.head()

Unnamed: 0,profileId,coreProjectNum,projectSerialNum,isContactPi,fiscalYear,name,fullName,firstName,middleName,lastName
0,9306964,R01DE027738,DE027738,False,2022,Chan TA,Timothy An-thy Chan,Timothy,An-thy,Chan
1,8725622,R01DE027738,DE027738,False,2022,Ho AL,Alan L Ho,Alan,L,Ho
2,10512020,R01DE027738,DE027738,True,2022,Morris LG,Luc Gordon Trang Morris,Luc,Gordon Trang,Morris
3,9306964,R01DE027738,DE027738,False,2021,Chan TA,Timothy An-thy Chan,Timothy,An-thy,Chan
4,8725622,R01DE027738,DE027738,False,2021,Ho AL,Alan L Ho,Alan,L,Ho


In [13]:
# Reduce the name-search results to one row per distinct profile/name combination.
investigators_profile = (
    investigators_profile[["profileId", "name", "firstName", "middleName", "lastName"]]
    .copy()
    .drop_duplicates()
)

# Build the "Last, First Middle" key used for the fuzzy match against
# studyMatchName. studyMatchName is stripped, so this key is stripped as well:
# without it an empty middle name leaves a trailing space ("Last, First ") and
# the two keys are not built the same way.
investigators_profile["matchName"] = (
    investigators_profile["lastName"]
    + ", "
    + investigators_profile["firstName"]
    + " "
    + investigators_profile["middleName"]
).str.replace("  ", " ").str.strip()

In [14]:
# Preview the de-duplicated profiles together with the new matchName column.
investigators_profile.head()

Unnamed: 0,profileId,name,firstName,middleName,lastName,matchName
0,9306964,Chan TA,Timothy,An-thy,Chan,"Chan, Timothy An-thy"
1,8725622,Ho AL,Alan,L,Ho,"Ho, Alan L"
2,10512020,Morris LG,Luc,Gordon Trang,Morris,"Morris, Luc Gordon Trang"
12,9356398,Davison IG,Ian,Gordon,Davison,"Davison, Ian Gordon"
13,9732516,Gardner TJ,Timothy,James,Gardner,"Gardner, Timothy James"


In [15]:
# Remove the match/score/profileId columns left over from the first merge;
# the next fuzzy merge supplies these columns again.
dbgap_to_grant_no_profile_id = dbgap_to_grant_no_profile_id.drop(
    columns=["match", "score", "profileId"]
)
dbgap_to_grant_no_profile_id.head(20)

Unnamed: 0,dbgapAccession,coreProjectNum,studyInvestigator,studyFirstName,studyMiddleName,studyLastName,projectSerialNum,isContactPi,fiscalYear,grantPi,fullName,firstName,middleName,lastName,studyMatchName,queryName
0,phs002572.v1.p1,R42DE030832,Gordon T,Timothy,,Gordon,,,,,,,,,"Gordon, Timothy",Timothy Gordon
1,phs002542.v1.p1,U01DA053903,Keck JW,James,W,Keck,,,,,,,,,"Keck, James W",James W Keck
2,phs002583.v1.p1,U01HL150852,DeMauro EP,Edward,P,DeMauro,,,,,,,,,"DeMauro, Edward P",Edward P DeMauro
3,phs002924.v1.p1,U01HL152401,MacKenzie D,Devin,,MacKenzie,,,,,,,,,"MacKenzie, Devin",Devin MacKenzie
4,phs002522.v1.p1,U01HL152410,Huang J,Jie,,Huang,,,,,,,,,"Huang, Jie",Jie Huang
5,phs002561.v1.p1,U54HL119145,Shafiee H,Hadi,,Shafiee,,,,,,,,,"Shafiee, Hadi",Hadi Shafiee
6,phs002602.v1.p1,U54HL119145,Unlu S,Selim,,Unlu,,,,,,,,,"Unlu, Selim",Selim Unlu


In [16]:
# Fuzzy-match the unmatched study investigators ("Last, First Middle") against
# the name-search profiles; unmatched study investigators are kept (left merge).
other_investigators = utils.fuzzy_merge(
    dbgap_to_grant_no_profile_id,
    investigators_profile,
    left_fuzzy_on="studyMatchName",
    right_fuzzy_on="matchName",
    how="left",
    threshold=0.90,
)

In [17]:
# Display the study investigators with the profile (if any) matched by name.
other_investigators

Unnamed: 0,dbgapAccession,coreProjectNum,studyInvestigator,studyFirstName,studyMiddleName,studyLastName,projectSerialNum,isContactPi,fiscalYear,grantPi,fullName,firstName_x,middleName_x,lastName_x,studyMatchName,queryName,match,score,profileId,name,firstName_y,middleName_y,lastName_y,matchName
0,phs002572.v1.p1,R42DE030832,Gordon T,Timothy,,Gordon,,,,,,,,,"Gordon, Timothy",Timothy Gordon,,0.0,,,,,,
1,phs002542.v1.p1,U01DA053903,Keck JW,James,W,Keck,,,,,,,,,"Keck, James W",James W Keck,,0.0,,,,,,
2,phs002583.v1.p1,U01HL150852,DeMauro EP,Edward,P,DeMauro,,,,,,,,,"DeMauro, Edward P",Edward P DeMauro,,0.0,,,,,,
3,phs002924.v1.p1,U01HL152401,MacKenzie D,Devin,,MacKenzie,,,,,,,,,"MacKenzie, Devin",Devin MacKenzie,,0.0,,,,,,
4,phs002522.v1.p1,U01HL152410,Huang J,Jie,,Huang,,,,,,,,,"Huang, Jie",Jie Huang,,0.0,,,,,,
5,phs002561.v1.p1,U54HL119145,Shafiee H,Hadi,,Shafiee,,,,,,,,,"Shafiee, Hadi",Hadi Shafiee,,0.0,,,,,,
6,phs002602.v1.p1,U54HL119145,Unlu S,Selim,,Unlu,,,,,,,,,"Unlu, Selim",Selim Unlu,"Unlu, M Selim",0.932867,8139237.0,Unlu MS,M,Selim,Unlu,"Unlu, M Selim"


In [18]:
# Use the dbGaP study record's name parts as the investigator's name.
# A bfill(axis=1) over a single-column frame has nothing to fill from, so the
# previous "coalesce" was equivalent to this direct assignment.
# NOTE(review): this overwrites the name parts returned by the NIH name search
# (firstName_y, middleName_y, lastName_y) even when a profile was matched —
# confirm that the study name should always win.
other_investigators["firstName_y"] = other_investigators["studyFirstName"]
other_investigators["middleName_y"] = other_investigators["studyMiddleName"]
other_investigators["lastName_y"] = other_investigators["studyLastName"]

In [19]:
# Source column -> output column, in the order the columns should appear.
other_investigators_map = {
    "profileId": "profileId",
    "coreProjectNum": "coreProjectNum",
    "isContactPi": "isContactPi",
    "grantPi": "grantPi",
    "firstName_y": "firstName",
    "middleName_y": "middleName",
    "lastName_y": "lastName",
    "dbgapAccession": "dbgapAccession",
    "studyInvestigator": "studyInvestigator",
    "studyFirstName": "studyFirstName",
}
other_investigators = publication_query.rename_and_reorder_columns(
    other_investigators, other_investigators_map
)
# Represent missing values as empty strings.
other_investigators = other_investigators.fillna("")

In [20]:
# Display the renamed and reordered table of study investigators that are not grant PIs.
other_investigators

Unnamed: 0,profileId,coreProjectNum,isContactPi,grantPi,firstName,middleName,lastName,dbgapAccession,studyInvestigator,studyFirstName
0,,R42DE030832,,,Timothy,,Gordon,phs002572.v1.p1,Gordon T,Timothy
1,,U01DA053903,,,James,W,Keck,phs002542.v1.p1,Keck JW,James
2,,U01HL150852,,,Edward,P,DeMauro,phs002583.v1.p1,DeMauro EP,Edward
3,,U01HL152401,,,Devin,,MacKenzie,phs002924.v1.p1,MacKenzie D,Devin
4,,U01HL152410,,,Jie,,Huang,phs002522.v1.p1,Huang J,Jie
5,,U54HL119145,,,Hadi,,Shafiee,phs002561.v1.p1,Shafiee H,Hadi
6,8139237.0,U54HL119145,,,Selim,,Unlu,phs002602.v1.p1,Unlu S,Selim


In [21]:
# Keep only rows that have a grant PI, restricted to the columns shared with
# other_investigators.
grant_columns = [
    "profileId",
    "coreProjectNum",
    "isContactPi",
    "grantPi",
    "firstName",
    "middleName",
    "lastName",
    "dbgapAccession",
    "studyInvestigator",
]
dbgap_to_grant = dbgap_to_grant.loc[dbgap_to_grant["grantPi"] != "", grant_columns].copy()

In [22]:
# Preview the grant PIs with their dbGaP study assignment, where one exists.
dbgap_to_grant.head()

Unnamed: 0,profileId,coreProjectNum,isContactPi,grantPi,firstName,middleName,lastName,dbgapAccession,studyInvestigator
7,6625336,U01DC019579,True,Albers MW,Mark,W,Albers,phs002964.v1.p1,Albers M
8,2563052,R61HD105593,False,Allen CE,Carl,E,Allen,,
9,7039414,R61HD105593,True,Annapragada AV,Ananth,V,Annapragada,phs002585.v1.p1,Annapragada A
10,10450719,U24LM013755,False,Aronoff-Spencer ES,Eliah,S,Aronoff-Spencer,,
11,10320851,R33HD105594,False,Bassiri H,Hamid,,Bassiri,,


In [23]:
# Stack the grant PIs and the study investigators that are not grant PIs.
investigators = pd.concat([dbgap_to_grant, other_investigators])

# Short name: last name followed by the first and middle initials, e.g. "Albers MW".
first_initial = investigators["firstName"].str[:1]
middle_initial = investigators["middleName"].str[:1]
investigators["name"] = investigators["lastName"] + " " + first_initial + middle_initial

# Columns present in only one of the two inputs are NaN after the concat.
investigators = investigators.fillna("")
investigators.head()

Unnamed: 0,profileId,coreProjectNum,isContactPi,grantPi,firstName,middleName,lastName,dbgapAccession,studyInvestigator,studyFirstName,name
7,6625336,U01DC019579,True,Albers MW,Mark,W,Albers,phs002964.v1.p1,Albers M,,Albers MW
8,2563052,R61HD105593,False,Allen CE,Carl,E,Allen,,,,Allen CE
9,7039414,R61HD105593,True,Annapragada AV,Ananth,V,Annapragada,phs002585.v1.p1,Annapragada A,,Annapragada AV
10,10450719,U24LM013755,False,Aronoff-Spencer ES,Eliah,S,Aronoff-Spencer,,,,Aronoff-Spencer ES
11,10320851,R33HD105594,False,Bassiri H,Hamid,,Bassiri,,,,Bassiri H


In [24]:
# Save the grant PIs as a derived data file.
# NOTE(review): the DataFrame index is written as well; the other to_csv calls
# in this notebook pass index=False — confirm which form readers of
# grant_pis.csv expect.
grant_pis_csv = os.path.join(DERIVED_DATA_PATH, "grant_pis.csv")
grant_pis.to_csv(grant_pis_csv)

## Merge with ORCID ids

In [25]:
# Manually curated list of ORCID iDs for grant and study investigators,
# read from the CSV export of a Google Sheet (empty cells stay "").
orcid_sheet_url = "https://docs.google.com/spreadsheets/d/1NrscSutI50QdiW2_Z9u5_TTcQZibMkJssLQJxTAFaI8/export?format=csv"
orcid = pd.read_csv(orcid_sheet_url, keep_default_na=False)

In [26]:
# Expand "principal_investigator" into name-part columns on `orcid`; the return
# value is not used (the preview that follows shows name, fullName, firstName,
# middleName, lastName).
publication_query.expand_name_column(orcid, "principal_investigator")

In [27]:
# Preview the ORCID table with the expanded name columns.
orcid.head()

Unnamed: 0,research_initiative,principal_investigator,orcid_id,orcid_note,name,fullName,firstName,middleName,lastName
0,RADx-rad,Douglas Bell,https://orcid.org/0000-0001-7700-0840,,Bell D,Douglas Bell,Douglas,,Bell
1,RADx-rad,Lucila Ohno-Machado,https://orcid.org/0000-0002-8005-7327,,Ohno-Machado L,Lucila Ohno-Machado,Lucila,,Ohno-Machado
2,RADx-rad,Hua Xu,https://orcid.org/0000-0002-5274-4672,,Xu H,Hua Xu,Hua,,Xu
3,RADx-rad,Shannon Stott,https://orcid.org/0000-0002-0349-0522,,Stott S,Shannon Stott,Shannon,,Stott
4,RADx-rad,Yong Kim,https://orcid.org/0000-0001-7224-0503,,Kim Y,Yong Kim,Yong,,Kim


In [28]:
# Rename the short name so it does not collide with investigators["name"].
orcid = orcid.rename(columns={"name": "orcidName"})

# Attach ORCID iDs by fuzzy-matching the short names ("Last FM").
investigators = utils.fuzzy_merge(
    investigators,
    orcid[["orcid_id", "orcidName"]],
    left_fuzzy_on="name",
    right_fuzzy_on="orcidName",
    how="outer",
    threshold=0.9,
)
# The match bookkeeping columns are not needed downstream.
investigators = investigators.drop(columns=["match", "score"])

In [29]:
# Preview the investigators after the ORCID merge.
investigators.head()

Unnamed: 0,profileId,coreProjectNum,isContactPi,grantPi,firstName,middleName,lastName,dbgapAccession,studyInvestigator,studyFirstName,name,orcid_id,orcidName
0,14135419,R61HD105618,False,De Vlaminck I,Iwijn,,De Vlaminck,,,,De Vlaminck I,,
1,1880742,U01HL150852,False,Kohn JB,Joachim,B,Kohn,,,,Kohn JB,,
2,9198778,R33HD105594,True,Odom John AR,Audrey,Ragan,Odom John,phs002603.v1.p1,Odom-John A,,Odom John AR,,
3,6625336,U01DC019579,True,Albers MW,Mark,W,Albers,phs002964.v1.p1,Albers M,,Albers MW,https://orcid.org/0000-0001-7855-3455,Albers M
4,2563052,R61HD105593,False,Allen CE,Carl,E,Allen,,,,Allen CE,https://orcid.org/0000-0002-6625-739X,Allen C


In [30]:
# merge PIs with primary publication authors
# Load the primary authors as strings (empty cells stay "") ordered by author name.
primary_authors_csv = os.path.join(DERIVED_DATA_PATH, "primary_authors.csv")
primary_authors = pd.read_csv(
    primary_authors_csv, dtype=str, keep_default_na=False
).sort_values("author")

In [31]:
# Preview the primary publication authors (one row per author and DOI).
primary_authors.head()

Unnamed: 0,authorId,author,aliases,projectSerialNum,doi
0,48292006,Allen C,"C Allen, Carl E Allen, Carl Allen, Carl E Alle...",HD105593,doi:10.3390/jcm12175435
1,3473746,Annapragada A,"Ananth V Annapragada, A Annapragada, A V Annap...",HD105593,doi:10.3390/jcm12175435
2,3473746,Annapragada A,"Ananth V Annapragada, A Annapragada, A V Annap...",HD105593,doi:10.1097/INF.0000000000003888
3,3473746,Annapragada A,"Ananth V Annapragada, A Annapragada, A V Annap...",HD105593,doi:10.1038/s41390-022-02108-6
4,3473746,Annapragada A,"Ananth V Annapragada, A Annapragada, A V Annap...",HD105593,doi:10.1016/j.jbi.2021.103818


In [32]:
# Preview the investigators again before merging with the primary authors.
investigators.head()

Unnamed: 0,profileId,coreProjectNum,isContactPi,grantPi,firstName,middleName,lastName,dbgapAccession,studyInvestigator,studyFirstName,name,orcid_id,orcidName
0,14135419,R61HD105618,False,De Vlaminck I,Iwijn,,De Vlaminck,,,,De Vlaminck I,,
1,1880742,U01HL150852,False,Kohn JB,Joachim,B,Kohn,,,,Kohn JB,,
2,9198778,R33HD105594,True,Odom John AR,Audrey,Ragan,Odom John,phs002603.v1.p1,Odom-John A,,Odom John AR,,
3,6625336,U01DC019579,True,Albers MW,Mark,W,Albers,phs002964.v1.p1,Albers M,,Albers MW,https://orcid.org/0000-0001-7855-3455,Albers M
4,2563052,R61HD105593,False,Allen CE,Carl,E,Allen,,,,Allen CE,https://orcid.org/0000-0002-6625-739X,Allen C


In [33]:
 # De Vlaminck" I" doesn't match
# Open note: split the ORCID file into first/middle/last name.
# Attach Semantic Scholar author ids by fuzzy-matching the short names, then
# remove exact duplicate rows.
investigators = utils.fuzzy_merge2(
    investigators,
    primary_authors[["authorId", "author"]],
    left_fuzzy_on="name",
    right_fuzzy_on="author",
    how="outer",
    threshold=0.9,
).drop_duplicates()

In [34]:
# remove mismatches, e.g. Annapragada PA vs. Annapragada A (first initial mismatch)
# Rows without a core project number are dropped.
has_core_project = investigators["coreProjectNum"] != ""
investigators = investigators[has_core_project].copy()

In [35]:
# assign CURIEs
# Normalize profileId to a plain string first. A profile id picked up through
# the left merge with the name-search results can appear as 8139237.0; without
# normalization that produces a CURIE different from the same profile written
# as "8139237" (or a TypeError when the value is a float rather than a string).
profile_id = (
    investigators["profileId"]
    .fillna("")
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
)
# Prefix only non-empty ids that are not CURIEs yet, so re-running this cell
# does not produce "profileid:profileid:...".
needs_prefix = (profile_id != "") & ~profile_id.str.startswith("profileid:")
investigators["profileId"] = profile_id.mask(needs_prefix, "profileid:" + profile_id)

# regex=False: the URL prefix is a literal string, not a pattern.
investigators["orcid"] = investigators["orcid_id"].str.replace(
    "https://orcid.org/", "orcid:", regex=False
)

# assign profileId as primary key, if present, otherwise assign the orcid
investigators["id"] = investigators["profileId"].where(
    investigators["profileId"] != "", investigators["orcid"]
)

# assign full name; collapse the double space left by an empty middle name and
# trim the edges in case the first or last name is empty
investigators["fullName"] = (
    investigators["firstName"]
    + " "
    + investigators["middleName"]
    + " "
    + investigators["lastName"]
).str.replace("  ", " ").str.strip()

In [36]:
# Preview the investigators with the new orcid, id and fullName columns.
investigators.head()

Unnamed: 0,profileId,coreProjectNum,isContactPi,grantPi,firstName,middleName,lastName,dbgapAccession,studyInvestigator,studyFirstName,name,orcid_id,orcidName,match,authorId,author,orcid,id,fullName
0,profileid:2563052,R61HD105593,False,Allen CE,Carl,E,Allen,,,,Allen CE,https://orcid.org/0000-0002-6625-739X,Allen C,Allen C,48292006,Allen C,orcid:0000-0002-6625-739X,profileid:2563052,Carl E Allen
1,profileid:7039414,R61HD105593,True,Annapragada AV,Ananth,V,Annapragada,phs002585.v1.p1,Annapragada A,,Annapragada AV,https://orcid.org/0000-0002-3156-9617,Annapragada A,Annapragada A,3473746,Annapragada A,orcid:0000-0002-3156-9617,profileid:7039414,Ananth V Annapragada
17,profileid:10320851,R33HD105594,False,Bassiri H,Hamid,,Bassiri,,,,Bassiri H,https://orcid.org/0000-0001-6532-8478,Bassiri H,Bassiri H,10999669,Bassiri H,orcid:0000-0001-6532-8478,profileid:10320851,Hamid Bassiri
81,profileid:7989301,R61HD105590,True,Burns JC,Jane,C,Burns,phs002553.v1.p1,Burns J,,Burns JC,https://orcid.org/0000-0001-5679-1217,Burns J,Burns J,6298375,Burns J,orcid:0000-0001-5679-1217,profileid:7989301,Jane C Burns
250,profileid:8667619,R61HD105618,True,Chiu CY,Charles,Yen,Chiu,phs002781.v1.p1,Chiu C,,Chiu CY,https://orcid.org/0000-0003-2915-2094,Chiu C,Chiu C,2142194904,Chiu C,orcid:0000-0003-2915-2094,profileid:8667619,Charles Yen Chiu


In [37]:
# id to Semantic Scholar author id mapping
# One row per distinct combination of researcher id, author id and name parts.
author_id_columns = [
    "id",
    "authorId",
    "orcid",
    "name",
    "fullName",
    "firstName",
    "middleName",
    "lastName",
]
id_to_author_id = investigators[author_id_columns].drop_duplicates()
id_to_author_id_csv = os.path.join(DERIVED_DATA_PATH, "id_to_author_id.csv")
id_to_author_id.to_csv(id_to_author_id_csv, index=False)
id_to_author_id.head()

Unnamed: 0,id,authorId,orcid,name,fullName,firstName,middleName,lastName
0,profileid:2563052,48292006,orcid:0000-0002-6625-739X,Allen CE,Carl E Allen,Carl,E,Allen
1,profileid:7039414,3473746,orcid:0000-0002-3156-9617,Annapragada AV,Ananth V Annapragada,Ananth,V,Annapragada
17,profileid:10320851,10999669,orcid:0000-0001-6532-8478,Bassiri H,Hamid Bassiri,Hamid,,Bassiri
81,profileid:7989301,6298375,orcid:0000-0001-5679-1217,Burns JC,Jane C Burns,Jane,C,Burns
250,profileid:8667619,2142194904,orcid:0000-0003-2915-2094,Chiu CY,Charles Yen Chiu,Charles,Yen,Chiu


## Create Researcher nodes

In [38]:
# Build the Researcher nodes for grant and study investigators.
researcher_nodes = investigators[
    ["id", "name", "fullName", "firstName", "middleName", "lastName", "orcid", "profileId"]
].copy()

# A node needs a primary key: rows with neither a profileId nor an ORCID have
# id == "". The relationship file excludes such rows (empty "from"), so they
# are excluded here as well to keep nodes and relationships consistent.
researcher_nodes = researcher_nodes[researcher_nodes["id"] != ""].drop_duplicates()

researcher_nodes.to_csv(os.path.join(KG_PATH, "nodes", "Researcher_investigators.csv"), index=False)
print(f"Number of Researcher investigator nodes: {researcher_nodes.shape[0]}")
researcher_nodes.head()

Number of Researcher investigator nodes: 107


Unnamed: 0,id,name,fullName,firstName,middleName,lastName,orcid,profileId
0,profileid:2563052,Allen CE,Carl E Allen,Carl,E,Allen,orcid:0000-0002-6625-739X,profileid:2563052
1,profileid:7039414,Annapragada AV,Ananth V Annapragada,Ananth,V,Annapragada,orcid:0000-0002-3156-9617,profileid:7039414
17,profileid:10320851,Bassiri H,Hamid Bassiri,Hamid,,Bassiri,orcid:0000-0001-6532-8478,profileid:10320851
81,profileid:7989301,Burns JC,Jane C Burns,Jane,C,Burns,orcid:0000-0001-5679-1217,profileid:7989301
250,profileid:8667619,Chiu CY,Charles Yen Chiu,Charles,Yen,Chiu,orcid:0000-0003-2915-2094,profileid:8667619


In [39]:
# Build Researcher nodes for the remaining primary authors (co-authors).
# Read from DERIVED_DATA_PATH like the other derived files instead of
# repeating the directory as a literal.
researcher_nodes_other = pd.read_csv(
    os.path.join(DERIVED_DATA_PATH, "primary_authors_other.csv"),
    dtype=str,
    keep_default_na=False,
)

# These researchers are keyed by their Semantic Scholar author id; they carry
# no ORCID or NIH profile id in this file.
researcher_nodes_other["id"] = "s2authorid:" + researcher_nodes_other["authorId"]
researcher_nodes_other["name"] = researcher_nodes_other["author"]
researcher_nodes_other["orcid"] = ""
researcher_nodes_other["profileId"] = ""

# Drop authors whose author id is the literal string "None", keep the node
# columns, and remove duplicates. Building a new frame avoids in-place
# operations on a column slice.
node_columns = ["id", "name", "fullName", "firstName", "middleName", "lastName", "orcid", "profileId"]
has_author_id = researcher_nodes_other["id"] != "s2authorid:None"
researcher_nodes_other = researcher_nodes_other.loc[has_author_id, node_columns].drop_duplicates()

researcher_nodes_other.to_csv(os.path.join(KG_PATH, "nodes", "Researcher_primary_coauthors.csv"), index=False)
print(f"Number of other Researcher nodes: {researcher_nodes_other.shape[0]}")
researcher_nodes_other.head()

Number of other Researcher nodes: 2250


Unnamed: 0,id,name,fullName,firstName,middleName,lastName,orcid,profileId
1,s2authorid:80827919,Abdullah N,Natasha Abdullah,Natasha,,Abdullah,,
2,s2authorid:2186155180,Abe N,Naomi Abe,Naomi,,Abe,,
4,s2authorid:6865020,Abedalthagafi M,M Abedalthagafi,M,,Abedalthagafi,,
5,s2authorid:2106780379,Abella BS,Benjamin S Abella,Benjamin,S,Abella,,
6,s2authorid:2106799969,Abelson S,S Abelson,S,,Abelson,,


## Create Researcher-IS_INVESTIGATOR_OF-Grant relationships

In [40]:
# Derive the relationship table from the investigators: one row per
# (researcher id, core project number) with three flags.
is_investigator = investigators.copy()
is_investigator_map = {"id": "from", "coreProjectNum": "to", "grantPi": "isPi", "isContactPi": "isContactPi", "studyInvestigator": "isStudyInvestigator"}
is_investigator = utils.rename_and_reorder_columns(is_investigator, is_investigator_map)

# Fill missing values before de-duplicating so that NaN and "" count as the same value.
is_investigator = is_investigator.fillna("").drop_duplicates()

# A relationship needs both endpoints; .copy() so the column assignments below
# operate on an independent frame rather than on a filtered slice.
has_endpoints = (is_investigator["from"] != "") & (is_investigator["to"] != "")
is_investigator = is_investigator[has_endpoints].copy()

# A non-empty grantPi / studyInvestigator name means the person has that role.
is_investigator["isPi"] = is_investigator["isPi"] != ""
is_investigator["isStudyInvestigator"] = is_investigator["isStudyInvestigator"] != ""

# Empty isContactPi (study investigators that are not grant PIs) becomes False.
# Read from is_investigator itself: taking the values from `investigators`
# only works through index alignment between the two frames.
is_investigator["isContactPi"] = is_investigator["isContactPi"].where(
    is_investigator["isContactPi"] != "", False
)

In [41]:
# Write the three flags as lower-case strings ("true" / "false").
for flag_column in ("isPi", "isStudyInvestigator", "isContactPi"):
    is_investigator[flag_column] = is_investigator[flag_column].astype(str).str.lower()

In [42]:
# Save the relationships and report their count and column types.
relationships_csv = os.path.join(
    KG_PATH, "relationships", "Researcher-IS_INVESTIGATOR_OF-Grant.csv"
)
is_investigator.to_csv(relationships_csv, index=False)
n_relationships = len(is_investigator)
print(f"Number of Researcher-IS_INVESTIGATOR_OF-Grant relationships: {n_relationships}")
print(is_investigator.dtypes)
is_investigator.head()

Number of Researcher-IS_INVESTIGATOR_OF-Grant relationships: 107
from                   object
to                     object
isPi                   object
isContactPi            object
isStudyInvestigator    object
dtype: object


Unnamed: 0,from,to,isPi,isContactPi,isStudyInvestigator
0,profileid:2563052,R61HD105593,True,False,False
1,profileid:7039414,R61HD105593,True,True,True
17,profileid:10320851,R33HD105594,True,False,False
81,profileid:7989301,R61HD105590,True,True,True
250,profileid:8667619,R61HD105618,True,True,True
