# get_event_info
This notebook creates event-related node and relationship files for the RADx-KG:

Event.csv
Event-RELATED_TO-CoordinationCenter.csv

In [1]:
import os
import shutil
import pandas as pd
import hashlib
import utils
import publication_query
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [2]:
KG_PATH = "../kg/data"

In [3]:
# Event spreadsheet
doc_id = "1ZPvaKqHYIusiIROpjImlXLC2tqZFnkwb"
grid_id = "1054557813"
presentations = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=csv&gid={grid_id}", dtype=str, keep_default_na=False)
# assign unique ID to each presentation
presentations["presentationId"] = presentations["eventId"] + presentations["title"]
presentations["presentationId"] = presentations["presentationId"].apply(lambda x: hashlib.md5(x.encode()).hexdigest())
presentations.head()
print(presentations.shape[0])
presentations

87


Unnamed: 0,eventId,presenters,title,presentationUrl,videoUrl,Subproject,Project,Contact PI(s),PI(s),Affiliation,presentationId
0,2023-11-07_E1,Mark Albers,Longitudinal smell testing to detect SARS-CoV-2 infection,,,Chemosensory testing,U01DC019579-01,Mark Albers,,"ADK Group, LLC",ae27baf31f88698080a13283b977faf3
1,2023-11-07_E1,John Hayes,"The ArOMa-T, a tool for rapid, adaptive testing of olfactory thresholds",,,,,Steven D Munger,,The Pennsylvania State University,962bcba40d8ae53eb9f6adfa92e476de
2,2023-11-07_E1,Christopher Simons,Confectionary-based screening tool to assess chemosensory loss in COVID-19 patients,,,Chemosensory testing,R01DC016112-04S1,Susan Travers,,Ohio State University,78e761fdd29e4c8a1435e1bd0fd35a6e
3,2023-11-07_E1,Pamela Dalton,SCENTinel and other screening tools for remote testing,,,,,Pamela Dalton,,Monell Chemical Senses Center,006458e5ddfa0299c0c5310106b61ce8
4,2023-09-21_E1,Xudong Fan,COVID-19 detection through scent analysis with a compact GC device,,,Scent,U18TR003812-01,Xudong Fan,Kevin Ward,University of Michigan at Ann Arbor,cf197e5a369f99145e525699089439fb
5,2023-09-21_E1,Xiao-An Fu,A portable breath analyzer system to screen for SARS-CoV-2 infection,,,Scent,,Xiao-An Fu,,University of Louisville,41d9821dc51fbf7d26d14dc4ac2a0d54
6,2023-09-21_E1,Cristina Davis,Breath volatile organic compounds (VOCs) for COVID-19 diagnostics,,,Scent,,Cristina Davis,,"University of California, Davis",b34c0f79e57273e641c23e7d0bbb1260
7,2023-09-21_E1,John Hayes,"Using rapid, self-administered testing to assess chemosensory function",,,Chemosensory testing,,Steven Munger,,University of Florida,86b10ec18eb309bd09ed0638ad6d9c00
8,2023-09-21_E1,Mark Albers,Longitudinal At Home Remote Olfactory Testing to Detect SARS-CoV2,,,Chemosensory testing,,Mark Albers,,MGH /Harvard University,002020d8308198d8e23841d67a2edb0d
9,2023-09-21_E1,Susan Travers,A Confectionary-based Screening Tool for Assessing Chemosensory Loss in COVID-19 Patients,,,Chemosensory testing,R01DC016112-04S1,Susan Travers,,Ohio State University,a23b1fda417d14d96568b59149b581db


## Create Presentation-PRESENTED_AT-Event relationships

In [4]:
presentations_map = {"presentationId": "from", "eventId": "to"}
presented = utils.rename_and_reorder_columns(presentations, presentations_map)
presented.to_csv(os.path.join(KG_PATH, "relationships", "Presentation-PRESENTED_AT-Event.csv"), index=False)
print(f"Number of Presentation nodes: {presented.shape[0]}")
presented

Number of Presentation nodes: 87


Unnamed: 0,from,to
0,ae27baf31f88698080a13283b977faf3,2023-11-07_E1
1,962bcba40d8ae53eb9f6adfa92e476de,2023-11-07_E1
2,78e761fdd29e4c8a1435e1bd0fd35a6e,2023-11-07_E1
3,006458e5ddfa0299c0c5310106b61ce8,2023-11-07_E1
4,cf197e5a369f99145e525699089439fb,2023-09-21_E1
5,41d9821dc51fbf7d26d14dc4ac2a0d54,2023-09-21_E1
6,b34c0f79e57273e641c23e7d0bbb1260,2023-09-21_E1
7,86b10ec18eb309bd09ed0638ad6d9c00,2023-09-21_E1
8,002020d8308198d8e23841d67a2edb0d,2023-09-21_E1
9,a23b1fda417d14d96568b59149b581db,2023-09-21_E1


In [5]:
#presentations["id"] = presentations["title"].apply(lambda x: hashlib.md5(x.encode()).hexdigest())
#presentations.head()

In [6]:
presenters = presentations[["eventId", "presenters", "presentationId"]].copy()
presenters["presenters"] = presenters["presenters"].str.split("|")
presenters = presenters.explode("presenters")
publication_query.expand_name_column(presenters, "presenters")
presenters["presenters"] = presenters["name"]
presenters.drop(columns="name", inplace=True)
presenters.head()

Unnamed: 0,eventId,presenters,presentationId,fullName,firstName,middleName,lastName
0,2023-11-07_E1,Albers M,ae27baf31f88698080a13283b977faf3,Mark Albers,Mark,,Albers
1,2023-11-07_E1,Hayes J,962bcba40d8ae53eb9f6adfa92e476de,John Hayes,John,,Hayes
2,2023-11-07_E1,Simons C,78e761fdd29e4c8a1435e1bd0fd35a6e,Christopher Simons,Christopher,,Simons
3,2023-11-07_E1,Dalton P,006458e5ddfa0299c0c5310106b61ce8,Pamela Dalton,Pamela,,Dalton
4,2023-09-21_E1,Fan X,cf197e5a369f99145e525699089439fb,Xudong Fan,Xudong,,Fan


In [7]:
investigators = pd.read_csv(os.path.join(KG_PATH, "nodes", "Researcher_investigators.csv"), dtype=str, keep_default_na=False)
other = pd.read_csv(os.path.join(KG_PATH, "nodes", "Researcher_primary_coauthors.csv"), dtype=str, keep_default_na=False)
researchers = pd.concat([investigators, other])
researchers.head()

Unnamed: 0,id,name,fullName,firstName,middleName,lastName,orcid,profileId
0,profileid:2563052,Allen CE,Carl E Allen,Carl,E,Allen,orcid:0000-0002-6625-739X,profileid:2563052
1,profileid:7039414,Annapragada AV,Ananth V Annapragada,Ananth,V,Annapragada,orcid:0000-0002-3156-9617,profileid:7039414
2,profileid:10320851,Bassiri H,Hamid Bassiri,Hamid,,Bassiri,orcid:0000-0001-6532-8478,profileid:10320851
3,profileid:7989301,Burns JC,Jane C Burns,Jane,C,Burns,orcid:0000-0001-5679-1217,profileid:7989301
4,profileid:8667619,Chiu CY,Charles Yen Chiu,Charles,Yen,Chiu,orcid:0000-0003-2915-2094,profileid:8667619


In [8]:
presenters_matched = utils.fuzzy_merge(presenters, researchers, left_fuzzy_on="presenters", right_fuzzy_on="name", how="inner", threshold=0.95)
presenters_matched

Unnamed: 0,eventId,presenters,presentationId,fullName_x,firstName_x,middleName_x,lastName_x,match,score,id,name,fullName_y,firstName_y,middleName_y,lastName_y,orcid,profileId
0,2023-11-07_E1,Albers M,ae27baf31f88698080a13283b977faf3,Mark Albers,Mark,,Albers,Albers MW,0.977778,profileid:6625336,Albers MW,Mark W Albers,Mark,W,Albers,orcid:0000-0001-7855-3455,profileid:6625336
1,2023-11-07_E1,Hayes J,962bcba40d8ae53eb9f6adfa92e476de,John Hayes,John,,Hayes,Hayes JE,0.975,profileid:9451051,Hayes JE,John Edward Hayes,John,Edward,Hayes,,profileid:9451051
2,2023-11-07_E1,Dalton P,006458e5ddfa0299c0c5310106b61ce8,Pamela Dalton,Pamela,,Dalton,Dalton PH,0.977778,profileid:1877373,Dalton PH,Pamela Helen Dalton,Pamela,Helen,Dalton,orcid:0000-0003-2474-2888,profileid:1877373
3,2023-09-21_E1,Fan X,cf197e5a369f99145e525699089439fb,Xudong Fan,Xudong,,Fan,Fan X,1.0,profileid:8485696,Fan X,Xudong Fan,Xudong,,Fan,orcid:0000-0003-0149-1326,profileid:8485696
4,2023-09-21_E1,Fu X,41d9821dc51fbf7d26d14dc4ac2a0d54,Xiao-An Fu,Xiao-An,,Fu,Fu X,1.0,profileid:10807170,Fu X,Xiao-An Fu,Xiao-An,,Fu,,profileid:10807170
5,2023-09-21_E1,Fu X,41d9821dc51fbf7d26d14dc4ac2a0d54,Xiao-An Fu,Xiao-An,,Fu,Fu X,1.0,s2authorid:47012539,Fu X,Xiuping Fu,Xiuping,,Fu,,
6,2023-09-21_E1,Davis C,b34c0f79e57273e641c23e7d0bbb1260,Cristina Davis,Cristina,,Davis,Davis CE,0.975,profileid:2050145,Davis CE,Cristina Elizabeth Davis,Cristina,Elizabeth,Davis,,profileid:2050145
7,2023-09-21_E1,Hayes J,86b10ec18eb309bd09ed0638ad6d9c00,John Hayes,John,,Hayes,Hayes JE,0.975,profileid:9451051,Hayes JE,John Edward Hayes,John,Edward,Hayes,,profileid:9451051
8,2023-09-21_E1,Albers M,002020d8308198d8e23841d67a2edb0d,Mark Albers,Mark,,Albers,Albers MW,0.977778,profileid:6625336,Albers MW,Mark W Albers,Mark,W,Albers,orcid:0000-0001-7855-3455,profileid:6625336
9,2023-09-21_E1,Travers S,a23b1fda417d14d96568b59149b581db,Susan Travers,Susan,,Travers,Travers SP,0.98,profileid:1896926,Travers SP,Susan P Travers,Susan,P,Travers,orcid:0000-0001-8730-3618,profileid:1896926


## Create Researcher-PRESENTED-Presentation relationships

In [9]:
presented = presenters_matched[["id", "presentationId"]].copy()
presenters_map = {"id": "from", "presentationId": "to"}
presented = utils.rename_and_reorder_columns(presented, presenters_map)
presented.drop_duplicates(inplace=True)
presented = presented[(presented["from"] != "") & (presented["to"] != "")]
presented.to_csv(os.path.join(KG_PATH, "relationships", "Researcher-PRESENTED-Presentation.csv"), index=False)
print(f"Number of other Researcher nodes: {presented.shape[0]}")
presented

Number of other Researcher nodes: 174


Unnamed: 0,from,to
0,profileid:6625336,ae27baf31f88698080a13283b977faf3
1,profileid:9451051,962bcba40d8ae53eb9f6adfa92e476de
2,profileid:1877373,006458e5ddfa0299c0c5310106b61ce8
3,profileid:8485696,cf197e5a369f99145e525699089439fb
4,profileid:10807170,41d9821dc51fbf7d26d14dc4ac2a0d54
5,s2authorid:47012539,41d9821dc51fbf7d26d14dc4ac2a0d54
6,profileid:2050145,b34c0f79e57273e641c23e7d0bbb1260
7,profileid:9451051,86b10ec18eb309bd09ed0638ad6d9c00
8,profileid:6625336,002020d8308198d8e23841d67a2edb0d
9,profileid:1896926,a23b1fda417d14d96568b59149b581db


## Create Presentation Nodes

In [10]:
presentations_map = {"presentationId": "id", "title": "name", "presenters": "presenters", "presentationUrl": "presentationUrl",	"videoUrl": "videoUrl"}
presentations = utils.rename_and_reorder_columns(presentations, presentations_map)
presentations.drop_duplicates(inplace=True)
presentations.to_csv(os.path.join(KG_PATH, "nodes", "Presentation.csv"), index=False)
print(f"Number of Presentation nodes: {presentations.shape[0]}")
presentations

Number of Presentation nodes: 87


Unnamed: 0,id,name,presenters,presentationUrl,videoUrl
0,ae27baf31f88698080a13283b977faf3,Longitudinal smell testing to detect SARS-CoV-2 infection,Mark Albers,,
1,962bcba40d8ae53eb9f6adfa92e476de,"The ArOMa-T, a tool for rapid, adaptive testing of olfactory thresholds",John Hayes,,
2,78e761fdd29e4c8a1435e1bd0fd35a6e,Confectionary-based screening tool to assess chemosensory loss in COVID-19 patients,Christopher Simons,,
3,006458e5ddfa0299c0c5310106b61ce8,SCENTinel and other screening tools for remote testing,Pamela Dalton,,
4,cf197e5a369f99145e525699089439fb,COVID-19 detection through scent analysis with a compact GC device,Xudong Fan,,
5,41d9821dc51fbf7d26d14dc4ac2a0d54,A portable breath analyzer system to screen for SARS-CoV-2 infection,Xiao-An Fu,,
6,b34c0f79e57273e641c23e7d0bbb1260,Breath volatile organic compounds (VOCs) for COVID-19 diagnostics,Cristina Davis,,
7,86b10ec18eb309bd09ed0638ad6d9c00,"Using rapid, self-administered testing to assess chemosensory function",John Hayes,,
8,002020d8308198d8e23841d67a2edb0d,Longitudinal At Home Remote Olfactory Testing to Detect SARS-CoV2,Mark Albers,,
9,a23b1fda417d14d96568b59149b581db,A Confectionary-based Screening Tool for Assessing Chemosensory Loss in COVID-19 Patients,Susan Travers,,


## Create Event nodes
Event info is maintained manually in the data directory

In [11]:
doc_id = "1ZPvaKqHYIusiIROpjImlXLC2tqZFnkwb"
grid_id = "77214925"
events = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=csv&gid={grid_id}", usecols=["id", "name", "eventType", "eventUrl", "startDate", "endDate", "city", "state", "country"], dtype=str, keep_default_na=False)
events.to_csv(os.path.join(KG_PATH, "nodes", "Event.csv"), index=False)
print(f"Number of Event nodes: {events.shape[0]}")
events

Number of Event nodes: 14


Unnamed: 0,id,name,eventType,eventUrl,startDate,endDate,city,state,country
0,2023-11-07_E1,Towards Universal Chemosensory Testing (TUCT 2023),hybrid conference,https://smellandtastetestingforall.monell.org/,2023-11-05,2023-11-07,Philadelphia,PA,United States of America
1,2023-09-21_E1,RADx-rad 2023 Conference,webinar,https://www.radxrad.org/event/radx-rad-2023-conference-final-conference/,2023-09-21,2023-09-21,,,
2,2023-08-15_E1,"RADx Data Hub: Making Datasets Findable and Accessible Using Digital Object Identifiers (DOIs), Compact URIs (CURIEs), and Other Identifiers",webinar,https://radx-hub.nih.gov/info/events,2023-08-15,2023-08-15,,,
3,2023-06-20_E1,Association of Environmental Engineering and Science Professors (AEESP) National Conference,in-person conference,https://aeesp2023.sites.northeastern.edu/,2023-06-20,2023-06-23,Boston,MA,United States of America
4,2023-05-16_E1,RADx Data Hub: Privacy Protection: De-identifying Human Subject Data,webinar,https://radx-hub.nih.gov/info/events,2023-05-16,2023-05-16,,,
5,2023-04-27_E1,Pediatric Academic Society,in-person conference,https://www.pas-meeting.org/2023-homepage/,2023-04-27,2023-05-01,Washington,DC,United States of America
6,2022-11-05_E1,AMIA 2022 Annual Symposium,in-person conference,https://amia.org/education-events/amia-2022-annual-symposium,2022-11-05,2022-11-09,Washington,DC,United States of America
7,2022-11-18_E1,7th Annual MetaSUB Conference GLOBAL METAGENOMICS SUMMIT,in-person conference,http://metasub.org/2022-conference/,2022-11-18,2022-11-21,Miami Beach,FL,United States of America
8,2022-11-09_E1,American Society for Microbiology,in-person conference,,2022-11-09,2022-11-12,Washington,DC,United States of America
9,2022-10-20_E1,New York Immunology Conference,in-person conference,,2022-10-20,2022-10-20,Albany,NY,United States of America


In [12]:
#shutil.copy("../data/Event.csv", os.path.join(KG_PATH, "nodes"))

## Create Event-RELATED_TO-CoordinationCenter.csv

In [13]:
#shutil.copy("../data/Event-RELATED_TO-CoordinationCenter.csv", os.path.join(KG_PATH, "relationships"))