# get_patent_info
This notebook creates dataset-related node and relationship files for the RADx-KG:
* Patent.csv
* Researcher-IS_INVENTOR_OF-Patent
* Patent-IS_RELATED_TO-Grant.csv

In [1]:
import os
import pandas as pd
import utils
import publication_query
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [2]:
KG_PATH = "../kg/data"

In [3]:
patent_info = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/1O_IpO8-1IMHXmhpEw6eRU6KPNtBtnJ38GEL-tx9F9Mw/export?format=csv",
        keep_default_na=False,
       )

In [4]:
print(f"Number of projects {patent_info.shape[0]}")
patent_info

Number of projects 17


Unnamed: 0,coreProjectNum,inventors,patentId,patentUrl,title,filingDate,grantedDate,status,applicants
0,R01DE031114,Jesse Jokerst|Yash Mantri|Zhicheng Jin,WO2022169991A1,https://patents.google.com/patent/WO2022169991A1,A face covering having a test strip for colorimetric monitoring of proteases and methods of detecting proteases,2022-02-03,,pending,Regents Of The University Of California
1,U01HL152410,"Jie Huang|Chen Zhu|Rex E Gerald, II",US11740171B2,https://patents.google.com/patent/US11740171B2,Open-ended hollow coaxial cable resonator sensor,2021-03-23,2023-08-29,active,University of Missouri System
2,U01HL150852,Edward P DeMauro|German Drazer|Hao Lin|Mehdi Javanmard,US20220280062A1,https://patents.google.com/patent/US20220280062A1,"Novel, Rapid Breathalyzer Diagnostic Device for the Presence of SARS-CoV-2",2022-03-04,,pending,Rutgers State University of New Jersey
3,R61HD105593,Ananth Annapragada|Sridevi Devaraj|Devika Subramanian|Aadith Vittala|Xinpu Chen|Craig Rusin|Carl Allen,WO2023049044A1,https://patents.google.com/patent/WO2023049044A1,Stratification of disease severity following an inflammatory condition,2022-09-16,,pending,Baylor College Of Medicine|Texas Children's Hospital|William Marsh Rice University
4,R61HD105590,Shamim Nemati|Supreeth Prajwal Shashikumar|Atul Malhotra|Jonathan Lam,WO2023009846A1,https://patents.google.com/patent/WO2023009846A1,Machine learning enabled patient stratification \n,2022-07-29,,pending,Regents Of The University Of California
5,R42DE030829,Roya Khosravi-Far|Ramin M Hakami,WO2022204258A1,https://patents.google.com/patent/WO2022204258A1,Device and method for detecting inflammation,2022-03-23,,pending,"InnoTech Precision Medicine, Inc"
6,U18TR003780,Samarjit Das,WO2022178313A1,https://patents.google.com/patent/WO2022178313A1,Detecting diseases in subjects using extracelluar vesicles \n,2022-02-18,,pending,The Johns Hopkins University
7,U01AA029345,Selma Piranej|Khalid Salaita|Alisina Bazrafshan,WO2023192883A2,https://patents.google.com/patent/WO2023192883A2,Rolling sensor systems for detecting analytes and diagnostic methods related thereto,2023-03-28,,pending,Emory University
8,U18TR003807,Eduardo Reategui|Jingjing Zhang,WO2023114970A2,https://patents.google.com/patent/WO2023114970A2,Single extracellular vesicle protein and rna assay via in-situ fluorescence microscopy in a uv micropattern array,2022-12-16,,pending,Ohio State Innovation Foundation
9,U18TR003778,David Wong|Feng Li|Fang Wei|Otto Yang|Jennifer Fulcher|David Chia|Samantha Chiang,WO2023137426A2,https://patents.google.com/patent/WO2023137426A2,Sars-cov-2 (covid-19) tests on saliva and blood using efirm technology,2023-01-13,,pending,The Regents Of The University Of California|The US Government as represented by the Department of Veterans Affairs


In [5]:
patent_inventors = patent_info[["inventors", "patentId"]].copy()
patent_inventors["inventors"] = patent_inventors["inventors"].str.split("|")
patent_inventors = patent_inventors.explode("inventors")
#publication_query.expand_name_column(patent_inventors, "inventors")
#patent_inventors.rename(columns={"name": "inventor"}, inplace=True)
patent_grants = patent_info[["coreProjectNum", "patentId"]].copy()
patent = patent_info[["patentId", "title", "inventors", "filingDate", "grantedDate", "status", "applicants"]].copy()

In [6]:
investigators = pd.read_csv(os.path.join(KG_PATH, "nodes", "Researcher_investigators.csv"), dtype=str, keep_default_na=False)
other = pd.read_csv(os.path.join(KG_PATH, "nodes", "Researcher_primary_coauthors.csv"), dtype=str, keep_default_na=False)
researchers = pd.concat([investigators, other])
researchers.head()

Unnamed: 0,id,name,fullName,firstName,middleName,lastName,orcid,profileId
0,profileid:2563052,Allen CE,Carl E Allen,Carl,E,Allen,orcid:0000-0002-6625-739X,profileid:2563052
1,profileid:7039414,Annapragada AV,Ananth V Annapragada,Ananth,V,Annapragada,orcid:0000-0002-3156-9617,profileid:7039414
2,profileid:10320851,Bassiri H,Hamid Bassiri,Hamid,,Bassiri,orcid:0000-0001-6532-8478,profileid:10320851
3,profileid:7989301,Burns JC,Jane C Burns,Jane,C,Burns,orcid:0000-0001-5679-1217,profileid:7989301
4,profileid:8667619,Chiu CY,Charles Yen Chiu,Charles,Yen,Chiu,orcid:0000-0003-2915-2094,profileid:8667619


In [7]:
investigators.head()

Unnamed: 0,id,name,fullName,firstName,middleName,lastName,orcid,profileId
0,profileid:2563052,Allen CE,Carl E Allen,Carl,E,Allen,orcid:0000-0002-6625-739X,profileid:2563052
1,profileid:7039414,Annapragada AV,Ananth V Annapragada,Ananth,V,Annapragada,orcid:0000-0002-3156-9617,profileid:7039414
2,profileid:10320851,Bassiri H,Hamid Bassiri,Hamid,,Bassiri,orcid:0000-0001-6532-8478,profileid:10320851
3,profileid:7989301,Burns JC,Jane C Burns,Jane,C,Burns,orcid:0000-0001-5679-1217,profileid:7989301
4,profileid:8667619,Chiu CY,Charles Yen Chiu,Charles,Yen,Chiu,orcid:0000-0003-2915-2094,profileid:8667619


In [8]:
research_inventors = utils.fuzzy_merge(patent_inventors, researchers, left_fuzzy_on="inventors", right_fuzzy_on="fullName", how="left", threshold=0.95)

In [9]:
research_inventors.head()

Unnamed: 0,inventors,patentId,match,score,id,name,fullName,firstName,middleName,lastName,orcid,profileId
0,Jesse Jokerst,WO2022169991A1,,0.0,,,,,,,,
1,Yash Mantri,WO2022169991A1,Yash Mantri,1.0,s2authorid:89826291,Mantri Y,Yash Mantri,Yash,,Mantri,,
2,Zhicheng Jin,WO2022169991A1,Zhicheng Jin,1.0,s2authorid:2152843701,Jin Z,Zhicheng Jin,Zhicheng,,Jin,,
3,Zhicheng Jin,WO2022169991A1,Zhicheng Jin,1.0,s2authorid:1745180850,Jin Z,Zhicheng Jin,Zhicheng,,Jin,,
4,Zhicheng Jin,WO2022169991A1,Zhicheng Jin,1.0,s2authorid:2209272,Jin Z,Zhicheng Jin,Zhicheng,,Jin,,


In [10]:
research_inventors_no_match = research_inventors.query("id == ''").copy()
research_inventors_no_match.head()

Unnamed: 0,inventors,patentId,match,score,id,name,fullName,firstName,middleName,lastName,orcid,profileId
0,Jesse Jokerst,WO2022169991A1,,0.0,,,,,,,,
6,Chen Zhu,US11740171B2,,0.0,,,,,,,,
7,"Rex E Gerald, II",US11740171B2,,0.0,,,,,,,,
8,Edward P DeMauro,US20220280062A1,,0.0,,,,,,,,
9,German Drazer,US20220280062A1,,0.0,,,,,,,,


In [11]:
researchers["firstLastName"] = researchers["firstName"] + " " + researchers["lastName"]
researchers.head()

Unnamed: 0,id,name,fullName,firstName,middleName,lastName,orcid,profileId,firstLastName
0,profileid:2563052,Allen CE,Carl E Allen,Carl,E,Allen,orcid:0000-0002-6625-739X,profileid:2563052,Carl Allen
1,profileid:7039414,Annapragada AV,Ananth V Annapragada,Ananth,V,Annapragada,orcid:0000-0002-3156-9617,profileid:7039414,Ananth Annapragada
2,profileid:10320851,Bassiri H,Hamid Bassiri,Hamid,,Bassiri,orcid:0000-0001-6532-8478,profileid:10320851,Hamid Bassiri
3,profileid:7989301,Burns JC,Jane C Burns,Jane,C,Burns,orcid:0000-0001-5679-1217,profileid:7989301,Jane Burns
4,profileid:8667619,Chiu CY,Charles Yen Chiu,Charles,Yen,Chiu,orcid:0000-0003-2915-2094,profileid:8667619,Charles Chiu


In [12]:
publication_query.expand_name_column(research_inventors_no_match, "inventors")
research_inventors_no_match["inventor"] = research_inventors_no_match["firstName"] + " " + research_inventors_no_match["lastName"]

In [13]:
research_inventors_no_match = research_inventors_no_match[["inventors", "inventor", "patentId"]].copy()

In [14]:
research_inventors_no_match = utils.fuzzy_merge(research_inventors_no_match, researchers, left_fuzzy_on="inventor", right_fuzzy_on="firstLastName", how="left", threshold=0.96)

In [15]:
research_inventors_no_match.head()

Unnamed: 0,inventors,inventor,patentId,match,score,id,name,fullName,firstName,middleName,lastName,orcid,profileId,firstLastName
0,Jesse Jokerst,Jesse Jokerst,WO2022169991A1,Jesse Jokerst,1.0,profileid:11288941,Jokerst JV,Jesse Vincent Jokerst,Jesse,Vincent,Jokerst,orcid:0000-0003-2829-6408,profileid:11288941,Jesse Jokerst
1,Chen Zhu,Chen Zhu,US11740171B2,,0.0,,,,,,,,,
2,"Rex E Gerald, II",Rex II,US11740171B2,,0.0,,,,,,,,,
3,Edward P DeMauro,Edward DeMauro,US20220280062A1,Edward DeMauro,1.0,orcid:0000-0002-3793-8014,DeMauro EP,Edward P DeMauro,Edward,P,DeMauro,orcid:0000-0002-3793-8014,,Edward DeMauro
4,German Drazer,German Drazer,US20220280062A1,,0.0,,,,,,,,,


## Create Researcher-IS_INVENTOR-Patent relationships

In [16]:
research_inventors = research_inventors[["id", "patentId"]].copy()
research_inventors.query("id != ''", inplace=True)

In [17]:
research_inventors_no_match = research_inventors_no_match[["id", "patentId"]].copy()
research_inventors_no_match.query("id != ''", inplace=True)

In [18]:
research_inventors_matched = pd.concat([research_inventors, research_inventors_no_match])

In [19]:
research_inventors_matched.rename(columns={"id": "from", "patentId": "to"}, inplace=True)
research_inventors_matched.to_csv(os.path.join(KG_PATH, "relationships", "Researcher-IS_INVENTOR-Patent.csv"), index=False)
research_inventors_matched.head()                          

Unnamed: 0,from,to
1,s2authorid:89826291,WO2022169991A1
2,s2authorid:2152843701,WO2022169991A1
3,s2authorid:1745180850,WO2022169991A1
4,s2authorid:2209272,WO2022169991A1
5,orcid:0000-0002-8659-2910,US11740171B2


## Create Patent nodes

In [20]:
patent_map = {"patentId": "id", "title": "name", "inventors": "inventors", "filingDate": "filingDate",	"grantedDate": "grantedDate", "status": "status", "applicants": "applicants", "patentUrl": "url"}
patent = utils.rename_and_reorder_columns(patent_info, patent_map)

In [21]:
patent.to_csv(os.path.join(KG_PATH, "nodes", "Patent.csv"), index=False)
patent.head()    

Unnamed: 0,id,name,inventors,filingDate,grantedDate,status,applicants,url
0,WO2022169991A1,A face covering having a test strip for colorimetric monitoring of proteases and methods of detecting proteases,Jesse Jokerst|Yash Mantri|Zhicheng Jin,2022-02-03,,pending,Regents Of The University Of California,https://patents.google.com/patent/WO2022169991A1
1,US11740171B2,Open-ended hollow coaxial cable resonator sensor,"Jie Huang|Chen Zhu|Rex E Gerald, II",2021-03-23,2023-08-29,active,University of Missouri System,https://patents.google.com/patent/US11740171B2
2,US20220280062A1,"Novel, Rapid Breathalyzer Diagnostic Device for the Presence of SARS-CoV-2",Edward P DeMauro|German Drazer|Hao Lin|Mehdi Javanmard,2022-03-04,,pending,Rutgers State University of New Jersey,https://patents.google.com/patent/US20220280062A1
3,WO2023049044A1,Stratification of disease severity following an inflammatory condition,Ananth Annapragada|Sridevi Devaraj|Devika Subramanian|Aadith Vittala|Xinpu Chen|Craig Rusin|Carl Allen,2022-09-16,,pending,Baylor College Of Medicine|Texas Children's Hospital|William Marsh Rice University,https://patents.google.com/patent/WO2023049044A1
4,WO2023009846A1,Machine learning enabled patient stratification \n,Shamim Nemati|Supreeth Prajwal Shashikumar|Atul Malhotra|Jonathan Lam,2022-07-29,,pending,Regents Of The University Of California,https://patents.google.com/patent/WO2023009846A1
