# Output from Autophrase (keywords) is inserted back to the Input dataset.

In [1]:
import pandas as pd
import numpy as np

## Key function

In [2]:
def keyword_list_percentage_calculator(df_data, df_column, autophrase_kwlist): 
    mask_len = df_data.shape[0]
    mask_sel = np.full((mask_len), False, dtype=bool)
    for key_iter in autophrase_kwlist:
        mask_temp = df_data[df_column].str.contains(key_iter)
        mask_sel = np.logical_or(mask_sel,mask_temp)
    
    mask_coverage = mask_sel.value_counts()
    percent_coverage = float(mask_coverage[1]/[mask_coverage[0] + mask_coverage[1]])*100
    return percent_coverage

In [3]:
def keywordList_column_creator( df_data, df_column,autophrase_kwlist):
    dfcolumn_list = df_data[df_column].to_list()
    #pubmed_abstract_list = pubmed_abstract_list[:10]
    final_list = []
    for column_iter  in dfcolumn_list:
        cell_final_value = []
        for key_iter in autophrase_kwlist:
            if (key_iter in str(column_iter)):
                cell_final_value.append(key_iter)
        final_list.append(cell_final_value)
    return final_list

### Load Autophrase

In [4]:
pubmed_autophrase = pd.read_csv('Auto_phrase/AutoPhrase_output_PubMed.txt',delimiter="	",names=['temp','keywords'])
NIH_autophrase = pd.read_csv('Auto_phrase/AutoPhrase_output_NIH.txt',delimiter="	",names=['temp','keywords'])
clinical_autophrase = pd.read_csv('Auto_phrase/AutoPhrase_output_clinical.txt',delimiter="	",names=['temp','keywords'])

### Find all common_phrases belong to all 3 dataset

In [5]:
pubmed_autophrase_kwlist_fullist = set(pubmed_autophrase['keywords'].to_list())
NIH_autophrase_kwlist_fullist = set(NIH_autophrase['keywords'].to_list())
clinical_autophrase_kwlist_fullist = set(clinical_autophrase['keywords'].to_list())

In [6]:
temp_common = pubmed_autophrase_kwlist_fullist.intersection(NIH_autophrase_kwlist_fullist)
common_keyword = clinical_autophrase_kwlist_fullist.intersection(temp_common)

In [7]:
len(common_keyword)

187

# PUB_MED Data

In [8]:
df_pubmed = pd.read_csv('Raw_data/PubMed_wAbstracts-Pacemaker.csv')

### Selecting first 500 keywords:

In [9]:
pubmed_autophrase = pubmed_autophrase[:500]
pubmed_autophrase_kwlist_500 = set(pubmed_autophrase['keywords'].to_list())
pubmed_autophrase_kwlist = list (pubmed_autophrase_kwlist_500.union(common_keyword))

In [10]:
len(pubmed_autophrase_kwlist)

663

In [11]:
textfile = open("Auto_phrase/autophrase_kwlist_pubmed1.txt",  "w",  encoding="utf-8")
for element in pubmed_autophrase_kwlist:
    textfile.write(str(element) + "\n")
textfile.close()

### Calculate Autophrase keyword coverage in Pubmed abstract

In [12]:
keyword_list_percentage_calculator (df_pubmed, 'Abstract', pubmed_autophrase_kwlist)

97.733499377335

### Iternate Autophrase keyword coverage in Pubmed abstract to get list of keywords

In [13]:
df_pubmed['key_words'] = keywordList_column_creator(df_pubmed,'Abstract',pubmed_autophrase_kwlist)

In [14]:
df_pubmed[['PMID','key_words']].head()
df_pubmed.head()

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Abstract,key_words
0,31070324,[Micra® leadless pacemaker],"Lancellotti P, Gach O, Marechal P, Robinet S.",Rev Med Liege. 2019 Sup;74(S1):S104-S108.,Lancellotti P,Rev Med Liege,2019,2019/05/10,,,,The Micra® leadless pacemaker has demonstrated...,"[a new, for patients with, patients with, atri..."
1,31373521,Pacemaker complications and costs: a nationwid...,"Clémenty N, Fernandes J, Carion PL, de Léotoin...",J Med Econ. 2019 Nov;22(11):1171-1178. doi: 10...,Clémenty N,J Med Econ,2019,2019/08/03,,,10.1080/13696998.2019.1652186,Aims: Novel leadless pacemakers (LPMs) may red...,"[leadless pacemakers, inclusion criteria]"
2,30893006,Pacemaker exteriorization,"Dupont J, Koch A, Vonck A, Sarens T.",Acta Cardiol. 2020 Aug;75(4):375-376. doi: 10....,Dupont J,Acta Cardiol,2020,2019/03/21,,,10.1080/00015385.2019.1587139,,[]
3,30936325,Runaway pacemaker,"Gul A, Sheikh MA, Rao A.",BMJ Case Rep. 2019 Mar 31;12(3):e225411. doi: ...,Gul A,BMJ Case Rep,2019,2019/04/03,PMC6453287,,10.1136/bcr-2018-225411,Runaway pacemaker is phenomenon in which pacem...,"[heart rate, history of, ventricular tachycard..."
4,28956076,Pacemaker reprogramming rarely needed after de...,"Curila K, Smida J, Herman D, Osmancik P, Stros...",Herz. 2019 Feb;44(1):56-59. doi: 10.1007/s0005...,Curila K,Herz,2019,2017/09/29,,,10.1007/s00059-017-4627-5,BACKGROUND: Most outpatient follow-ups after p...,"[to determine, changes in, battery depletion]"


In [15]:
df_pubmed.to_csv('Auto_phrase/pubmed_data_output_withkeyword.csv')

# Clinical_ Data

In [16]:
df_clinical = pd.read_csv('Raw_data/ClinicalTrialsGov-Pacemaker-wContent.csv')

In [17]:
df_clinical.columns

Index(['Unnamed: 0', 'Rank', 'NCT Number', 'Title', 'Acronym', 'Status',
       'Study Results', 'Conditions', 'Interventions', 'Outcome Measures',
       'Sponsor/Collaborators', 'Gender', 'Age', 'Phases', 'Enrollment',
       'Funded Bys', 'Study Type', 'Study Designs', 'Other IDs', 'Start Date',
       'Primary Completion Date', 'Completion Date', 'First Posted',
       'Results First Posted', 'Last Update Posted', 'Locations',
       'Study Documents', 'URL', 'Detailed Description', 'Study Arms',
       'Eligibility Criteria'],
      dtype='object')

### Selecting first 500 keywords:

In [18]:
clinical_autophrase = clinical_autophrase[:500]
clinical_autophrase_kwlist_500 = set(clinical_autophrase['keywords'].to_list())
clinical_autophrase_kwlist = list (clinical_autophrase_kwlist_500.union(common_keyword))

In [19]:
len(clinical_autophrase_kwlist)

620

In [20]:
textfile = open("Auto_phrase/autophrase_kwlist_clinical.txt",  "w",  encoding="utf-8")
for element in clinical_autophrase_kwlist:
    textfile.write(str(element) + "\n")
textfile.close()

### Calculate Autophrase keyword coverage in Clinical abstract

In [36]:
keyword_list_percentage_calculator (df_clinical, 'Detailed Description', clinical_autophrase_kwlist)

96.625

### Iternate Autophrase keyword coverage in Clinical abstract to get list of keywords

In [22]:
df_clinical['key_words'] = keywordList_column_creator(df_clinical,'Detailed Description',clinical_autophrase_kwlist)

In [23]:
df_clinical.head()

Unnamed: 0.1,Unnamed: 0,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,...,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL,Detailed Description,Study Arms,Eligibility Criteria,key_words
0,0,1,NCT04926792,Taiwan Registry for Leadless Pacemaker,,Not yet recruiting,No Results Available,Performance of the Leadless Pacemaker in Taiwa...,Device: Leadless Pacemaker,Acute complication|Subacute complication|Leadl...,...,15-Jun-21,,15-Jun-21,,,https://ClinicalTrials.gov/show/NCT04926792,The purpose of this registry is to investigate...,,Inclusion Criteria:\r\n\r\n 1. The pa...,"[to evaluate, follow up]"
1,1,2,NCT02931760,Placement Of Cardiac PacemaKer Trial (POCKET)-RCT,POCKET-RCT,Unknown status,No Results Available,Pacemaker Ddd,Device: subcutaneous pacemaker|Device: intramu...,patient overall satisfaction|complications due...,...,13-Oct-16,,13-Oct-16,"Region Gavleborg, Gävle, Sweden",,https://ClinicalTrials.gov/show/NCT02931760,Background: Implantation of a cardiac pacemake...,"['subcutaneous pacemaker', 'intramuscular pace...",Inclusion Criteria:\r\n\r\n - Indica...,"[et al, randomized controlled, cardiac pacemak..."
2,2,3,NCT03876600,Cost-utility Analysis of Ambulatory Care Compa...,START,Not yet recruiting,No Results Available,Pacemaker Ddd,Procedure: Replacement of permanent pacemakers...,"Cost-utility analysis,from the perspective of ...",...,15-Mar-19,,15-Mar-19,"Lyon University Hospital, Lyon, Bron, France|A...",,https://ClinicalTrials.gov/show/NCT03876600,"In France, nearly 49,000 permanent pacemakers ...","['conventional hospitalization management.', '...",Inclusion Criteria:\r\n\r\n - Man or...,"[hospital discharge, benefits of, due to, perm..."
3,3,4,NCT03388281,"Morbidity, Mortality and Gender Differences in...",,Completed,No Results Available,"Pacemaker, Artificial|Cohort Studies|Arrhythmi...","Other: No intervention, retrospective analysis...",Survival after pacemaker implantation|Implanta...,...,2-Jan-18,,2-Jan-18,,,https://ClinicalTrials.gov/show/NCT03388281,,Patients with implanted pacemaker,Inclusion Criteria:\r\n\r\n - All pa...,[]
4,4,5,NCT04163770,Evaluation Of Pacemakers in Children,,Unknown status,No Results Available,Disorder of Cardiac Pacemaker System,Device: caliberation of pacemaker,Performance of pacemakers in children,...,15-Nov-19,,15-Nov-19,,,https://ClinicalTrials.gov/show/NCT04163770,Although advances in device and lead technolog...,['performance of pacemaker at time of implanta...,Inclusion Criteria:\r\n\r\n • Pediatric...,"[congenital heart disease, congenital heart, p..."


In [24]:
df_clinical.to_csv('Auto_phrase/clinical_data_output_withkeyword.csv')

# NIH_ Data

In [26]:
df_NIH = pd.read_csv('Raw_data/NIH_pacemaker_data.csv')

In [27]:
df_NIH.columns

Index(['Unnamed: 0', 'APPLICATION_ID', 'ACTIVITY', 'ADMINISTERING_IC',
       'APPLICATION_TYPE', 'ARRA_FUNDED', 'AWARD_NOTICE_DATE', 'BUDGET_START',
       'BUDGET_END', 'CFDA_CODE', 'CORE_PROJECT_NUM', 'ED_INST_TYPE',
       'FOA_NUMBER', 'FULL_PROJECT_NUM', 'FUNDING_ICs', 'FUNDING_MECHANISM',
       'FY', 'IC_NAME', 'NIH_SPENDING_CATS', 'ORG_CITY', 'ORG_COUNTRY',
       'ORG_DEPT', 'ORG_DISTRICT', 'ORG_DUNS', 'ORG_FIPS', 'ORG_IPF_CODE',
       'ORG_NAME', 'ORG_STATE', 'ORG_ZIPCODE', 'PHR', 'PI_IDS', 'PI_NAMEs',
       'PROGRAM_OFFICER_NAME', 'PROJECT_START', 'PROJECT_END', 'PROJECT_TERMS',
       'PROJECT_TITLE', 'SERIAL_NUMBER', 'STUDY_SECTION', 'STUDY_SECTION_NAME',
       'SUBPROJECT_ID', 'SUFFIX', 'SUPPORT_YEAR', 'DIRECT_COST_AMT',
       'INDIRECT_COST_AMT', 'TOTAL_COST', 'TOTAL_COST_SUB_PROJECT'],
      dtype='object')

In [28]:
df_NIH['abstract'] = df_NIH['PROJECT_TITLE'] + df_NIH['PROJECT_TERMS']

### Selecting first 500 keywords:

In [29]:
NIH_autophrase = NIH_autophrase[:500]
NIH_autophrase_kwlist_500 = set(NIH_autophrase['keywords'].to_list())
NIH_autophrase_kwlist = list (NIH_autophrase_kwlist_500.union(common_keyword))

In [30]:
len(NIH_autophrase_kwlist)

643

In [37]:
textfile = open("Auto_phrase/autophrase_kwlist_NIH.txt",  "w",  encoding="utf-8")
for element in clinical_autophrase_kwlist:
    textfile.write(str(element) + "\n")
textfile.close()

### Calculate Autophrase keyword coverage in NIH Title

In [38]:
keyword_list_percentage_calculator (df_NIH, 'abstract', NIH_autophrase_kwlist)

92.01154956689123

### Iternate Autophrase keyword coverage in NIH Title to get list of keywords

In [39]:
df_NIH['key_words'] = keywordList_column_creator(df_NIH,'abstract',NIH_autophrase_kwlist)

In [40]:
df_NIH.head()

Unnamed: 0.1,Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,...,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT,abstract,key_words
0,0,"""10351373",R21,AI,1.0,N,9/22/2021,9/22/2021,8/31/2022,855.0,...,"Surgery, Anesthesiology and Trauma Study Section",,,1.0,150000.0,81750.0,231750.0,"""",Innovating Organ Shipment by Studying Environm...,[]
1,1,"""10298453",R56,HL,1.0,N,9/21/2021,9/25/2021,8/31/2022,837.0,...,Integrative Vascular Physiology and Pathology ...,,A1,1.0,267355.0,140361.0,407716.0,"""",Role of the soluble (pro)renin receptor in blo...,"[salt sensitive hypertension, blood pressure]"
2,2,"""10275251",R01,HL,1.0,N,9/17/2021,9/20/2021,8/31/2022,839.0,...,"Basic Biology of Blood, Heart and Vasculature ...",,,1.0,355491.0,216850.0,572341.0,"""",Hypertension augmented COVID-19 through renin-...,[liver injury]
3,3,"""10325868",R44,HL,1.0,N,9/20/2021,9/20/2021,8/31/2022,837.0,...,Special Emphasis Panel,,A1,1.0,,,856873.0,"""",Developing a new drug for treating myocardial ...,"[a new, myocardial ischemia, myocardial infarc..."
4,4,"""10479415",R56,HL,1.0,N,9/20/2021,9/21/2021,8/31/2022,837.0,...,HIV Comorbidities and Clinical Studies Study S...,,A1,1.0,507689.0,98034.0,605723.0,"""","Cardiac Energetics, Diastolic Dysfunction and ...",[heart failure]


In [35]:
df_NIH.to_csv('Auto_phrase/NIH_data_output_withkeyword.csv')