# RAKE
This notebook uses the RAKE (Rapid Automatic Keyword Extraction) method to extract keywords from three data sources: the abstract and title of PubMed records; the detailed description, title, and eligibility criteria of Clinical Trials data; and the Project Terms and Project Title of NIH data.

Mary Kate Montgomery

12/2/21

In [2]:
# Import libraries
from rake_nltk import Rake
import pandas as pd
import nltk

In [3]:
# Create a single module-level RAKE extractor using rake_nltk's default
# arguments; top_10_phrases() reuses this one instance for every document.
r = Rake()

In [3]:
# ONE TIME ONLY: fetch the NLTK resources needed on this machine.
# The stopwords download is left commented out — presumably it was already
# installed here; re-enable it on a fresh environment if Rake() cannot find
# its stopword list (TODO confirm).
#nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/marykate/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## PubMed Data

In [38]:
# Read csv of abstracts - PubMed.
# The cells below read the 'Abstract' and 'Title' columns of this frame.
df = pd.read_csv('Raw_data/PubMed_wAbstracts-Pacemaker.csv')

In [6]:
# Extract keywords from text with rake
def top_10_phrases(txt):
    """Return up to ten RAKE key phrases for ``txt``, in RAKE's ranked order.

    Parameters
    ----------
    txt : str or any
        Text to analyse. Anything that is not a string (for example the NaN
        pandas uses for an empty CSV field) is treated as "no text".

    Returns
    -------
    list of str
        The first ten entries of ``r.get_ranked_phrases()`` (fewer if RAKE
        returns fewer); an empty list when ``txt`` is not a string.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise be silently dropped.
    if not isinstance(txt, str):
        return []
    # `r` is the module-level Rake instance shared by every call.
    r.extract_keywords_from_text(txt)
    return r.get_ranked_phrases()[:10]

In [21]:
# Extract keywords from PubMed abstract and title.
# The extractor only needs the cell value, so it is applied to each text column
# directly instead of materialising every row with df.apply(..., axis=1).
df['KW_RAKE_Abstract'] = df['Abstract'].apply(top_10_phrases)
df['KW_RAKE_Title'] = df['Title'].apply(top_10_phrases)

In [35]:
# Write new data to csv.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an extra leading column and each keyword list is stored as its string
# form — confirm that downstream readers expect this.
df.to_csv('rake_output/PubMed_RAKE.csv')

In [24]:
# Print list of most common phrases from abstract and title
from collections import Counter

# Collect phrases row by row (abstract phrases first, then title phrases) so
# that phrases with equal counts keep the same first-seen order.
top_phrases = []
for abstract_kw, title_kw in zip(df['KW_RAKE_Abstract'], df['KW_RAKE_Title']):
    top_phrases.extend(abstract_kw)
    top_phrases.extend(title_kw)

# Keep the tally under its own name: rebinding `Counter` to an instance would
# shadow the class for any later Counter(...) call not preceded by a re-import.
phrase_counts = Counter(top_phrases)
pubmed_phrases = phrase_counts.most_common(100)
print(pubmed_phrases)

[('patients', 922), ('transcatheter aortic valve replacement', 573), ('transcatheter aortic valve implantation', 520), ('cardiac implantable electronic devices', 373), ('patient', 363), ('case report', 319), ('permanent pacemaker implantation', 288), ('long', 283), ('pacemaker', 282), ('analysis', 243), ('pacemaker implantation', 223), ('outcomes', 208), ('surgical aortic valve replacement', 193), ('cardiac resynchronization therapy', 189), ('safety', 187), ('atrial fibrillation', 179), ('predictors', 161), ('impact', 159), ('cardiac implantable electronic device', 158), ('sick sinus syndrome', 158), ('comparison', 155), ('meta', 154), ('complete heart block', 153), ('severe aortic stenosis', 139), ('case', 138), ('single', 133), ('complete atrioventricular block', 126), ('results', 121), ('left ventricular ejection fraction', 120), ('implantation', 118), ('systematic review', 117), ('transvenous lead extraction', 116), ('high', 112), ('treatment', 107), ('left bundle branch block', 10

In [25]:
# Write most popular phrases to text, one phrase per line.
# The context manager closes the file even if a write fails, and the explicit
# UTF-8 encoding keeps non-ASCII phrases writable regardless of the OS locale.
with open("rake_output/rake_mostcommonphrases_pubmed.txt", "w", encoding="utf-8") as textfile:
    for phrase, _count in pubmed_phrases:
        textfile.write(phrase + "\n")

## Clinical Trials Data

In [34]:
# Read data - Clinical Trials.
# Rebinds `df`, replacing the frame loaded previously; the cells below read its
# 'Detailed Description', 'Title' and 'Eligibility Criteria' columns.
df = pd.read_csv('Raw_data/ClinicalTrialsGov-Pacemaker-wContent.csv')

In [15]:
# Extract keywords from Clinical Trials detailed description, title, and
# eligibility criteria.
# The extractor only needs the cell value, so it is applied to each text column
# directly instead of materialising every row with df.apply(..., axis=1).
df['KW_RAKE_DetailedDescription'] = df['Detailed Description'].apply(top_10_phrases)
df['KW_RAKE_Title'] = df['Title'].apply(top_10_phrases)
df['KW_RAKE_EligibilityCriteria'] = df['Eligibility Criteria'].apply(top_10_phrases)

In [43]:
# Write the Clinical Trials frame, including the new keyword columns, to csv.
# NOTE(review): to_csv defaults also write the row index as a leading column —
# confirm that downstream readers expect this.
df.to_csv('rake_output/ClinTrials_RAKE.csv')

In [18]:
# Print list of most common phrases from detailed description, title, and
# eligibility criteria
from collections import Counter

# Collect phrases row by row (description, then title, then eligibility
# criteria) so that phrases with equal counts keep the same first-seen order.
top_phrases = []
for description_kw, title_kw, criteria_kw in zip(
    df['KW_RAKE_DetailedDescription'],
    df['KW_RAKE_Title'],
    df['KW_RAKE_EligibilityCriteria'],
):
    top_phrases.extend(description_kw)
    top_phrases.extend(title_kw)
    top_phrases.extend(criteria_kw)

# Keep the tally under its own name: rebinding `Counter` to an instance would
# shadow the class for any later Counter(...) call not preceded by a re-import.
phrase_counts = Counter(top_phrases)
clintrials_phrases = phrase_counts.most_common(100)
print(clintrials_phrases)

[('inclusion criteria', 466), ('patients', 236), ('exclusion criteria', 106), ('left ventricular ejection fraction', 72), ('atrial fibrillation', 66), ('study', 65), ('safety', 64), ('heart failure', 52), ('provide informed consent', 49), ('cardiac resynchronization therapy', 47), ('pacemaker', 43), ('18 years', 41), ('give informed consent', 40), ('new york heart association', 38), ('effect', 34), ('treatment', 33), ('evaluation', 31), ('age ≥ 18 years', 31), ('severe aortic stenosis', 28), ('sick sinus syndrome', 27), ('efficacy', 27), ('non', 27), ('transcatheter aortic valve implantation', 26), ('comparison', 26), ('provide written informed consent', 25), ('permanent atrial fibrillation', 25), ('implantable cardioverter defibrillator', 25), ('surgical aortic valve replacement', 25), ('18 years old', 24), ('transcatheter aortic valve replacement', 24), ('crt', 24), ('icd', 23), ('persistent atrial fibrillation', 22), ('dual chamber pacemaker', 22), ('pacemakers', 21), ('written info

In [19]:
# Write most popular phrases to text file, one phrase per line.
# The context manager closes the file even if a write fails. The explicit UTF-8
# encoding matters here: phrases such as 'age ≥ 18 years' are not ASCII and
# would fail to encode under some platform default encodings.
with open("rake_output/rake_mostcommonphrases_clintrials.txt", "w", encoding="utf-8") as textfile:
    for phrase, _count in clintrials_phrases:
        textfile.write(phrase + "\n")

## NIH Data

In [26]:
# Read data - NIH.
# Rebinds `df`, replacing the frame loaded previously; the cells below read its
# 'PROJECT_TERMS' and 'PROJECT_TITLE' columns.
df = pd.read_csv('Raw_data/pacemaker_data.csv')

In [7]:
# List the column labels of the NIH frame to locate the text fields to mine.
df.keys()

Index(['Unnamed: 0', 'APPLICATION_ID', 'ACTIVITY', 'ADMINISTERING_IC',
       'APPLICATION_TYPE', 'ARRA_FUNDED', 'AWARD_NOTICE_DATE', 'BUDGET_START',
       'BUDGET_END', 'CFDA_CODE', 'CORE_PROJECT_NUM', 'ED_INST_TYPE',
       'FOA_NUMBER', 'FULL_PROJECT_NUM', 'FUNDING_ICs', 'FUNDING_MECHANISM',
       'FY', 'IC_NAME', 'NIH_SPENDING_CATS', 'ORG_CITY', 'ORG_COUNTRY',
       'ORG_DEPT', 'ORG_DISTRICT', 'ORG_DUNS', 'ORG_FIPS', 'ORG_IPF_CODE',
       'ORG_NAME', 'ORG_STATE', 'ORG_ZIPCODE', 'PHR', 'PI_IDS', 'PI_NAMEs',
       'PROGRAM_OFFICER_NAME', 'PROJECT_START', 'PROJECT_END', 'PROJECT_TERMS',
       'PROJECT_TITLE', 'SERIAL_NUMBER', 'STUDY_SECTION', 'STUDY_SECTION_NAME',
       'SUBPROJECT_ID', 'SUFFIX', 'SUPPORT_YEAR', 'DIRECT_COST_AMT',
       'INDIRECT_COST_AMT', 'TOTAL_COST', 'TOTAL_COST_SUB_PROJECT'],
      dtype='object')

In [27]:
# Extract keywords from NIH project terms and title.
# The extractor only needs the cell value, so it is applied to each text column
# directly instead of materialising every row with df.apply(..., axis=1).
df['KW_RAKE_Terms'] = df['PROJECT_TERMS'].apply(top_10_phrases)
df['KW_RAKE_Title'] = df['PROJECT_TITLE'].apply(top_10_phrases)

In [9]:
# Write the NIH frame, including the new keyword columns, to csv.
# NOTE(review): this writes under 'new_data/' while the other two datasets are
# written under 'rake_output/' — confirm the directory is intentional.
df.to_csv('new_data/NIH_RAKE.csv')

In [29]:
# Display the NIH frame to eyeball the new KW_RAKE_Terms / KW_RAKE_Title columns.
df

Unnamed: 0.1,Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,...,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT,KW_RAKE_Terms,KW_RAKE_Title
0,0,"""10351373",R21,AI,1.0,N,2021-09-22,2021-09-22,2022-08-31,855.0,...,"Surgery, Anesthesiology and Trauma Study Section",,,1.0,150000.0,81750.0,231750.0,"""","[vascular cell adhesion molecule, intercellula...","[studying environmental factors, innovating or..."
1,1,"""10298453",R56,HL,1.0,N,2021-09-21,2021-09-25,2022-08-31,837.0,...,Integrative Vascular Physiology and Pathology ...,,A1,1.0,267355.0,140361.0,407716.0,"""","[blood pressure reduction, salt sensitive hype...","[blood pressure regulation, renin receptor, so..."
2,2,"""10275251",R01,HL,1.0,N,2021-09-17,2021-09-20,2022-08-31,839.0,...,"Basic Biology of Blood, Heart and Vasculature ...",,,1.0,355491.0,216850.0,572341.0,"""","[severe acute respiratory syndrome, mass spect...","[hypertension augmented covid, induced interna..."
3,3,"""10325868",R44,HL,1.0,N,2021-09-20,2021-09-20,2022-08-31,837.0,...,Special Emphasis Panel,,A1,1.0,,,856873.0,"""","[small business innovation research grant, ope...","[treating myocardial ischemia, reperfusion inj..."
4,4,"""10479415",R56,HL,1.0,N,2021-09-20,2021-09-21,2022-08-31,837.0,...,HIV Comorbidities and Clinical Studies Study S...,,A1,1.0,507689.0,98034.0,605723.0,"""","[united states national institutes, magnetic r...","[exercise intolerance, diastolic dysfunction, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9359,9359,"""10251125",R44,HL,5.0,N,2021-09-17,2021-09-01,2022-08-31,837.0,...,Special Emphasis Panel,,,3.0,,,1044010.0,"""","[small business innovation research grant, uni...","[phase 2a clinical study, worsening heart fail..."
9360,9360,"""10203600",U54,NS,1.0,N,2021-09-08,2021-09-15,2022-08-31,,...,Special Emphasis Panel,7308.0,,1.0,113355.0,63479.0,,"176834""","[autonomic nervous system disorders, nervous s...","[autonomic biomarker core, care4kids]"
9361,9361,"""10336561",R01,HL,7.0,N,2021-07-14,2021-09-10,2022-08-31,837.0,...,Hypertension and Microcirculation Study Section,,,4.0,308129.0,166390.0,474519.0,"""","[polymerase chain reaction, oncogene proteins ...","[cardiac dysfunction, treatment, sdkp, ischemi..."
9362,9362,"""10389101",R37,GM,3.0,N,2021-09-13,2021-01-01,2021-12-31,859.0,...,,,S1,19.0,136061.0,,136061.0,"""","[research project grants, therapy development,...","[circadian programs, bacteria]"


In [32]:
# Print list of most common phrases from project terms and title
from collections import Counter

# Collect phrases row by row (project-term phrases first, then title phrases)
# so that phrases with equal counts keep the same first-seen order.
top_phrases = []
for terms_kw, title_kw in zip(df['KW_RAKE_Terms'], df['KW_RAKE_Title']):
    top_phrases.extend(terms_kw)
    top_phrases.extend(title_kw)

# Keep the tally under its own name: rebinding `Counter` to an instance would
# shadow the class for any later Counter(...) call not preceded by a re-import.
phrase_counts = Counter(top_phrases)
nih_phrases = phrase_counts.most_common(100)
print(nih_phrases)

[('public health relevance', 1300), ('coronary heart disease', 1110), ('signal transduction', 1058), ('united states national institutes', 1056), ('operative surgical procedures', 964), ('congenital heart disorder', 901), ('magnetic resonance imaging', 842), ('new therapeutic target', 760), ('heart rate variability', 757), ('heart failure', 711), ('induced pluripotent stem cell', 630), ('united states', 610), ('role', 570), ('cardiovascular risk factor', 551), ('novel therapeutic intervention', 547), ('dependent diabetes mellitus', 531), ('pathway interactions', 529), ('cardiovascular disorder risk', 496), ('mass spectrum analysis', 455), ('reactive oxygen species', 446), ('research project grants', 439), ('transcription factor', 406), ('sudden cardiac death', 402), ('fatty acid glycerol esters', 397), ('gene expression', 384), ('heart disease risk', 384), ('congestive heart failure', 374), ('congenital heart defects', 365), ('translational protein processing', 337), ('public health', 

In [33]:
# Write most popular phrases to text file, one phrase per line.
# The context manager closes the file even if a write fails, and the explicit
# UTF-8 encoding keeps non-ASCII phrases writable regardless of the OS locale.
with open("rake_output/rake_mostcommonphrases_nih.txt", "w", encoding="utf-8") as textfile:
    for phrase, _count in nih_phrases:
        textfile.write(phrase + "\n")