In [1]:
# Import libraries
import pandas as pd
import numpy as np
import py_stringmatching

## Ontology Data

In [2]:
# Read ontology data from text files (contain results of SPARQL queries).
# Both files are read one term per line. Blank lines are dropped so that no
# empty-string term ends up in the ontology lists.

# Types of heart disease
with open('heart_diseases.txt') as f:
    diseases = [line.rstrip('\n') for line in f]
diseases = [d for d in diseases if d]

# Subclasses of pacemaker
# Each line looks like "label"@en: strip the language tag and the quotes.
subclasses = []
with open('pacemaker_subclasses.txt') as f:
    for line in f:
        # Strip the newline first so the tag is also removed on a final line
        # that has no trailing newline.
        label = line.rstrip('\n')
        if label.endswith('@en'):
            label = label[:-len('@en')]
        label = label.replace('"', '')
        if label:
            subclasses.append(label)

## Function Definitions

In [3]:
# Functions for parsing inputs
def split_bar(txt):
    """Split a '|'-delimited string into a list of its parts.

    Anything that is not exactly a ``str`` (e.g. a NaN float from pandas or
    an already-parsed list) is returned unchanged.
    """
    return txt.split('|') if type(txt) is str else txt

def parse_txt(txt):
    """Turn a stringified list such as "['a b', 'c']" into ['a b', 'c'].

    Every '[', ']' and "'" character is removed (including apostrophes
    inside an entry) and the remainder is split on ', '. Note that '[]'
    therefore yields [''] rather than an empty list. Values that are not
    exactly ``str`` are returned unchanged.
    """
    if type(txt) is not str:
        return txt
    # One translation table that deletes the three wrapper characters.
    drop_wrappers = str.maketrans('', '', "[]'")
    return txt.translate(drop_wrappers).split(', ')

def split_text(txt):
    """Split a string on single spaces; pass non-``str`` values through.

    Consecutive spaces produce empty-string entries because the split is on
    the literal ' ' separator, not on arbitrary whitespace.
    """
    if type(txt) is not str:
        return txt
    return txt.split(' ')

In [4]:
# Functions for string matching
def preprocess(txt):
    """Lower-case ``txt`` and tokenize it on whitespace.

    Returns whatever ``py_stringmatching.WhitespaceTokenizer().tokenize``
    produces for the lower-cased text.
    """
    tokenizer = py_stringmatching.WhitespaceTokenizer()
    return tokenizer.tokenize(txt.lower())

def match_jaccard(elem, l, thresh, jac, ws_tok):
    """Return True if ``elem`` is similar enough to at least one candidate.

    Parameters
    ----------
    elem : str
        Term to look for; it is lower-cased and tokenized with ``preprocess``.
    l : iterable
        Candidates, each already tokenized; every item is passed as-is to
        ``jac.get_sim_score``.
    thresh : float
        A candidate counts as a match only when its score is strictly
        greater than this value.
    jac : object
        Similarity measure exposing ``get_sim_score(a, b)``.
    ws_tok : object
        Unused; kept so existing call sites keep working.

    Returns
    -------
    bool
        False when ``l`` is empty.
    """
    # Preprocess
    elem = preprocess(elem)

    # any() returns False for an empty candidate list (np.max would raise
    # ValueError there) and stops scoring at the first candidate over thresh.
    return any(jac.get_sim_score(elem, cand) > thresh for cand in l)

In [5]:
# Function to match specific fields against terms taken from ontologies
def match_terms(row, terms, fields, thresh):
    """Collect the ontology terms that match any keyword in the given fields.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Record indexed by field name.
    terms : iterable of str
        Ontology terms to look for.
    fields : iterable
        Names of the fields of ``row`` to search.
    thresh : float
        Jaccard similarity a keyword must exceed to count as a match.

    Returns
    -------
    list of str
        Sorted, de-duplicated matching terms (plain ``str`` objects).
    """
    # Initialize
    jac = py_stringmatching.Jaccard()
    ws_tok = py_stringmatching.WhitespaceTokenizer()
    matches = set()
    for field in fields:
        value = row[field]
        # Missing cells come through as float NaN (or None): nothing to match.
        if value is None or isinstance(value, float):
            continue
        # The previous check `type(row[field]==list)` was always truthy, so a
        # plain string was iterated character by character. Treat a string
        # cell as one phrase instead.
        # NOTE(review): confirm whether unsplit string columns should be split
        # into separate keywords before matching.
        if isinstance(value, str):
            value = [value]
        prep = [preprocess(i) for i in value]
        if not prep:
            continue
        for t in terms:
            # Try the term as given, then a naive plural form.
            if (match_jaccard(t, prep, thresh, jac, ws_tok)
                    or match_jaccard(t + 's', prep, thresh, jac, ws_tok)):
                matches.add(t)
    # sorted(set) gives the same order as np.unique but keeps the original
    # str objects, so no numpy scalar strings end up in the output.
    return sorted(matches)

In [6]:
def search_mappings(KW_map,mapping):
    """Follow ``KW_map`` from ``mapping`` until a keyword that maps to itself.

    Returns that self-mapped keyword, or '' if the chain reaches a keyword
    that is not a key of ``KW_map``.
    """
    if mapping not in KW_map:
        return ''
    target = KW_map[mapping]
    if target == mapping:
        return mapping
    return search_mappings(KW_map, target)

def consol_KWs(KW):
    '''Takes list of key words or phrases and consolidates synonyms.

    A keyword is folded into another keyword of the same list when it is
    that keyword plus a trailing 's' (plural) or that keyword preceded by
    one extra leading word (prefix); the prefix rule wins if both apply.
    Chains are followed to the end, e.g. "cardiac pacemakers" ->
    "pacemakers" -> "pacemaker".

    Returns the sorted list of unique canonical keywords (plain ``str``).
    '''
    known = set(KW)
    KW_map = {}
    for kw in KW:
        mapping = kw
        # '' has no last character or words to inspect; it maps to itself.
        if kw != '':
            # Handle plurals
            if kw[-1] == 's' and kw[:-1] in known:
                mapping = kw[:-1]
            # Handle prefixes
            kw_w = kw.split(' ')
            if len(kw_w) > 1 and ' '.join(kw_w[1:]) in known:
                mapping = ' '.join(kw_w[1:])
        # Save to mapping
        KW_map[kw] = mapping
    # Build the result from the fully resolved mapping. The one-step KW_map
    # used before left intermediate forms (e.g. "pacemakers") in the output.
    resolved = {str(search_mappings(KW_map, kw)) for kw in KW}
    return sorted(resolved)


## Clinical Trials

In [7]:
# Read data + rake keywords
# The keyword columns arrive as text; each is converted to a Python list.
# Non-string cells (e.g. NaN) are left unchanged by the parsing helpers.
df = pd.read_csv('new_data/ClinTrials_RAKE.csv')
# 'Conditions' is '|'-delimited.
df['Conditions'] = df.apply(lambda row: split_bar(row['Conditions']), axis = 1)
# KW_RAKE_* columns hold stringified lists ("['a', 'b']").
df['KW_RAKE_DetailedDescription'] = df.apply(lambda row: parse_txt(row['KW_RAKE_DetailedDescription']), axis = 1)
df['KW_RAKE_Title'] = df.apply(lambda row: parse_txt(row['KW_RAKE_Title']), axis = 1)
df['KW_RAKE_EligibilityCriteria'] = df.apply(lambda row: parse_txt(row['KW_RAKE_EligibilityCriteria']), axis = 1)

# Read LDA keywords
df_lda = pd.read_csv('new_data/data_matching_Project_IMP_columnsOnly.csv')
df_lda = df_lda.set_index(keys='clinicTrial')
df_lda = df_lda['clinicTrial_keywords']
# Join LDA keywords to existing df
# The join aligns df's index with the 'clinicTrial' values.
# NOTE(review): assumes 'clinicTrial' holds row labels of df and is unique;
# repeated values would duplicate rows of df - confirm.
df = df.join(df_lda).rename(columns={'clinicTrial_keywords':'KW_LDA'})
# LDA keywords are a single space-separated string per row.
df['KW_LDA'] = df.apply(lambda row: split_text(row['KW_LDA']),axis=1)

# Read Autophrase keywords
df_ap = pd.read_csv('new_data/clinical_data_output_withkeyword.csv')
df_ap = df_ap['key_words']
# Joined on the index only.
# NOTE(review): assumes this file's index lines up with df's - confirm.
df = df.join(df_ap).rename(columns={'key_words':'KW_Autophrase'})
df['KW_Autophrase'] = df.apply(lambda row: parse_txt(row['KW_Autophrase']), axis = 1)

df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,...,Study Documents,URL,Detailed Description,Study Arms,Eligibility Criteria,KW_RAKE_DetailedDescription,KW_RAKE_Title,KW_RAKE_EligibilityCriteria,KW_LDA,KW_Autophrase
0,0,0,1,NCT04926792,Taiwan Registry for Leadless Pacemaker,,Not yet recruiting,No Results Available,[Performance of the Leadless Pacemaker in Taiw...,Device: Leadless Pacemaker,...,,https://ClinicalTrials.gov/show/NCT04926792,The purpose of this registry is to investigate...,,Inclusion Criteria:\r\n\r\n 1. The pa...,"[world benefit specifically, patient selection...","[taiwan registry, leadless pacemaker]",[successfully received leadless pacemaker impl...,"[therapy, surgical, cardiac, tavi, child, inte...","[follow up, to evaluate]"
1,1,1,2,NCT02931760,Placement Of Cardiac PacemaKer Trial (POCKET)-RCT,POCKET-RCT,Unknown status,No Results Available,[Pacemaker Ddd],Device: subcutaneous pacemaker|Device: intramu...,...,,https://ClinicalTrials.gov/show/NCT02931760,Background: Implantation of a cardiac pacemake...,"['subcutaneous pacemaker', 'intramuscular pace...",Inclusion Criteria:\r\n\r\n - Indica...,[figure 1 shows two hypothetical truncated log...,"[pocket )- rct, cardiac pacemaker trial, place...","[skinny patient deemed, vvir )., intramuscular...","[therapy, surgical, cardiac, tavi, child, inte...","[vascular access, a small, surgical procedure,..."
2,2,2,3,NCT03876600,Cost-utility Analysis of Ambulatory Care Compa...,START,Not yet recruiting,No Results Available,[Pacemaker Ddd],Procedure: Replacement of permanent pacemakers...,...,,https://ClinicalTrials.gov/show/NCT03876600,"In France, nearly 49,000 permanent pacemakers ...","['conventional hospitalization management.', '...",Inclusion Criteria:\r\n\r\n - Man or...,[764 pacemaker replacement hospital stays took...,"[permanent pacemakers replacement, elective re...","[battery elective wear without programmed act,...",,"[due to, hospital discharge, permanent pacemak..."
3,3,3,4,NCT03388281,"Morbidity, Mortality and Gender Differences in...",,Completed,No Results Available,"[Pacemaker, Artificial, Cohort Studies, Arrhyt...","Other: No intervention, retrospective analysis...",...,,https://ClinicalTrials.gov/show/NCT03388281,,Patients with implanted pacemaker,Inclusion Criteria:\r\n\r\n - All pa...,[],"[gender differences, patients, pacemakers, mor...","[single documented diagnose, medical universit...","[therapy, surgical, cardiac, tavi, child, inte...",[]
4,4,4,5,NCT04163770,Evaluation Of Pacemakers in Children,,Unknown status,No Results Available,[Disorder of Cardiac Pacemaker System],Device: caliberation of pacemaker,...,,https://ClinicalTrials.gov/show/NCT04163770,Although advances in device and lead technolog...,['performance of pacemaker at time of implanta...,Inclusion Criteria:\r\n\r\n • Pediatric...,[chamber pacemakers pacing requires two endoca...,"[pacemakers, evaluation, children]","[• pediatric age group 0 month, permanent card...",,"[in children, history of, a small, patients wi..."


In [8]:
# Apply ontologies
# Row-wise matching of ontology terms against the keyword columns, with a
# Jaccard threshold of 0.5. 'condition' also searches the 'Conditions'
# column; 'subclass' searches the keyword columns only.
df['condition'] = df.apply(lambda row: match_terms(row,diseases,['Conditions','KW_RAKE_DetailedDescription','KW_RAKE_Title','KW_RAKE_EligibilityCriteria','KW_LDA','KW_Autophrase'],0.5),axis=1)
df['subclass'] = df.apply(lambda row: match_terms(row, subclasses,['KW_RAKE_DetailedDescription','KW_RAKE_Title','KW_RAKE_EligibilityCriteria','KW_LDA','KW_Autophrase'],0.5), axis=1)

In [9]:
# Consolidate synonyms
# Overwrites both columns in place with the output of consol_KWs, so this
# cell operates on its own previous output if it is re-run.
df['condition'] = df.apply(lambda row: consol_KWs(row['condition']),axis=1)
df['subclass'] = df.apply(lambda row: consol_KWs(row['subclass']),axis=1)

In [27]:
# Spot-check: free-text description of the record at row position 9
df.iloc[9]['Detailed Description']

"This protocol outlines the methods on the Micra Pacemaker. The Micra Transcatheter Pacemaker\r\n      System is a miniaturized single chamber pacemaker system that is delivered via catheter\r\n      through the femoral vein and is implanted directly inside the right ventricle of the heart.\r\n      The Micra device eliminates the need for a device pocket and insertion of a pacing lead,\r\n      thereby eliminating lead and pocket related complications associated with traditional pacing\r\n      systems while providing similar pacing benefits. The US Food and Drug Administration (FDA)\r\n      approved the Micra system on April 6, 2016, for the following conditions:\r\n\r\n        -  symptomatic paroxysmal or permanent high-grade atrioventricular block (AV) in the\r\n           presence of atrial fibrillation (AF)\r\n\r\n        -  symptomatic paroxysmal or permanent high-grade AV block in the absence of AF, as an\r\n           alternative to dual chamber pacing, when atrial lead place

In [22]:
# Spot-check: 'condition' value of the record at row position 9
df.iloc[9]['condition']

['Atrial Fibrillation', 'Atrioventricular Block']

In [28]:
# Spot-check: 'subclass' value of the record at row position 9
df.iloc[9]['subclass']

['Single chamber pacemaker']

In [None]:
# Write data to new csv file
# to_csv also writes the index as an unnamed first column by default.
# NOTE(review): this may be what produces 'Unnamed: 0'-style columns when the
# file is read back - consider index=False if downstream code allows it.
df.to_csv('new_data/data_with_ontology_terms/clinTrials.csv')

## PubMed Data

In [None]:
# Read data + rake keywords
# Rebinds `df` to the PubMed table; the frame from the previous section is
# no longer reachable under that name.
df = pd.read_csv('new_data/PubMed_RAKE.csv')
# KW_RAKE_* columns hold stringified lists; non-string cells pass through.
df['KW_RAKE_Abstract'] = df.apply(lambda row: parse_txt(row['KW_RAKE_Abstract']), axis = 1)
df['KW_RAKE_Title'] = df.apply(lambda row: parse_txt(row['KW_RAKE_Title']), axis = 1)

# Read LDA keywords
df_lda = pd.read_csv('new_data/data_matching_Project_IMP_columnsOnly.csv')
df_lda = df_lda.set_index(keys='pubMed')
df_lda = df_lda['pubMed_keywords']

# Join LDA keywords to existing df
# The join aligns df's index with the 'pubMed' values.
# NOTE(review): assumes 'pubMed' holds row labels of df and is unique;
# repeated values would duplicate rows of df - confirm.
df = df.join(df_lda).rename(columns={'pubMed_keywords':'KW_LDA'})
# LDA keywords are a single space-separated string per row.
df['KW_LDA'] = df.apply(lambda row: split_text(row['KW_LDA']),axis=1)

# Read Autophrase keywords
df_ap = pd.read_csv('new_data/pubmed_data_output_withkeyword.csv')
df_ap = df_ap['key_words']
# Joined on the index only.
# NOTE(review): assumes this file's index lines up with df's - confirm.
df = df.join(df_ap).rename(columns={'key_words':'KW_Autophrase'})
df['KW_Autophrase'] = df.apply(lambda row: parse_txt(row['KW_Autophrase']), axis = 1)

df.head()

In [None]:
# Apply ontologies
# Same matching as for the clinical trials (Jaccard threshold 0.5), applied
# to the PubMed keyword columns.
df['condition'] = df.apply(lambda row: match_terms(row,diseases,['KW_RAKE_Abstract','KW_RAKE_Title','KW_LDA','KW_Autophrase'],0.5),axis=1)
df['subclass'] = df.apply(lambda row: match_terms(row, subclasses,['KW_RAKE_Abstract','KW_RAKE_Title','KW_LDA','KW_Autophrase'],0.5), axis=1)
# Disabled alternative; `conditions` is not defined in the visible cells.
#df['condition'] = df.apply(lambda row: match_terms(row, conditions,['KW_RAKE_Abstract','KW_RAKE_Title','KW_LDA','KW_Autophrase'],0.5), axis=1)

In [None]:
# Consolidate synonyms
# Overwrites both columns in place with the output of consol_KWs.
df['condition'] = df.apply(lambda row: consol_KWs(row['condition']),axis=1)
df['subclass'] = df.apply(lambda row: consol_KWs(row['subclass']),axis=1)

In [None]:
# Preview the first rows of the current frame
df.head()

In [None]:
# Write data to new csv file
# to_csv also writes the index as an unnamed first column by default.
# NOTE(review): consider index=False if downstream code allows it.
df.to_csv('new_data/data_with_ontology_terms/pubMed.csv')

## NIH Data

In [29]:
# Read data
# Rebinds `df` to the NIH table.
df = pd.read_csv('new_data/NIH_RAKE.csv')
# KW_RAKE_* columns hold stringified lists; non-string cells pass through.
df['KW_RAKE_Terms'] = df.apply(lambda row: parse_txt(row['KW_RAKE_Terms']), axis = 1)
df['KW_RAKE_Title'] = df.apply(lambda row: parse_txt(row['KW_RAKE_Title']), axis = 1)

# Read LDA keywords
# TODO: no LDA keywords are loaded for the NIH data in this cell.


# Read Autophrase keywords
df_ap = pd.read_csv('new_data/NIH_data_output_withkeyword.csv')
df_ap = df_ap['key_words']
# Joined on the index only.
# NOTE(review): assumes this file's index lines up with df's - confirm.
df = df.join(df_ap).rename(columns={'key_words':'KW_Autophrase'})
df['KW_Autophrase'] = df.apply(lambda row: parse_txt(row['KW_Autophrase']), axis = 1)

In [None]:
# Apply ontologies
# Same matching as for the other datasets (Jaccard threshold 0.5).
# NOTE(review): 'PROJECT_TERMS' is not converted to a list anywhere in the
# visible cells, unlike the other fields - confirm that match_terms handles
# its raw form as intended.
df['condition'] = df.apply(lambda row: match_terms(row,diseases,['KW_RAKE_Title','KW_RAKE_Terms','PROJECT_TERMS','KW_Autophrase'],0.5),axis=1)
df['subclass'] = df.apply(lambda row: match_terms(row, subclasses,['KW_RAKE_Title','KW_RAKE_Terms','PROJECT_TERMS','KW_Autophrase'],0.5), axis=1)

In [None]:
# Consolidate synonyms
# Overwrites both columns in place with the output of consol_KWs.
df['condition'] = df.apply(lambda row: consol_KWs(row['condition']),axis=1)
df['subclass'] = df.apply(lambda row: consol_KWs(row['subclass']),axis=1)


In [None]:
# Write data to new csv file
# to_csv also writes the index as an unnamed first column by default.
# NOTE(review): consider index=False if downstream code allows it.
df.to_csv('new_data/data_with_ontology_terms/NIH.csv')