In [1]:
# Parse data exported as XML from ClinicalTrials.gov
# Pull "detailed description", "arms and interventions" and "eligibility criteria"
# Mary Kate Montgomery

In [47]:
# Import libraries
import xmltodict as xml
import pandas as pd

In [78]:
# Load the ClinicalTrials.gov search-results export and preview the first rows
df = pd.read_csv(filepath_or_buffer='new_data/ClinicalTrialsGov-Pacemaker.csv')
df.head(n=5)

Unnamed: 0,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,Sponsor/Collaborators,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL
0,1,NCT04926792,Taiwan Registry for Leadless Pacemaker,,Not yet recruiting,No Results Available,Performance of the Leadless Pacemaker in Taiwa...,Device: Leadless Pacemaker,Acute complication|Subacute complication|Leadl...,National Taiwan University Hospital|Taiwan Soc...,...,202103105RINB,July 2021,June 2024,June 2025,"June 15, 2021",,"June 15, 2021",,,https://ClinicalTrials.gov/show/NCT04926792
1,2,NCT02931760,Placement Of Cardiac PacemaKer Trial (POCKET)-RCT,POCKET-RCT,Unknown status,No Results Available,Pacemaker Ddd,Device: subcutaneous pacemaker|Device: intramu...,patient overall satisfaction|complications due...,Region Gävleborg,...,2016371,October 2016,December 2017,June 2019,"October 13, 2016",,"October 13, 2016","Region Gavleborg, Gävle, Sweden",,https://ClinicalTrials.gov/show/NCT02931760
2,3,NCT03876600,Cost-utility Analysis of Ambulatory Care Compa...,START,Not yet recruiting,No Results Available,Pacemaker Ddd,Procedure: Replacement of permanent pacemakers...,"Cost-utility analysis,from the perspective of ...","Nantes University Hospital|Ministry of Health,...",...,RC18_0034,April 2019,April 2022,April 2022,"March 15, 2019",,"March 15, 2019","Lyon University Hospital, Lyon, Bron, France|A...",,https://ClinicalTrials.gov/show/NCT03876600
3,4,NCT03388281,"Morbidity, Mortality and Gender Differences in...",,Completed,No Results Available,"Pacemaker, Artificial|Cohort Studies|Arrhythmi...","Other: No intervention, retrospective analysis...",Survival after pacemaker implantation|Implanta...,Medical University of Vienna,...,EK1525/2015,"July 28, 2015","March 31, 2016","March 31, 2016","January 2, 2018",,"January 2, 2018",,,https://ClinicalTrials.gov/show/NCT03388281
4,5,NCT04163770,Evaluation Of Pacemakers in Children,,Unknown status,No Results Available,Disorder of Cardiac Pacemaker System,Device: caliberation of pacemaker,Performance of pacemakers in children,Assiut University,...,EPCP,"January 15, 2020","January 15, 2021","February 15, 2021","November 15, 2019",,"November 15, 2019",,,https://ClinicalTrials.gov/show/NCT04163770


In [79]:
def readXml(row):
    """Add free-text fields from a trial's ClinicalTrials.gov XML file to ``row``.

    The XML file is located from the row's 'NCT Number' under
    'Pacemaker/ClinTrials_XML/'. Three keys are set on ``row``:

    * 'Detailed Description' -- clinical_study/detailed_description/textblock
    * 'Study Arms' -- the 'arm_group_label' values; a list of labels when the
      parsed 'arm_group' is a list, otherwise the single label itself
    * 'Eligibility Criteria' -- clinical_study/eligibility/criteria/textblock

    Any field that is missing (or empty) in the XML is left as ''.

    Parameters
    ----------
    row : mapping-like, e.g. the pandas Series passed by ``df.apply(..., axis=1)``
        Must contain 'NCT Number'. The three keys above are assigned on it.

    Returns
    -------
    The same ``row`` object, with the three keys set.

    Raises
    ------
    FileNotFoundError
        If there is no XML file for the row's NCT number.
    """
    # Named nct_id rather than `id` so the builtin id() is not shadowed.
    nct_id = row['NCT Number']

    # Initialize with empty values so every row gets all three columns
    row['Detailed Description'] = ''
    row['Study Arms'] = ''
    row['Eligibility Criteria'] = ''

    # Read file into dict.
    # Explicit UTF-8: the records contain non-ASCII text, and the platform
    # default encoding is not UTF-8 everywhere.
    # NOTE(review): assumes the exported XML files are UTF-8 -- confirm.
    with open('Pacemaker/ClinTrials_XML/' + nct_id + '.xml', encoding='utf-8') as fd:
        # The notebook imports the parser as `import xmltodict as xml`, so the
        # module must be reached through that alias; the bare name `xmltodict`
        # is not defined and raised NameError.
        doc = xml.parse(fd.read())

    study = doc.get('clinical_study')
    if not study:
        return row

    # Get detailed description (`or {}` / `or ''` guard against empty elements,
    # which the parser yields as None)
    description = study.get('detailed_description') or {}
    row['Detailed Description'] = description.get('textblock') or ''

    # Get arms and interventions
    arm_group = study.get('arm_group')
    if arm_group:
        if isinstance(arm_group, list):
            # Several arms: collect every label
            row['Study Arms'] = [arm['arm_group_label'] for arm in arm_group]
        else:
            # Single arm: keep the bare label, as before
            row['Study Arms'] = arm_group['arm_group_label']

    # Get eligibility criteria
    criteria = (study.get('eligibility') or {}).get('criteria') or {}
    row['Eligibility Criteria'] = criteria.get('textblock') or ''

    return row

# Enrich every trial row with the fields parsed from its XML file
df = df.apply(readXml, axis=1)

In [81]:
# Save the enriched trial table as a new CSV
df.to_csv(path_or_buf='new_data/ClinicalTrialsGov-Pacemaker-wContent.csv')