# ClinicalTrials.gov ETL
---
## The purpose of this script is to scrape research locations from the ClinicalTrials.gov website and load them into a database for warehousing.

In [1]:
# import dependencies
import bs4
from collections import defaultdict    
from bs4 import BeautifulSoup    
import requests
import pandas as pd
import pymongo
import json
from bson import json_util

# Create MongoDB connection

In [2]:
# Open a client against the MongoDB server at the given URI
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Get a handle to the "clinical_trial_db" database
db = client['clinical_trial_db']

# Get a handle to the "ct_gov" collection, then drop it so that
# every run of the notebook starts from an empty collection
collection = db['ct_gov']
db.drop_collection(collection)

{'ok': 0.0,
 'errmsg': 'ns not found',
 'code': 26,
 'codeName': 'NamespaceNotFound'}

# Extract and transform XML site data

In [3]:
# Input: Research id (NCT number) used for querying clinicaltrials.gov
# Output: Returns a dictionary of lists of cleaned XML site data
# Purpose: Function takes an nctid string as input and outputs cleaned XML data.
#          The nctid is queried on the website using the requests.get(URL) method.
#          The returned XML is loaded into a soup object, which is then used
#          to parse the XML into the "data" dictionary list object.
def clinicalTrialsGov(nctid, timeout=30):
    """Scrape name/status/city/zip values for one study from ClinicalTrials.gov.

    Args:
        nctid: Study identifier string appended to the request URL
            (for example ``'NCT01592370'``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A ``defaultdict(list)`` keyed by ``'ctName'``, ``'ctStatus'``,
        ``'ctCity'`` and ``'ctZip'`` (only for tags that actually occur).
        Each value is the list of whitespace-stripped texts of every matching
        tag, in document order. Tags are collected across the whole document,
        so the lists line up row-by-row only when every site supplies all four
        tags -- NOTE(review): confirm against the feed before relying on it.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = "https://clinicaltrials.gov/ct2/show/{}?displayxml=true".format(nctid)
    response = requests.get(url, timeout=timeout)

    # Fail loudly on 4xx/5xx instead of parsing an error page as trial data
    response.raise_for_status()

    # Hand the raw bytes to the parser so the encoding is taken from the XML
    # itself rather than from requests' guess of the text encoding
    soup = BeautifulSoup(response.content, "xml")

    # Tags that will be scraped from the "soup" object
    subset = ['name', 'status', 'city', 'zip']

    # Group the stripped text of every matching tag under "ct<Tagname>"
    data = defaultdict(list)
    for tag in soup.find_all(subset):
        key = 'ct{}'.format(tag.name.capitalize())
        data[key].append(tag.get_text(strip=True))

    return data

# Scrape the site data for study NCT01592370 into the "data" dictionary
data = clinicalTrialsGov(nctid='NCT01592370')

# Load to MongoDB

In [4]:
# Build one row per scraped site from the "data" object.
# The four lists must have the same length for pandas to accept them.
df = pd.DataFrame({
    'name': data['ctName'],
    'status': data['ctStatus'],
    'city': data['ctCity'],
    'zip': data['ctZip'],
})
# Insert the research tag as the first column of every row
df.insert(0, 'NCT', 'NCT01592370')

# Convert the dataframe straight into a list of {column: value} documents
# (no JSON serialise/parse round-trip needed)
documents = df.to_dict('records')

# Load the documents into the collection "ct_gov" in the "clinical_trial_db"
# database; insert_many rejects an empty batch, so skip it when nothing was scraped
if documents:
    db.ct_gov.insert_many(documents)
else:
    print('No site records were scraped; nothing inserted.')

# Show contents of database
records = db.ct_gov.find()
for record in records:
    print(record)

{'_id': ObjectId('5cb36e58ecb18b0180aac644'), 'NCT': 'NCT01592370', 'name': 'Local Institution', 'status': 'Withdrawn', 'city': 'Little Rock', 'zip': '72205'}
{'_id': ObjectId('5cb36e58ecb18b0180aac645'), 'NCT': 'NCT01592370', 'name': 'Local Institution', 'status': 'Not yet recruiting', 'city': 'Fresno', 'zip': '93701'}
{'_id': ObjectId('5cb36e58ecb18b0180aac646'), 'NCT': 'NCT01592370', 'name': 'Local Institution', 'status': 'Withdrawn', 'city': 'Long Beach', 'zip': '90813'}
{'_id': ObjectId('5cb36e58ecb18b0180aac647'), 'NCT': 'NCT01592370', 'name': 'Division Of Hematology & Oncology Ctr. For Health Sciences', 'status': 'Active, not recruiting', 'city': 'Los Angeles', 'zip': '90095'}
{'_id': ObjectId('5cb36e58ecb18b0180aac648'), 'NCT': 'NCT01592370', 'name': 'University of Colorado Denver', 'status': 'Recruiting', 'city': 'Aurora', 'zip': '80045'}
{'_id': ObjectId('5cb36e58ecb18b0180aac649'), 'NCT': 'NCT01592370', 'name': 'Yale University School Of Medicine', 'status': 'Completed', 'ci