# Bristol Myers Squibb ETL
---
## Purpose: scrape clinical-trial research site locations and their recruitment status from the BMS Study Connect website and load them into MongoDB.

In [1]:
# Dependencies

# Web scraping dependencies
from bs4 import BeautifulSoup

# Asynchronous scraping dependencies
import time
from selenium import webdriver

# Database deployment dependencies
import pymongo

# Data Dependencies
import pandas as pd

# Create MongoDB connection

In [2]:
# Connection string for the MongoDB server running on this machine
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Database and collection handles (subscript access is equivalent to attribute access)
db = client["clinical_trial_db"]
collection = db["bristol_myers_squibb"]

# Drop the collection so every run of the notebook starts from an empty collection.
# Kept as the last expression so the server's reply is shown as the cell output.
db.drop_collection(collection)

{'ok': 0.0,
 'errmsg': 'ns not found',
 'code': 26,
 'codeName': 'NamespaceNotFound'}

# Extract HTML Site Data From BMS

In [3]:
# Chromedriver set-up
executable_path = {'executable_path': 'chromedriver.exe'}
url = 'https://www.bmsstudyconnect.com/content/studyconnect/us/en/sites-list.html?id=NCT01357668&language=en&geo_lati=&geo_long=&location='

# Seconds to wait after navigation so the page can finish loading before it is scraped
PAGE_LOAD_WAIT_SECONDS = 20

browser = webdriver.Chrome(**executable_path)
try:
    browser.get(url)

    # Fixed delay so that the page can load and all information can be scraped
    time.sleep(PAGE_LOAD_WAIT_SECONDS)

    # Capture the rendered html once the wait is over
    html = browser.page_source
finally:
    # Always end the browser session, even when navigation fails; nothing else in
    # this notebook closes it, so without this each run leaves a browser open.
    browser.quit()

soup = BeautifulSoup(html, 'html.parser')

# Only keep divs whose class attribute matches the research-site result boxes
research_divs = soup.find_all("div", {"class": "grid-x grid-margin-x bmssc-result-item bmssc-site-result"})

# Find the trial id on the page as well.
# NOTE: "trail-id-each" is the id this notebook has always queried; the spelling is kept as-is.
trial_id_span = soup.find("span", id="trail-id-each")
if trial_id_span is None:
    # find() returns None when nothing matches; fail with a clear message instead of
    # an opaque "'NoneType' object has no attribute 'text'".
    raise ValueError(
        "Could not find the trial id element (span#trail-id-each); "
        "the page may not have finished loading or its layout has changed."
    )
trial_id = trial_id_span.text

# Parse HTML and load to database

In [4]:
# Loop through the result boxes in research_divs.
# Each box holds one research site location and its current recruitment status.
# Every box that has both a location and a recognised status is inserted into the
# collection as { trial_id: { "status": ..., "location": ... } }.

# Exact class-attribute strings used to locate elements inside a result box
LOCATION_CLASS = "bmssc-type-medium--alternate bmssc-color-dark-grey"

# Status badges to look for, in priority order:
#   1. the research site is currently recruiting
#   2. the research site is not recruiting yet
STATUS_CLASSES = (
    "hide-for-medium bmssc-status--indicator is-recruiting active",
    "hide-for-medium bmssc-status--indicator is-notyetrecruiting active",
)

for result in research_divs:
    # find() returns None when the element is absent, so test for that explicitly
    # rather than catching AttributeError. Catching the exception could reuse the
    # previous site's location (or hit an undefined name on the first result).
    location_div = result.find('div', class_=LOCATION_CLASS)
    if location_div is None:
        print("Skipping a result with no location element")
        continue
    location = location_div.text

    # Take the first status badge that is present in this result box
    status_div = None
    for status_class in STATUS_CLASSES:
        status_div = result.find('div', class_=status_class)
        if status_div is not None:
            break

    if status_div is None:
        # Neither known badge is present; skip this site instead of aborting the whole load
        print("Skipping", location, "- no recognised recruitment status element")
        continue

    # Remove spaces and newlines from the badge text (same clean-up as before)
    status = status_div.text.replace(" ", "").replace("\n", "")

    # Build the document and insert it into the collection
    location_dict = { trial_id: { "status" : status, "location": location} }
    collection.insert_one(location_dict)

In [5]:
# Query every document in the bristol_myers_squibb collection and print each one
listings = db["bristol_myers_squibb"].find()

for document in listings:
    print(document)

{'_id': ObjectId('5cb1709decb18b605c898ce0'), 'IM101-240': {'status': 'Recruiting', 'location': 'Little Rock, Arkansas 72202'}}
{'_id': ObjectId('5cb1709decb18b605c898ce1'), 'IM101-240': {'status': 'Recruiting', 'location': 'Valencia 46009'}}
{'_id': ObjectId('5cb1709decb18b605c898ce2'), 'IM101-240': {'status': 'Recruiting', 'location': 'Los Angeles, California 90027'}}
{'_id': ObjectId('5cb1709decb18b605c898ce3'), 'IM101-240': {'status': 'Recruiting', 'location': 'Augusta, Georgia 30912'}}
{'_id': ObjectId('5cb1709decb18b605c898ce4'), 'IM101-240': {'status': 'Recruiting', 'location': 'Chicago, Illinois 60611'}}
{'_id': ObjectId('5cb1709decb18b605c898ce5'), 'IM101-240': {'status': 'Recruiting', 'location': 'Chicago, Illinois 60637'}}
{'_id': ObjectId('5cb1709decb18b605c898ce6'), 'IM101-240': {'status': 'Recruiting', 'location': 'Indianapolis, Indiana 46202'}}
{'_id': ObjectId('5cb1709decb18b605c898ce7'), 'IM101-240': {'status': 'Recruiting', 'location': 'Minneapolis, Minnesota 55454'}}