# Bristol Myers Squibb ETL
---
## The purpose of this notebook is to scrape clinical trial research site locations (and their recruitment status) from the BMS Study Connect website and load them into MongoDB.

In [44]:
# Dependencies

# Web scraping dependencies
from bs4 import BeautifulSoup

# Asynchronous scraping dependencies
import time
from selenium import webdriver

# Database deployment dependencies
import pymongo

# Data Dependencies
import pandas as pd

# Create MongoDB connection

In [52]:
# Connection string for the MongoDB server on this machine
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Select the database and the collection that the scraped records are written to
db = client['clinical_trial_db']
collection = db['bristol_myers_squibb']

# Extract HTML Site Data From BMS

In [17]:
# Chromedriver set-up
executable_path = {'executable_path': 'chromedriver.exe'}
browser = webdriver.Chrome(**executable_path)
url = 'https://www.bmsstudyconnect.com/content/studyconnect/us/en/sites-list.html?id=NCT01357668&language=en&geo_lati=&geo_long=&location='

# Number of seconds to wait so that the page can load and all information can be scraped
page_load_wait_seconds = 20

try:
    browser.get(url)

    # Give the page time to finish loading before reading its HTML
    time.sleep(page_load_wait_seconds)

    # Scrape the html on the site after the timer is done
    html = browser.page_source
finally:
    # Always shut down the browser and its chromedriver process, even if the
    # page failed to load, so no orphaned processes are left behind
    browser.quit()

soup = BeautifulSoup(html, 'html.parser')

# set the class definition to only find divs that match a specific kind of class
research_divs = soup.find_all("div", {"class": "grid-x grid-margin-x bmssc-result-item bmssc-site-result"})

# find the trial_id from the page as well
# (find() returns None when the element is missing, so fail with a clear message
# instead of an AttributeError on None)
trial_id_span = soup.find("span", id="trail-id-each")
if trial_id_span is None:
    raise ValueError("Could not find the trial id element (span id='trail-id-each') in the scraped page")
trial_id = trial_id_span.text

# Parse HTML and load to database

In [50]:
# Loop through returned results in research_divs
# These list boxes contain the research site locations and their current recruitment status

# Class attribute of the status indicator div, in the order they are checked:
#   Case 1: the research site is currently recruiting
#   Case 2: the research site is not recruiting yet
status_classes = (
    "hide-for-medium bmssc-status--indicator is-recruiting active",
    "hide-for-medium bmssc-status--indicator is-notyetrecruiting active",
)

for result in research_divs:
    # Return the location of the research site.
    # find() returns None when the div is missing; check explicitly so a missing
    # location can never be confused with a missing status indicator.
    location_div = result.find('div', class_="bmssc-type-medium--alternate bmssc-color-dark-grey")
    if location_div is None:
        print("Skipping a result with no location element")
        continue
    location = location_div.text

    # Use the first status indicator that is present in this result
    status = None
    for status_class in status_classes:
        status_div = result.find('div', class_=status_class)
        if status_div is not None:
            # Modify strings: strip the spaces and newlines from the status text
            status = status_div.text.replace(" ", "").replace("\n", "")
            break

    # Neither known status indicator exists: report it and keep processing the
    # remaining results instead of aborting the whole loop
    if status is None:
        print("Skipping result with no recognised recruitment status: " + location)
        continue

    # update location_dict
    location_dict = { trial_id: { "status" : status, "location": location} }

    # insert into the collection
    collection.insert_one(location_dict)

In [53]:
# Query every document currently stored in the bristol_myers_squibb collection
listings = db['bristol_myers_squibb'].find()

# Print the returned documents one per line
for listing in listings:
    print(listing)

{'_id': ObjectId('5cb15975ecb18b0fd0dc07ed'), 'Little Rock, Arkansas 72202': 'Recruiting', 'Valencia 46009': 'Recruiting', 'Los Angeles, California 90027': 'Recruiting', 'Augusta, Georgia 30912': 'Recruiting', 'Chicago, Illinois 60611': 'Recruiting', 'Chicago, Illinois 60637': 'Recruiting', 'Indianapolis, Indiana 46202': 'Recruiting', 'Minneapolis, Minnesota 55454': 'Recruiting', 'Saint Louis, Missouri 63104': 'Recruiting', 'Murcia 30120': 'Recruiting', 'Madrid 28046': 'Recruiting', 'Esplugues (Barcelona) 8950': 'Recruiting', 'Barcelona 8035': 'Recruiting', 'San Juan 00919-5206': 'Recruiting', 'Guadalajara 44620': 'Recruiting', 'Pisa 56100': 'Recruiting', 'Padova 35128': 'Recruiting', 'Genova 16147': 'Recruiting', 'Firenze 50139': 'Recruiting', 'Sankt Augustin D-53757': 'Recruiting', 'Heidelberg 69120': 'Recruiting', 'Hamburg D-22081': 'Recruiting', 'Freiburg 79106': 'Recruiting', 'Berlin 13353': 'Recruiting', 'Bad Bramstedt 24576': 'Recruiting', 'Montreal, Quebec H3H 1P3': 'Recruiting