In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import random
import time

In [2]:
# Build the Chrome session that the scraping cells below drive.
options = webdriver.ChromeOptions()
chrome_flags = (
    # No visible browser window.
    "--headless",
    # Present a regular desktop-Chrome user agent, since some sites turn away
    # clients that identify themselves as automated browsers.
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
)
for flag in chrome_flags:
    options.add_argument(flag)

# ChromeDriverManager().install() supplies the chromedriver path handed to the Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

In [3]:
# Input list of drugs to scrape; later cells read each row's "Drug URL".
# The file's unnamed first column is loaded as a regular column and shows up
# as "Unnamed: 0" in the preview.
drug_list_path = "drugbank_selenium_drugs.csv"
df = pd.read_csv(drug_list_path)

# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,Drug Name,Drug URL
0,1-Palmitoyl-2-oleoyl-sn-glycero-3-(phospho-rac...,https://go.drugbank.com/drugs/DB11331
1,"1,2-Benzodiazepine",https://go.drugbank.com/drugs/DB12537
2,"1,2-Distearoyllecithin",https://go.drugbank.com/drugs/DB14099
3,"1,2-icosapentoyl-sn-glycero-3-phosphoserine",https://go.drugbank.com/drugs/DB14096
4,2-mercaptobenzothiazole,https://go.drugbank.com/drugs/DB11496


In [4]:
# Pre-create the two text columns, blank for every row, that the scraping
# loop fills in. Re-running this cell resets both columns to empty strings.
for new_column in ("Drug Background", "Drug Indication"):
    df[new_column] = ""

In [5]:
# Collects the row index (not the URL itself) of every drug whose scrape
# raised an exception, so those rows can be retried later.
missed_urls = list()

In [None]:
def _section_text(browser, section_id):
    """Return the paragraph text of one section of the currently loaded page.

    Looks up every <p> inside the first <dd> that follows <dt id=section_id>
    and joins the non-blank paragraphs with single spaces.

    Parameters:
        browser: the Selenium WebDriver whose current page is inspected.
        section_id: value of the <dt> element's id attribute
            (this notebook uses "background" and "indication").

    Returns:
        The joined text, or "" when the section is absent, has no non-blank
        paragraphs, or consists only of the placeholder "Not Available"
        (compared case-insensitively).
    """
    paragraphs = browser.find_elements(
        By.XPATH, f"//dt[@id='{section_id}']/following-sibling::dd[1]//p"
    )
    # Read each element's text exactly once: every `.text` access is a separate
    # round-trip to the browser, and reading it twice both doubles the traffic
    # and widens the window in which the element can go stale.
    texts = [para.text for para in paragraphs]
    joined = ' '.join(text for text in texts if text.strip())
    return "" if joined.lower().strip() == "not available" else joined


# Visit every drug page and copy its Background / Indication text into df.
# Rows whose page cannot be loaded or parsed are recorded in missed_urls
# (by row index) and left blank, so one bad page does not stop the whole run.
for i, row in df.iterrows():
    url = row["Drug URL"]

    try:
        # Navigation is inside the try block on purpose: a failed page load
        # (timeout, dropped connection, crashed tab) must be logged as a missed
        # row rather than abort the loop and lose the remaining rows.
        driver.get(url)
        time.sleep(2)  # give the page a moment to finish rendering

        # Fetch both sections before writing either, so a failure on the
        # second one does not leave the row half-filled.
        background_text = _section_text(driver, "background")
        indication_text = _section_text(driver, "indication")

        df.loc[i, 'Drug Background'] = background_text
        df.loc[i, 'Drug Indication'] = indication_text
    except Exception as e:
        missed_urls.append(i)
        print(f"Error loading {url} on row {i+1}: {e}")

    # Longer randomized pause after every 20th row to avoid hammering the site.
    if (i + 1) % 20 == 0:
        print(f"Time to sleep, finished until row {i + 1}")
        time.sleep(random.uniform(10,15))

    # Checkpoint to disk after every 50th row so an interrupted run keeps its progress.
    if (i + 1) % 50 == 0:
        print(f"Finished until row {i + 1}")
        df.to_csv("drugbank_drugs_details.csv")

# Final save, covering the rows processed since the last checkpoint.
df.to_csv("drugbank_drugs_details.csv")

Finished until row 50
Finished until row 100


In [6]:
# Show five randomly chosen rows to eyeball the scraped text columns.
df.sample(n=5)

Unnamed: 0,Drug Name,Drug URL,Drug Background,Drug Indication
0,1-Palmitoyl-2-oleoyl-sn-glycero-3-(phospho-rac...,https://go.drugbank.com/drugs/DB11331,Palmitoyloleoyl-phosphatidylglycerol was a com...,Palmitoyloleoyl-phosphatidylglycerol was a com...
1,"1,2-Benzodiazepine",https://go.drugbank.com/drugs/DB12537,Benzodiazepine is under investigation for the ...,
2,"1,2-Distearoyllecithin",https://go.drugbank.com/drugs/DB14099,,
3,"1,2-icosapentoyl-sn-glycero-3-phosphoserine",https://go.drugbank.com/drugs/DB14096,,
4,2-mercaptobenzothiazole,https://go.drugbank.com/drugs/DB11496,,


In [7]:
# Display the rows that failed during scraping. The entries are the DataFrame
# index values appended by the loop's exception handler, not URL strings.
missed_urls

[11, 92, 132, 135]

In [None]:
# Spot-check the section scraping against a single DrugBank page.
url = "https://go.drugbank.com/drugs/DB12001"
driver.get(url)

# Paragraphs inside the first <dd> after the "background" / "indication" <dt>.
background_paragraphs = driver.find_elements(
    By.XPATH,
    "//dt[@id='background']/following-sibling::dd[1]//p",
)
indication_paragraphs = driver.find_elements(
    By.XPATH,
    "//dt[@id='indication']/following-sibling::dd[1]//p",
)

# Keep only paragraphs with visible text and join them with single spaces.
background_text = ' '.join(
    paragraph.text for paragraph in background_paragraphs if paragraph.text.strip()
)
indication_text = ' '.join(
    paragraph.text for paragraph in indication_paragraphs if paragraph.text.strip()
)

# Print the background first, then the indication, one per line.
for section_text in (background_text, indication_text):
    print(section_text)
        


Abemaciclib is an antitumor agent and dual inhibitor of cyclin-dependent kinases 4 (CDK4) and 6 (CDK6) that are involved in the cell cycle and promotion of cancer cell growth in case of unregulated activity. On September 28, 2017, FDA granted approval of abemaciclib treatment under the market name Verzenio for the treatment of HR-positive and HER2-negative advanced or metastatic breast cancer that has progressed after unsuccessful endocrine therapy. It is either given alone in patients who has undergone endocrine therapy and chemotherapy after the metastasis of cancer, or in combination with Fulvestrant. Following oral treatment in patients with HR-positive, HER2-negative breast cancer, abemaciclib demonstrated increased progression-free survival rates and objective response rates. Abemaciclib has been used in trials studying the treatment of melanoma, lymphoma, neoplasm, solid tumor, and glioblastoma.
Indicated in combination with fulvestrant for the treatment of women with hormone re