### Requirements

In [None]:
!pip install pandas
!pip install selenium
!pip install alive-progress
!pip install webdriver-manager

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from alive_progress import alive_bar
import time
import pandas as pd

### Fetching URLs from BBC news on the Russo-Ukranian War

In [3]:
s = Service(ChromeDriverManager(path=r".\\").install())
driver = webdriver.Chrome(service=s)
print("-> Launching Chrome...")
driver.get("https://bbc.com/news/world-60525350")

urls = []

# Ignored Exceptions
ignored_exceptions = (
    NoSuchElementException,
    StaleElementReferenceException,
)

# Waits
wait = WebDriverWait(driver, 10)
wait2 = WebDriverWait(driver, 10, ignored_exceptions=ignored_exceptions)
wait3 = WebDriverWait(driver, 5)

buffering_time = 1

# CSS Selector Tags
popup_tag = "button[class='tp-close tp-active']"
title_tag = "a[class='qa-heading-link lx-stream-post__header-link']"
nextPage_tag = "a[class='lx-pagination__btn gs-u-mr+ qa-pagination-next-page lx-pagination__btn--active']"
numPages_tag = "span[class='lx-pagination__page-number qa-pagination-total-page-number']"

# Scraper
try:
    print("-> Awaiting pop-ups...", end=" ")
    popup = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, popup_tag)))
finally:
    popup.click()
    print("Pop-up eliminated.")
num_pages = 5  # int(driver.find_element(By.CSS_SELECTOR, numPages_tag).text)

with alive_bar(num_pages, spinner="dots_waves", bar="smooth", force_tty=True) as bar:
    for page in range(num_pages - 1):
        bar()
        time.sleep(buffering_time)
        titles = wait2.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, title_tag))
        )
        for title in titles:
            urls.append(title.get_attribute("href"))

        next = driver.find_element(By.CSS_SELECTOR, nextPage_tag)
        next.click()
    bar()

print(f"-> {len(urls)} URLs fetched successfully!")
driver.quit()





[WDM] - Current google-chrome version is 101.0.4951
[WDM] - Get LATEST chromedriver version for 101.0.4951 google-chrome
[WDM] - Driver [.\\.wdm\drivers\chromedriver\win32\101.0.4951.41\chromedriver.exe] found in cache


-> Launching Chrome...
-> Awaiting pop-ups... Pop-up eliminated.
|████████████████████████████████████████| 2/2 [100%] in 1.7s (1.21/s)                                                  
-> 20 URLs fetched successfully!


### Parsing URLs for titles and text

In [None]:
s = Service(ChromeDriverManager(path=r".\\").install())
driver = webdriver.Chrome(service=s)
print("-> Launching Chrome...")

texts = []
titles = []
index = 0

pgTitle_tag = "h1[class='ssrcss-15xko80-StyledHeading e1fj1fc10']"
pgTitle_tag_2 = "h1[class='ssrcss-1qr3f1s-StyledHeading e1fj1fc10']"
pgText_tag = "p[class='ssrcss-1q0x1qg-Paragraph eq5iqo00']"


def check_exists(tag):
    try:
        driver.find_element(By.CSS_SELECTOR, tag)
    except NoSuchElementException:
        return False
    return True


for url in urls:

    tempTitle = ""
    tempText = ""

    print(url)
    driver.get(url)

    print("-> Awaiting pop-ups...", end=" ")
    if check_exists(popup_tag):
        popup = driver.find_element(By.CSS_SELECTOR, popup_tag)
        popup.click()
        print("Pop-up eliminated.")
    else:
        print("No pop-ups found.")

    if check_exists(pgTitle_tag):
        tempTitle += driver.find_element(By.CSS_SELECTOR, pgTitle_tag).text
    elif check_exists(pgTitle_tag_2):
        tempTitle += driver.find_element(By.CSS_SELECTOR, pgTitle_tag_2).text
    print(f'-> Title: "{tempTitle}"')

    if check_exists(pgText_tag):
        phrases = driver.find_elements(By.CSS_SELECTOR, pgText_tag)
        for phrase in phrases:
            tempText += " " + phrase.text
    print(f"-> Word count: {len(tempText.split())}")

    texts.append(tempText)
    titles.append(tempTitle)

print("Database generated successfully!")
driver.quit()
dict_data = {"Title": titles, "Text": texts}
data = pd.DataFrame(dict_data)
data.head(5)
