### Imports

In [1]:
from selenium import webdriver
import string 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType
from alive_progress import alive_bar
import time
import pandas as pd
import os

### Functions

In [2]:
# Notebooks have no real __file__, so the current working directory is used as
# the project root (this is where the downloaded web driver gets stored).
ROOT_DIR = os.getcwd()

def check_exists(tag):
    """Tell whether the page currently loaded in ``driver`` contains ``tag``.

    Args:
        tag: CSS selector string to look up.

    Returns:
        True when the lookup succeeds, False when Selenium raises
        NoSuchElementException. Any other exception propagates.

    Note:
        Reads the module-level ``driver``, which must exist before the call.
    """
    try:
        driver.find_element(By.CSS_SELECTOR, tag)
    except NoSuchElementException:
        found = False
    else:
        found = True
    return found

### Fetching URLs from BBC news on the Russo-Ukrainian War

In [None]:
# Driver config: webdriver_manager resolves a driver for the local Brave
# install; path=ROOT_DIR tells it where to keep the downloaded binary.
s = Service(ChromeDriverManager(chrome_type=ChromeType.BRAVE, path=ROOT_DIR).install())
driver = webdriver.Chrome(service=s)

# Waits: both poll for up to 10 seconds; `wait2` additionally ignores
# stale-element errors while polling.
ignored_exceptions = (
    NoSuchElementException,
    StaleElementReferenceException,
)
wait = WebDriverWait(driver, 10)
wait2 = WebDriverWait(driver, 10, ignored_exceptions=ignored_exceptions)
buffering_time = 1  # seconds to pause before reading each listing page

# CSS Selector Tags
popup_tag = "button[class='tp-close tp-active']"
title_tag = "a[class='qa-heading-link lx-stream-post__header-link']"
nextPage_tag = "a[class='lx-pagination__btn gs-u-mr+ qa-pagination-next-page lx-pagination__btn--active']"
numPages_tag = (
    "span[class='lx-pagination__page-number qa-pagination-total-page-number']"
)

# Scraper
urls = []
bbc = "https://bbc.com/news/world-60525350"

print("-> Launching Chrome...")

# try/finally guarantees the browser is closed even when a wait times out or a
# selector no longer matches.
try:
    driver.get(bbc)

    popup = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, popup_tag)))
    popup.click()

    # Hard-coded page count. The total shown by the pager could be read with:
    #   int(driver.find_element(By.CSS_SELECTOR, numPages_tag).text)
    num_pages = 3

    with alive_bar(
        num_pages, title="-> Scraper", spinner="dots_waves", bar="smooth", force_tty=True
    ) as bar:
        for page in range(num_pages):
            time.sleep(buffering_time)
            titles = wait2.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, title_tag))
            )
            for title in titles:
                href = title.get_attribute("href")
                # Skip anchors without an href and links containing "live".
                if href and "live" not in href:
                    urls.append(href)
            bar()

            # Every page is scraped, but "next" is only clicked while there is
            # another page left to visit.
            if page < num_pages - 1:
                next_button = driver.find_element(By.CSS_SELECTOR, nextPage_tag)
                next_button.click()
finally:
    driver.quit()

# dict.fromkeys de-duplicates while preserving first-seen order.
unique_urls = list(dict.fromkeys(urls))
print(f"-> {len(unique_urls)} URLs fetched successfully!")

### Parsing URLs for titles and text

In [None]:
# Driver config: a fresh browser session for visiting every collected article.
s = Service(ChromeDriverManager(chrome_type=ChromeType.BRAVE, path=ROOT_DIR).install())
driver = webdriver.Chrome(service=s)

# CSS Selector Tags
# NOTE(review): the ssrcss-* class names look build-generated and may change
# when the site is redeployed — confirm they still match before re-running.
pgTitle_tag = "h1[class='ssrcss-15xko80-StyledHeading e1fj1fc10']"
pgTitle_tag_2 = "h1[class='ssrcss-1qr3f1s-StyledHeading e1fj1fc10']"
pgText_tag = "p[class='ssrcss-1q0x1qg-Paragraph eq5iqo00']"
date_tag = "time[data-testid='timestamp']"

# Parser: four index-aligned lists, one entry per visited URL.
texts = []
titles = []
dates = []
urls = []

print("-> Launching Chrome...")

# try/finally guarantees the browser is closed even if a page fails to load.
try:
    with alive_bar(
        len(unique_urls), spinner="dots_waves", title="-> Parser", bar="smooth", force_tty=True,
    ) as bar:
        for url in unique_urls:
            driver.get(url)

            # find_elements returns an empty list instead of raising, so each
            # element is looked up once rather than "check, then fetch again".
            popups = driver.find_elements(By.CSS_SELECTOR, popup_tag)
            if popups:
                popups[0].click()

            # Two heading selectors are tried in order; the first match wins.
            tempTitle = ""
            for heading_tag in (pgTitle_tag, pgTitle_tag_2):
                headings = driver.find_elements(By.CSS_SELECTOR, heading_tag)
                if headings:
                    tempTitle = headings[0].text
                    break

            tempDate = ""
            stamps = driver.find_elements(By.CSS_SELECTOR, date_tag)
            if stamps:
                # get_attribute returns None when the attribute is absent.
                tempDate = stamps[0].get_attribute("datetime") or ""

            # Each paragraph is prefixed with a single space, so a page with
            # no matching paragraphs yields "".
            phrases = driver.find_elements(By.CSS_SELECTOR, pgText_tag)
            tempText = "".join(" " + phrase.text for phrase in phrases)

            texts.append(tempText)
            titles.append(tempTitle)
            dates.append(tempDate)
            urls.append(url)
            bar()
finally:
    driver.quit()

dict_data = {"URL":urls,"Date":dates,"Title": titles, "Text": texts}
data = pd.DataFrame(dict_data)

print("Database generated successfully!")

In [45]:
# Filtering undesirable data
# `sig` is promo/copyright boilerplate that shows up inside the scraped text
# (presumably because it matches the same paragraph selector — confirm).
sig = "From Top Gun: Maverick to Turning Red and Everything Everywhere All at Once When fridges didn't exist, locals had to find other ways to keep food cool The role self-deception plays in leading people astray © 2022 BBC. The BBC is not responsible for the content of external sites. Read about our approach to external linking."
# Both patterns are literal text, so regex=False keeps "." from acting as a
# wildcard.
data["Text"] = data["Text"].str.replace(sig, "", regex=False)
data["Date"] = data["Date"].str.replace(".000Z", "", regex=False)
# Empty and whitespace-only cells become NaN. Scraped text starts with a
# space, so a page holding only the boilerplate is left as " " and must be
# dropped with the rows that had no text at all.
data = (
    data.replace(r"^\s*$", float("NaN"), regex=True)
    .dropna(subset=["Text"])
    .reset_index(drop=True)
)

os.makedirs(os.path.join(os.getcwd(), "data"), exist_ok=True)
data.to_csv("data/bbc.csv", index=True, header=True)
data.head(25)

Unnamed: 0,URL,Date,Title,Text
0,https://www.bbc.com/news/world-europe-61757829,2022-06-11T00:57:56,Russia's war prompts regional rush to become E...,European Union leaders are to decide later th...
1,https://www.bbc.com/news/world-europe-61749877,2022-06-11T00:53:56,Ukraine's prosecutors wrestle with a new role:...,It took Vadym Bobryntsev four days to bury hi...
2,https://www.bbc.com/news/world-europe-61767191,2022-06-10T21:14:22,Putin and Peter the Great: Russian leader like...,Vladimir Putin's admiration for Peter the Gre...
3,https://www.bbc.com/news/uk-61756025,2022-06-10T19:51:50,Do everything to free Aiden Aslin and Shaun Pi...,"Boris Johnson has urged ministers to do ""ever..."
4,https://www.bbc.com/news/world-europe-61764196,2022-06-10T19:39:44,Ukraine round-up: Mariupol cholera warning and...,"The Ukrainian port city of Mariupol, all but ..."
5,https://www.bbc.com/news/uk-61754684,2022-06-10T19:33:12,Families of condemned Britons Aiden Aslin and ...,The families of two Britons sentenced to deat...
6,https://www.bbc.com/news/world-europe-61762787,2022-06-10T17:56:29,Cholera in Mariupol: Ruined city at risk of ma...,"The Ukrainian port city of Mariupol, all but ..."
7,https://www.bbc.com/news/world-asia-india-6078...,2022-06-10T15:53:01,Ukraine crisis: Why is India buying Russian oil?,There's been a significant increase in India'...
8,https://www.bbc.com/news/world-europe-60506682,2022-06-10T11:42:47,Ukraine war in maps: Tracking the Russian inva...,"Heavy fighting continues in east Ukraine, wit..."
9,https://www.bbc.com/news/business-61727807,2022-06-10T10:08:29,Russia's new version of McDonald's unveils logo,The Russian fast food chain that was formerly...
