### Requirements

In [None]:
!pip install pandas
!pip install selenium
!pip install alive-progress
!pip install webdriver-manager

### Imports

In [88]:
from selenium import webdriver
import string 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from alive_progress import alive_bar
import time
import pandas as pd
import os

### Functions

In [128]:
def check_exists(tag):
    """Tell whether the current page contains an element matching a CSS selector.

    The lookup is performed with the notebook-level ``driver`` object.

    Args:
        tag: CSS selector string handed to ``driver.find_element``.

    Returns:
        True when the lookup succeeds, False when it raises
        ``NoSuchElementException``.
    """
    try:
        driver.find_element(By.CSS_SELECTOR, tag)
        found = True
    except NoSuchElementException:
        # The selector matched nothing on the page.
        found = False
    return found

### Fetching URLs from BBC news on the Russo-Ukrainian War

In [None]:

# Driver config
# Resolve a chromedriver binary through webdriver-manager and start Chrome with it.
s = Service(ChromeDriverManager(path=r".\\").install())
driver = webdriver.Chrome(service=s)

# Waits
# `wait2` keeps polling through missing/stale elements instead of failing on them.
ignored_exceptions = (NoSuchElementException, StaleElementReferenceException,)
wait = WebDriverWait(driver, 10)
wait2 = WebDriverWait(driver, 10, ignored_exceptions=ignored_exceptions)
buffering_time = 0.5  # seconds to pause before reading each listing page

# CSS Selector Tags
popup_tag = "button[class='tp-close tp-active']"
title_tag = "a[class='qa-heading-link lx-stream-post__header-link']"
nextPage_tag = "a[class='lx-pagination__btn gs-u-mr+ qa-pagination-next-page lx-pagination__btn--active']"
numPages_tag = "span[class='lx-pagination__page-number qa-pagination-total-page-number']"

# Scraper
urls = []
bbc = "https://bbc.com/news/world-60525350"

print("-> Launching Chrome...")

# The browser is closed in `finally` so a failure mid-scrape does not leave a
# Chrome/chromedriver process running.
try:
    driver.get(bbc)

    # Wait for the popup's close button and dismiss it before touching the page.
    popup = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, popup_tag)))
    popup.click()

    # Hard-coded page count; the commented expression reads it from the page instead.
    num_pages = 3  # int(driver.find_element(By.CSS_SELECTOR, numPages_tag).text)

    with alive_bar(num_pages, title="-> Scraper", spinner="dots_waves", bar="smooth", force_tty=True) as bar:
        for page in range(num_pages):
            time.sleep(buffering_time)
            titles = wait2.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, title_tag))
            )
            for title in titles:
                # Read the attribute once; skip anchors without an href and
                # any link whose URL contains "live".
                href = title.get_attribute("href")
                if href and "live" not in href:
                    urls.append(href)

            # Every page (including the last one) is scraped; the "next" button
            # is only clicked while there is another page left to visit.
            if page < num_pages - 1:
                next_button = driver.find_element(By.CSS_SELECTOR, nextPage_tag)
                next_button.click()
            bar()
finally:
    driver.quit()

# dict.fromkeys drops duplicate URLs while keeping first-seen order.
unique_urls = list(dict.fromkeys(urls))
print(f"-> {len(unique_urls)} URLs fetched successfully!")
print(unique_urls)

### Parsing URLs for titles and text

In [132]:
# Driver config
# Start a fresh Chrome session for visiting the individual article pages.
s = Service(ChromeDriverManager(path=r".\\").install())
driver = webdriver.Chrome(service=s)

# CSS Selector Tags
# Two heading selectors are tried in order; the first one present on the page wins.
pgTitle_tag = "h1[class='ssrcss-15xko80-StyledHeading e1fj1fc10']"
pgTitle_tag_2 = "h1[class='ssrcss-1qr3f1s-StyledHeading e1fj1fc10']"
pgText_tag = "p[class='ssrcss-1q0x1qg-Paragraph eq5iqo00']"

# Parser
# `texts[i]` and `titles[i]` both describe `unique_urls[i]`.
texts = []
titles = []

print("-> Launching Chrome...")

# The browser is closed in `finally` so an error on any article does not leave
# a Chrome/chromedriver process running.
try:
    with alive_bar(
        len(unique_urls), spinner="dots_waves", title="-> Parser", bar="smooth", force_tty=True
    ) as bar:
        for url in unique_urls:
            driver.get(url)

            # Dismiss the popup when it is shown on this page.
            if check_exists(popup_tag):
                popup = driver.find_element(By.CSS_SELECTOR, popup_tag)
                popup.click()

            # The title stays empty when neither heading selector matches.
            tempTitle = ""
            for heading_tag in (pgTitle_tag, pgTitle_tag_2):
                if check_exists(heading_tag):
                    tempTitle = driver.find_element(By.CSS_SELECTOR, heading_tag).text
                    break

            # Each paragraph is prefixed with a single space, so the joined text
            # starts with a space whenever at least one paragraph was found.
            phrases = driver.find_elements(By.CSS_SELECTOR, pgText_tag)
            tempText = "".join(" " + phrase.text for phrase in phrases)

            texts.append(tempText)
            titles.append(tempTitle)
            bar()
finally:
    driver.quit()

dict_data = {"Title": titles, "Text": texts}
data = pd.DataFrame(dict_data)

print("Database generated successfully!")
data.head(20)




[WDM] - Current google-chrome version is 101.0.4951
[WDM] - Get LATEST chromedriver version for 101.0.4951 google-chrome
[WDM] - Driver [.\\.wdm\drivers\chromedriver\win32\101.0.4951.41\chromedriver.exe] found in cache


-> Launching Chrome...
-> Parser |████████████████████████████████████████| 39/39 [100%] in 1:50.0 (0.35/s)                                    
Database generated successfully!


Unnamed: 0,Title,Text
0,Ukraine war: Bodies of dead Russian soldiers a...,When their dogs started digging insistently a...
1,Ukraine: The fallen Russian soldiers left behind,Weeks after Russian forces were pushed back f...
2,Ros Atkins on... Russia's food war,The West has accused the Russian military of ...
3,US closes loophole for Russian debt payments,The US is cutting off another financial route...
4,"Ukraine war: 'This is just the beginning, ever...",The Ukrainian army is under more pressure tha...
5,Ukraine war round-up: Fighting in the east and...,Exactly three months after Russia launched it...
6,"Ukraine war: Put values over profits, Nato chi...",The war in Ukraine has highlighted how countr...
7,"Ukraine war: World faces 'dark hour', Biden te...","The world is ""navigating a dark hour in our s..."
8,Ukrainian mum and son left homeless 'after hos...,A Ukrainian refugee rehomed under a governmen...
9,"Putin weaponising Ukraine’s crops, says Polish PM","Vladimir Putin is ""weaponising Ukraine's crop..."


In [133]:
# Make sure the `data` folder exists under the current working directory,
# then write the scraped table to it (row index and header row included).
output_dir = os.getcwd() + '/data'
os.makedirs(output_dir, exist_ok=True)
data.to_csv('data/data.csv', header=True, index=True)