In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from dateutil import parser
from tqdm import tqdm
import time
import pandas as pd
import re

In [None]:
def preprocess_text(text):
    """Clean the raw text scraped from a BBC article page.

    The cleaning steps are:

    1. Collapse every run of whitespace (including non-breaking spaces,
       which ``\\s`` matches for ``str`` patterns) into a single space.
    2. Remove the BBC page footer ("Copyright <year> BBC. All rights
       reserved. ... Read about our approach to external linking.").
       The year is matched generically so the cleaning keeps working
       after the footer's copyright year changes.
    3. Strip leading and trailing whitespace.

    Args:
        text: Raw article text. Non-string values (for example ``None``
            or the ``NaN`` pandas uses for empty CSV cells) are treated
            as empty text.

    Returns:
        The cleaned text as a ``str`` (``''`` for non-string input).
    """
    if not isinstance(text, str):
        return ''

    # Normalise whitespace first so the footer pattern only has to deal
    # with single regular spaces, however the page laid out its &nbsp;s.
    text = re.sub(r'\s+', ' ', text)

    footer_pattern = (
        r'Copyright \d{4} BBC\. All rights reserved\. '
        r'The BBC is not responsible for the content of external sites\. '
        r'Read about our approach to external linking\. ?'
    )
    text = re.sub(footer_pattern, '', text)

    return text.strip()

In [None]:
def parse_news(start, end, delay=1):
    """Scrape BBC search results for "gaza" and the articles they link to.

    Search result pages ``start`` through ``end`` (both inclusive) are
    opened in a Safari WebDriver session. Every distinct article link
    found on them is visited once, and its paragraph text and publication
    date are collected.

    The three returned lists are parallel: an article is added to all of
    them or to none of them. A URL whose page fails to load or parse is
    reported on stdout and skipped, so the lists always have equal length.

    Args:
        start: First search result page number to fetch.
        end: Last search result page number to fetch (inclusive).
        delay: Seconds to wait after each page load before reading the
            page. Defaults to 1.

    Returns:
        A tuple ``(texts, news_urls, dates)`` where ``texts[i]`` is the
        space-joined paragraph text of ``news_urls[i]`` and ``dates[i]``
        is its publication date as a ``'YYYY-MM-DD'`` string, or ``None``
        when the page exposes no usable ``<time datetime=...>`` element.
    """
    search_urls = [
        f'https://bbc.com/search?q=gaza&page={page}'
        for page in range(start, end + 1)
    ]

    texts = []
    news_urls = []
    dates = []
    seen_urls = set()  # article URLs already taken from any search page

    # driver for selenium
    driver_options = webdriver.SafariOptions()
    # NOTE(review): safaridriver does not appear to offer a headless mode,
    # so this argument is presumably ignored — confirm, or switch browsers.
    driver_options.add_argument("--headless")

    driver = webdriver.Safari(options=driver_options)

    try:
        for search_url in tqdm(search_urls):
            driver.get(search_url)
            time.sleep(delay)

            links = driver.find_elements(By.XPATH, '//div[contains(@class, "sc-c6f6255e-0")]//a')
            # Read the hrefs before navigating away from the search page.
            hrefs = [link.get_attribute('href') for link in links]
            # dict.fromkeys de-duplicates while keeping page order; anchors
            # without an href and articles already visited are dropped.
            page_urls = [
                href for href in dict.fromkeys(hrefs)
                if href and href not in seen_urls
            ]
            seen_urls.update(page_urls)

            for url in page_urls:
                try:
                    driver.get(url)
                    time.sleep(delay)

                    paragraphs = driver.find_elements(By.CLASS_NAME, 'sc-eb7bd5f6-0')
                    article_text = " ".join(p.text for p in paragraphs)

                    time_tags = driver.find_elements(By.TAG_NAME, 'time')
                    raw_date = time_tags[0].get_attribute('datetime') if time_tags else None
                    # ISO format keeps month/day zero-padded so the strings
                    # sort chronologically.
                    article_date = (
                        parser.parse(raw_date).date().isoformat() if raw_date else None
                    )
                except Exception as e:
                    print(f"Error processing URL {url}: {e}")
                    continue

                if len(article_text) == 0:
                    print(f"No article text found at {url}")

                # Append to all three lists together so they stay aligned.
                texts.append(article_text)
                news_urls.append(url)
                dates.append(article_date)
    finally:
        driver.quit()

    return texts, news_urls, dates

In [12]:
# Scrape search result pages 0 through 1 (the end page is inclusive).
texts, news_urls, dates = parse_news(start=0, end=1)

100%|█████████████████████████████████████████████| 2/2 [00:08<00:00,  4.26s/it]


In [None]:
# Assemble one row per scraped article from the three parallel lists.
df = pd.DataFrame({
    'article_text': texts,
    'article_url': news_urls,
    'date': dates
})

# Keep the raw text and store the cleaned version alongside it.
df['text'] = df['article_text'].apply(preprocess_text)

# index=False: the default integer index carries no information and would
# otherwise come back as a spurious "Unnamed: 0" column when the CSV is read.
df.to_csv('bbc_news.csv', index=False)

In [143]:
# Display the DataFrame as this cell's output.
df

Unnamed: 0,article_text,article_url,date
0,A senior UN aid official has warned that half ...,https://www.bbc.com/news/world-middle-east-676...,2023-12-10
1,Israel and Hamas have reached a deal to exchan...,https://www.bbc.com/news/world-middle-east-674...,2023-11-23
2,A man originally from Gaza has been hospitalis...,https://www.bbc.com/news/uk-england-bristol-67...,2023-12-18
3,US Defence Secretary Lloyd Austin has said he ...,https://www.bbc.com/news/world-us-canada-67753876,2023-12-18
4,"Yarden Roman-Gat, 36, was abducted from Kibbut...",https://www.bbc.com/news/world-middle-east-677...,2023-12-18
...,...,...,...
1345,"As the Irish general election draws nearer, tw...",https://www.bbc.com/news/articles/c9vnlvg32z0o,2024-11-20
1346,The announcement of arrest warrants by the Int...,https://www.bbc.com/news/articles/ckgr4n0720eo,2024-11-21
1347,The UN peacekeeping agency in southern Lebanon...,https://www.bbc.com/news/articles/cq8vpvjlqz3o,2024-11-19
1348,Copyright 2025 BBC. All rights reserved. The ...,https://www.bbc.com/audio/play/p0jyj515,
