In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from tqdm import tqdm
import pandas as pd
import re

In [2]:
def preprocess_text(text):
    """Normalise whitespace and strip the CNN copyright footer from scraped text.

    Args:
        text: Raw article text (a ``str``).

    Returns:
        The text with non-breaking spaces turned into regular spaces, every
        run of whitespace collapsed to one space, the CNN copyright/privacy
        footer removed, and leading/trailing whitespace stripped.
    """
    # The footer is matched with any four-digit year so that pages scraped in
    # a different year than 2025 are cleaned as well.
    footer_pattern = (
        r'© \d{4} Cable News Network\. A Warner Bros\. Discovery Company\. '
        r'All Rights Reserved\. CNN Sans ™ & © \d{4} Cable News Network\. '
        r'For privacy options, please see our privacy policy: ?'
        r'https://www\.cnn\.com/privacy\.'
    )

    # Normalise whitespace first so the footer is found regardless of how the
    # scraped paragraphs were spaced or joined.
    text = re.sub(r'\s+', ' ', text.replace('\xa0', ' '))
    text = re.sub(footer_pattern, '', text)

    # Removing the footer can leave a double space behind; collapse once more.
    return re.sub(r'\s+', ' ', text).strip()

def extract_date(url):
    """Extract a ``YYYY-MM-DD`` date from a URL containing ``/YYYY/MM/DD/``.

    Args:
        url: URL string to inspect. Non-string values (e.g. ``None`` or a
            NaN cell of a DataFrame column) are accepted and yield ``None``.

    Returns:
        The date as a ``"YYYY-MM-DD"`` string built from the first
        ``/dddd/dd/dd/`` path segment found, or ``None`` when the URL holds no
        such segment or is not a string.
    """
    # re.search raises TypeError on None/NaN; treat those like "no date".
    if not isinstance(url, str):
        return None

    match = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', url)
    if match is None:
        return None

    return "-".join(match.groups())

In [None]:
def parse_news(start, end):
    """Scrape CNN search-result pages ``start``..``end`` for "israel hamas" articles.

    For every search page the function collects the links found inside
    elements whose class contains "card", opens each link, and joins the text
    of all ``<p>`` elements on that page into a single string.

    Args:
        start: First search-result page number to fetch (int).
        end: Last search-result page number to fetch, inclusive (int).

    Returns:
        pandas.DataFrame with the columns ``article_text`` and
        ``article_url``, one row per article that was read successfully.
        Articles that fail to load are reported on stdout and skipped.
    """
    # NOTE(review): page ``p`` is requested with ``from = p * 10``. If the
    # site's ``from`` parameter is zero-based (page 1 -> from=0) this is
    # shifted by one page -- confirm against the live site before changing.
    search_urls = [
        f'https://edition.cnn.com/search?q=israel+hamas&from={page * 10}&size=10&page={page}&sort=relevance&types=article&section='
        for page in range(start, end + 1)
    ]

    safari_options = webdriver.SafariOptions()
    # NOTE(review): Safari is not known to offer a headless mode, so this
    # argument is presumably ignored -- kept to preserve existing behaviour.
    safari_options.add_argument("--headless")

    driver = webdriver.Safari(options=safari_options)

    # Text and URL are stored together, and only once the article has been
    # read, so the two DataFrame columns can never end up with different
    # lengths when a page fails to load.
    records = []

    try:
        for search_url in tqdm(search_urls):
            driver.get(search_url)
            time.sleep(1)  # fixed pause after navigation

            links = driver.find_elements(By.XPATH, '//div[contains(@class, "card")]//a')
            # Collect the hrefs before leaving the results page, drop anchors
            # without an href, and de-duplicate while keeping page order.
            hrefs = [link.get_attribute('href') for link in links]
            article_urls = list(dict.fromkeys(href for href in hrefs if href))

            for url in article_urls:
                try:
                    driver.get(url)
                    time.sleep(1)  # fixed pause after navigation

                    paragraphs = driver.find_elements(By.TAG_NAME, "p")
                    article_text = " ".join(p.text for p in paragraphs)
                except Exception as e:
                    # Best effort: report the failure and move on.
                    print(f"Error processing URL {url}: {e}")
                    continue

                records.append({'article_text': article_text, 'article_url': url})
    finally:
        # Always release the browser session, even if scraping aborted.
        driver.quit()

    # Explicit columns keep the schema stable when nothing was scraped.
    return pd.DataFrame(records, columns=['article_text', 'article_url'])

In [None]:
# Scrape search-result pages 151 through 200 (both inclusive).
df = parse_news(start=151, end=200)

In [40]:
# Derive a cleaned text column and a YYYY-MM-DD date taken from the URL path.
df['text'] = df['article_text'].map(preprocess_text)
df['date'] = df['article_url'].map(extract_date)

In [None]:
# Persist the scraped articles; the DataFrame index is written as the first column.
df.to_csv(path_or_buf='cnn_news.csv')