In [1]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from datetime import datetime

# --- Search configuration ---------------------------------------------------

# Trial run: a single search keyword
keywords = ['Government policy']

# CNN search URL template; the two placeholders are filled, in order, with
# the URL-formatted keyword and the page number
base_url = 'https://edition.cnn.com/search?q={}&from=0&size=10&page={}&sort=newest&types=all&section='

# Cut-off date: an article dated earlier than this ends scraping for a keyword
date_limit = datetime(year=2024, month=1, day=1)

# --- Browser session --------------------------------------------------------

# Start Chrome through the locally installed chromedriver binary
chromedriver_path = 'C://chromedriver-win64/chromedriver.exe'
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)

# Function to clean up and format keyword for URL
def format_keyword(keyword):
    """Return *keyword* encoded for use as a URL query-string value.

    Runs of whitespace are collapsed and words are joined with ``+``. Any
    character that has a special meaning inside a query string (``&``, ``#``,
    ``=``, ``+``, non-ASCII text, ...) is percent-encoded so it cannot
    truncate or corrupt the search URL.

    Args:
        keyword: Free-text search phrase, e.g. ``'Government policy'``.

    Returns:
        The encoded phrase, e.g. ``'Government+policy'``. An empty or
        whitespace-only keyword yields ``''``.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote_plus

    # Normalise whitespace first so multiple spaces/tabs become a single '+'.
    return quote_plus(' '.join(keyword.split()))

# Function to convert scraped date string to a datetime object
def parse_date(date_string):
    """Convert a scraped date label into a ``datetime``.

    Accepts dates written with an abbreviated or a full month name
    (``'Jan 5, 2024'`` or ``'January 5, 2024'``); surrounding whitespace is
    ignored.

    Args:
        date_string: Date text taken from a search result.

    Returns:
        The parsed ``datetime`` (midnight, naive), or ``None`` when the input
        is not a string or does not match a supported format.
    """
    # A missing label (None) would make strptime raise TypeError, not
    # ValueError, so reject non-strings up front.
    if not isinstance(date_string, str):
        return None

    cleaned = date_string.strip()
    for date_format in ('%b %d, %Y', '%B %d, %Y'):
        try:
            return datetime.strptime(cleaned, date_format)
        except ValueError:
            continue
    return None

# Function to scrape news for each keyword
def scrape_news(driver, keyword, max_pages=2):
    """Scrape CNN search results for one keyword.

    Result pages are visited in order, starting at page 1. For each result
    the link, headline, description and date text are collected. Scraping
    for the keyword ends as soon as a result's date parses to a value
    earlier than the module-level ``date_limit``, when a page fails to show
    its first result within 10 seconds, or after ``max_pages`` pages.

    Results whose date text cannot be parsed are kept. A result that cannot
    be read (for example because the page has fewer than 10 results) is
    reported on stdout and skipped.

    Args:
        driver: Selenium WebDriver used to load the pages.
        keyword: Search phrase; also stored in every returned record.
        max_pages: Maximum number of result pages to visit (default 2, the
            trial setting).

    Returns:
        A list of dicts with the keys ``keyword``, ``title``, ``desc``,
        ``date`` (the raw date text) and ``link``.
    """
    # XPath of the element whose child <div>s are the individual results.
    results_xpath = '//*[@id="search"]/div[2]/div/div[2]/div/div[2]/div/div'
    results_per_page = 10  # mirrors size=10 in base_url
    page_delay_seconds = 2

    formatted_keyword = format_keyword(keyword)
    scraped_data = []

    for page in range(1, max_pages + 1):
        # NOTE(review): base_url pins from=0 and only `page` varies - confirm
        # the site really paginates on `page`; otherwise every page yields
        # the same results.
        url = base_url.format(formatted_keyword, page)
        print(f"Fetching URL: {url}")
        driver.get(url)

        try:
            # Wait for the first headline element to appear
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, f'{results_xpath}/div[1]/a[2]')
                )
            )
        except Exception as e:
            print(f"Error loading page or finding elements: {e}")
            break

        reached_date_limit = False
        for i in range(1, results_per_page + 1):
            card_xpath = f'{results_xpath}/div[{i}]/a[2]'
            try:
                link_elem = driver.find_element(By.XPATH, card_xpath)
                headline_elem = driver.find_element(By.XPATH, f'{card_xpath}/div/div[1]')
                desc_elem = driver.find_element(By.XPATH, f'{card_xpath}/div/div[3]')
                date_elem = driver.find_element(By.XPATH, f'{card_xpath}/div/div[2]')

                link = link_elem.get_attribute('href')
                headline = headline_elem.text
                description = desc_elem.text
                date_str = date_elem.text.strip()
                article_date = parse_date(date_str)
            except Exception as e:
                # Best effort: one unreadable result must not abort the page.
                print(f"Error scraping article {i}: {e}")
                continue

            # If the date is earlier than the limit, stop scraping for this keyword
            if article_date and article_date < date_limit:
                reached_date_limit = True
                break

            scraped_data.append({
                'keyword': keyword,
                'title': headline,
                'desc': description,
                'date': date_str,
                'link': link,
            })

        if reached_date_limit:
            break

        # Pause between requests to avoid being blocked; no need to wait
        # once the last page has been processed.
        if page < max_pages:
            time.sleep(page_delay_seconds)

    return scraped_data

# Scrape news for every configured keyword
all_news_data = []
try:
    for keyword in keywords:
        print(f"Scraping news for keyword: {keyword}")
        news_data = scrape_news(driver, keyword)
        all_news_data.extend(news_data)
        print(f"Finished scraping {len(news_data)} articles for keyword: {keyword}")
finally:
    # Always shut the browser down, even when scraping raises, so no
    # browser/driver process is left running.
    driver.quit()

# Save results to CSV (one row per scraped article, header row first)
csv_file = 'scraped_cnn_news_trial.csv'
csv_columns = ['keyword', 'title', 'desc', 'date', 'link']

try:
    # newline='' lets the csv module control line endings itself
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(all_news_data)
except OSError as e:
    # Report the underlying cause (permissions, missing directory, ...)
    print(f"I/O error when writing to CSV: {e}")
else:
    print(f"Data successfully saved to {csv_file}")


Scraping news for keyword: Government policy
Fetching URL: https://edition.cnn.com/search?q=Government+policy&from=0&size=10&page=1&sort=newest&types=all&section=
Fetching URL: https://edition.cnn.com/search?q=Government+policy&from=0&size=10&page=2&sort=newest&types=all&section=
Finished scraping 20 articles for keyword: Government policy
Data successfully saved to scraped_cnn_news_trial.csv
