In [9]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

csv_filename = "article_links_kpmg.csv"
article_urls = []
with open(csv_filename, "r", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        article_urls.append(row["url"])
print("Loaded", len(article_urls), "article URLs")

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 10)

output_csv = "article_details.csv"
with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["url", "category", "title", "report_date", "content", "pdf_url"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    for url in article_urls:
        try:
            driver.get(url)
            time.sleep(3) 
            
            try:
                accept_button = wait.until(
                    EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept All Cookies')]"))
                )
                accept_button.click()
                time.sleep(2)
            except Exception as e:
                print(f"No cookie banner found or already accepted on {url}: {e}")
            
            category = ""
            title = ""
            try:
                breadcrumb_items = driver.find_elements(By.CSS_SELECTOR, "ol.cmp-breadcrumb__list li")
                if len(breadcrumb_items) >= 2:
                    title = breadcrumb_items[-1].text.strip()
                    category = breadcrumb_items[-2].text.strip()
                else:
                    print("Not enough breadcrumb items found; will try alternative extraction for title.")
            except Exception as e:
                print(f"Error extracting breadcrumbs from {url}: {e}")
            
            if not title:
                try:
                    h1_title = driver.find_element(By.CSS_SELECTOR, "h1.cmp-hero-csi__title")
                    title = h1_title.text.strip()
                except Exception as e:
                    print(f"Error extracting h1 title from {url}: {e}")
            
            report_date = ""
            try:
                time_element = driver.find_element(By.TAG_NAME, "time")
                report_date = time_element.get_attribute("datetime")
                if not report_date:
                    report_date = time_element.text.strip()
            except Exception as e:
                print(f"No report date found on {url}: {e}")
            
            content = ""
            try:
                main_container = driver.find_element(By.CSS_SELECTOR, "div.main.container.responsivegrid")
                text_elements = main_container.find_elements(By.CSS_SELECTOR, "div.cmp-text")
                content_parts = [el.text.strip() for el in text_elements if el.text.strip()]
                content = "\n\n".join(content_parts)
            except Exception as e:
                print(f"Error extracting content from {url}: {e}")
            
            pdf_url = ""
            try:
                pdf_link_element = driver.find_element(By.XPATH, "//a[contains(@href, '.pdf')]")
                pdf_url = pdf_link_element.get_attribute("href")
            except Exception as e:
                print(f"No PDF link found in {url}: {e}")
            
            writer.writerow({
                "url": url,
                "category": category,
                "title": title,
                "report_date": report_date,
                "content": content,
                "pdf_url": pdf_url
            })
            print(f"Scraped content from {url}")
        except Exception as e:
            print(f"Error scraping {url}: {e}")

driver.quit()
print("Finished scraping article content. Output saved to", output_csv)


Loaded 300 article URLs
No report date found on https://kpmg.com/xx/en/our-insights/value-creation/a-delicate-balancing-act-between-economic-impact-and-taxation.html: Message: no such element: Unable to locate element: {"method":"tag name","selector":"time"}
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00900B43+25139]
	(No symbol) [0x008913F4]
	(No symbol) [0x007704E3]
	(No symbol) [0x007B83D7]
	(No symbol) [0x007B872B]
	(No symbol) [0x00801002]
	(No symbol) [0x007DD014]
	(No symbol) [0x007FE778]
	(No symbol) [0x007DCDC6]
	(No symbol) [0x007ABDE9]
	(No symbol) [0x007AD124]
	GetHandleVerifier [0x00C04373+3185251]
	GetHandleVerifier [0x00C2291A+3309578]
	GetHandleVerifier [0x00C1CF42+3286578]
	GetHandleVerifier [0x00997AE0+643536]
	(No symbol) [0x0089A20D]
	(No symbol) [0x008970B8]
	(No symbol) [0x00897257]
	(No sym