In [1]:
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from seleniumbase import Driver
import csv
import time
import json

In [2]:
def initialize_driver(pagenum):
    """Start a browser and open one page of the Kompas "palestina" tag listing.

    Args:
        pagenum: Page number interpolated into the ``?page=`` query parameter.

    Returns:
        The driver created by ``Driver(uc=True)``, already navigated to the
        requested listing page. The caller owns it and must call ``quit()``.

    Raises:
        Whatever ``driver.get`` raises. The browser is shut down first so a
        failed navigation does not leave an orphaned browser behind.
    """
    driver = Driver(uc=True)
    try:
        driver.get(f"https://www.kompas.com/tag/palestina?page={pagenum}")
    except BaseException:
        # The caller never receives the handle on failure (including a
        # notebook interrupt), so this is the only place that can clean up.
        driver.quit()
        raise
    return driver

In [3]:
def scrape_links_from_page(driver):
    """Collect one article link per card from the page currently loaded.

    The page source is parsed after a fixed five-second pause. For every
    ``div`` whose class attribute is ``article__list clearfix`` the first
    ``<a>`` carrying an ``href`` is taken; cards without such an anchor, or
    with an empty ``href``, contribute nothing.

    Args:
        driver: Object exposing the current document as ``page_source``.

    Returns:
        list: The collected ``href`` values, in document order.
    """
    # Fixed pause before snapshotting the DOM (presumably to let the
    # listing finish rendering).
    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    cards = soup.find_all('div', class_='article__list clearfix')

    # Lazily pair every card with its first anchor that has an href.
    first_anchors = (card.find('a', href=True) for card in cards)
    return [
        anchor['href']
        for anchor in first_anchors
        if anchor and anchor['href']
    ]

In [4]:
def _extract_json_object(script_content):
    """Return the JSON object embedded in the text of an inline script.

    The text between the first ``{`` and the last ``}`` is tried first.
    If that is not valid JSON (for example because more brace-bearing
    JavaScript follows the object), each ``{`` is tried in turn and the
    first non-empty JSON object that decodes from there is returned.

    Raises:
        ValueError: If no JSON object can be decoded from the text.
    """
    start = script_content.find("{")
    if start == -1:
        raise ValueError("No JSON object found in script tag.")

    end = script_content.rfind("}") + 1
    try:
        return json.loads(script_content[start:end])
    except json.JSONDecodeError:
        pass  # trailing or leading non-JSON braces; fall back to scanning

    decoder = json.JSONDecoder()
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete value and
            # ignores whatever follows it.
            candidate, _ = decoder.raw_decode(script_content[start:])
        except json.JSONDecodeError:
            candidate = None
        # Skip empty objects so a JavaScript block such as "{}" is not
        # mistaken for the payload.
        if isinstance(candidate, dict) and candidate:
            return candidate
        start = script_content.find("{", start + 1)

    raise ValueError("No JSON object found in script tag.")


def scrape_article_details(driver, link):
    """Open an article and read its metadata from the inline dataLayer script.

    The first ``<script>`` whose inner HTML contains
    ``window.dataLayer = window.dataLayer || []`` is selected and the JSON
    object inside it is decoded.

    Args:
        driver: Browser driver providing ``get`` and ``find_elements``.
        link: URL of the article to open.

    Returns:
        dict: Always contains ``'url'``. On success it also contains
        ``content_site``, ``content_editor``, ``content_type``,
        ``content_PublishedDate``, ``content_title``, ``content_tags`` and
        ``content_total_words``, each set to ``'unknown'`` when the decoded
        object lacks that key. If extraction fails, the error is printed and
        only ``'url'`` is returned.

    Raises:
        Whatever ``driver.get`` raises; navigation happens before the
        best-effort ``try`` block.
    """
    driver.get(link)
    time.sleep(5)
    details = {'url': link}

    try:
        # Wait (up to 10 s) until at least one <script> element is present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'script'))
        )

        # Find the relevant script tag, reading each innerHTML only once.
        script_content = None
        for script in driver.find_elements(By.TAG_NAME, 'script'):
            inner_html = script.get_attribute('innerHTML')
            # Skip empty/None values instead of letting one odd element
            # abort the extraction for the whole article.
            if inner_html and 'window.dataLayer = window.dataLayer || []' in inner_html:
                script_content = inner_html
                break

        if not script_content:
            raise ValueError("No relevant script tag found.")

        data = _extract_json_object(script_content)

        # Copy the fields of interest, defaulting each to 'unknown'.
        wanted_fields = (
            'content_site',
            'content_editor',
            'content_type',
            'content_PublishedDate',
            'content_title',
            'content_tags',
            'content_total_words',
        )
        details.update({field: data.get(field, 'unknown') for field in wanted_fields})

    except Exception as e:
        # Deliberately best-effort: report the problem and return what we have.
        print(f"An error occurred while extracting details: {e}")

    return details

In [None]:
# Phase 1: collect article links from the tag listing pages.
# NOTE(review): range(46, 55) visits pages 46..54 only, while the output
# file name below says "46-55" -- confirm whether page 55 should be included.
driver = Driver(uc=True)

all_nav_links = []
try:
    for page_num in range(46, 55):
        print(f"Scraping links from page {page_num}...")
        # Handle failures per page (as phase 2 does per link) so one bad
        # listing page does not discard all remaining pages.
        try:
            driver.get(f"https://www.kompas.com/tag/palestina?page={page_num}")
            page_links = scrape_links_from_page(driver)
            all_nav_links.extend(page_links)
        except Exception as e:
            print(f"Error while scraping links: {e}")
finally:
    driver.quit()

# The same article can appear on more than one listing page; keep only the
# first occurrence of each URL (dict preserves insertion order).
all_nav_links = list(dict.fromkeys(all_nav_links))

# Phase 2: scrape article details.
all_data = []
if all_nav_links:  # do not launch a second browser when there is nothing to visit
    driver = Driver(uc=True)  # Reinitialize the driver for scraping details
    try:
        for i, link in enumerate(all_nav_links):
            print(f"Scraping details from link {i+1}/{len(all_nav_links)}: {link}")
            try:
                details = scrape_article_details(driver, link)
                all_data.append(details)
            except Exception as e:
                print(f"Error scraping {link}: {e}")
    finally:
        driver.quit()

# Phase 3: write data to CSV.
if all_data:
    # Column set is the union of keys over all rows; rows lacking a key get
    # an empty cell (DictWriter's default restval).
    header = sorted(set(key for data in all_data for key in data.keys()))
    # NOTE(review): absolute, machine-specific path -- consider a configurable
    # output directory so the notebook runs elsewhere.
    output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/kompas/kompas_46-55.csv'
    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=header)
            writer.writeheader()
            writer.writerows(all_data)
        print(f"Data has been written to {output_file}")
    except Exception as e:
        print(f"Error writing to CSV: {e}")
else:
    print("No data scraped to write to CSV.")