# V3 Narasipal Scraping
Adjustments:
- Asynchronous requests
- Reduce unnecessary wait times
- Headless & optimized Selenium
- Parallel processing
- Session reuse

In [None]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from seleniumbase import Driver
from concurrent.futures import ThreadPoolExecutor
import csv
import json
import sys
sys.path.append("/Users/salmadanu/Desktop/Skripsi/skripsi-env/skripsienv/lib/python3.9/site-packages")
import undetected_chromedriver as uc

# Kompas

In [None]:
def configure_driver():
    """Build and return an undetected-chromedriver Chrome instance.

    The browser is started with a fixed set of command-line flags
    (headless mode, no sandbox, no /dev/shm usage, images disabled).

    Returns:
        The ``uc.Chrome`` driver created with those options.
    """
    chrome_flags = (
        "--headless",  # run without a visible UI
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--blink-settings=imagesEnabled=false",  # skip images for faster loading
    )

    chrome_options = uc.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return uc.Chrome(options=chrome_options)


In [None]:
async def fetch_page(session, url):
    """Issue a GET request for *url* on *session* and return the body text.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()`` method
            (an ``aiohttp.ClientSession`` in this notebook).
        url: Address of the page to download.

    Returns:
        The response body as returned by ``response.text()``.
    """
    request_ctx = session.get(url)
    async with request_ctx as resp:
        body = await resp.text()
        return body

async def scrape_links_kompas(page_num):
    """Collect article links from one page of the Kompas "palestina" tag listing.

    Args:
        page_num: Page number substituted into the listing URL's ``page``
            query parameter.

    Returns:
        A list with the ``href`` value of every ``<a href=...>`` found
        inside a ``div.article__list`` element, in document order.
    """
    listing_url = f"https://www.kompas.com/tag/palestina?page={page_num}"
    async with aiohttp.ClientSession() as session:
        markup = await fetch_page(session, listing_url)
        parsed = BeautifulSoup(markup, "lxml")
        hrefs = []
        for anchor in parsed.select('div.article__list a[href]'):
            hrefs.append(anchor['href'])
        return hrefs

In [None]:
def scrape_article_details(driver, link):
    """Open an article in the browser and extract its title.

    Args:
        driver: Selenium WebDriver used to load the page.
        link: URL of the article to open.

    Returns:
        A dict with two keys: ``'url'`` (the given link) and
        ``'content_title'`` (the stripped text of the first element with
        class ``read__title``, or ``"unknown"`` when that element cannot be
        obtained within 3 seconds or reading it fails).

    Raises:
        Whatever ``driver.get`` raises; navigation errors are not handled
        here and propagate to the caller.
    """
    details = {'url': link}
    driver.get(link)
    try:
        title_element = WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'read__title'))
        )
        details['content_title'] = title_element.text.strip()
    except Exception:
        # Best-effort: a missing or slow title must not abort the scrape.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit through, so the run can still be interrupted.
        details['content_title'] = "unknown"
    return details

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def scrape_articles_parallel(links):
    """Scrape article details for every link using a pool of worker threads.

    Each worker thread lazily creates its own browser via
    ``configure_driver()`` and reuses it for every link it processes.
    A single WebDriver must not be driven from several threads at once:
    concurrent ``get`` calls on one browser race with each other, so a
    title read by one thread may belong to a page loaded by another.

    Args:
        links: Iterable of article URLs.

    Returns:
        A list of the truthy results returned by
        ``scrape_article_details`` in completion order. Links whose scrape
        raised are reported on stdout and skipped. An empty input returns
        ``[]`` without starting any browser.
    """
    import threading  # local import keeps this cell self-contained

    links = list(links)
    if not links:
        return []

    # Never start more browsers than there are links to process.
    max_workers = min(5, len(links))

    thread_state = threading.local()
    drivers = []  # every driver created, so all of them can be closed
    drivers_lock = threading.Lock()

    def _scrape_with_own_driver(link):
        """Scrape *link* with the calling thread's private driver."""
        driver = getattr(thread_state, "driver", None)
        if driver is None:
            # Browser start-up is serialised under the lock so parallel
            # workers do not initialise chromedriver at the same moment.
            with drivers_lock:
                driver = configure_driver()
                drivers.append(driver)
            thread_state.driver = driver
        return scrape_article_details(driver, link)

    results = []
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(_scrape_with_own_driver, link): link
                for link in links
            }

            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                except Exception as e:
                    print(f"Failed scraping {url}: {e}")
                    continue
                if data:  # Check if scraping was successful
                    results.append(data)
                    print(f"Done scraping: {url}")
    finally:
        # Always close the browsers, even if scraping was interrupted.
        for driver in drivers:
            try:
                driver.quit()
            except Exception as e:
                print(f"Failed to close browser: {e}")

    return results

In [None]:
async def main():
    """Run the Kompas scrape end to end and save the result as CSV.

    Steps:
      1. Fetch the article links of every listing page in the configured
         range concurrently.
      2. Flatten the per-page link lists and scrape each article.
      3. Write all scraped rows to ``kompas_194.csv`` with the sorted
         union of their keys as the header.
    """
    pages = range(194, 195)  # Adjust range as needed
    links_per_page = await asyncio.gather(
        *(scrape_links_kompas(page) for page in pages)
    )

    flat_links = []
    for page_links in links_per_page:
        flat_links.extend(page_links)

    print(f"Total articles found: {len(flat_links)}")
    articles_data = scrape_articles_parallel(flat_links)

    with open("kompas_194.csv", "w", newline="", encoding="utf-8") as csvfile:
        # The header is the union of keys over all rows, in sorted order.
        columns = set()
        for row in articles_data:
            columns.update(row)
        writer = csv.DictWriter(csvfile, fieldnames=sorted(columns))
        writer.writeheader()
        writer.writerows(articles_data)
    print("Scraping completed successfully!")

if __name__ == "__main__":
    # nest_asyncio patches asyncio so run_until_complete() can be called
    # re-entrantly, e.g. when an event loop is already running.
    import nest_asyncio

    nest_asyncio.apply()
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())
