# V4 Narasipal Scraping
- Revised scripts for Kompas and CNN Indonesia

In [1]:
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from seleniumbase import Driver
import csv
import time
import sys
sys.path.append("/Users/salmadanu/Desktop/Skripsi/skripsi-env/skripsienv/lib/python3.9/site-packages")
import undetected_chromedriver as uc

# Republika

In [None]:
def initialize_driver_republika(linknum):
    """Open a Republika "palestina" tag listing in a new SeleniumBase driver.

    Args:
        linknum: Value interpolated as the last path segment of the
            tag URL (presumably a listing offset; verify against the site).

    Returns:
        The driver created with ``Driver(uc=True)`` after ``get`` has been
        called on the listing URL.
    """
    listing_url = f"https://republika.co.id/tag/palestina/{linknum}"
    browser = Driver(uc=True)
    browser.get(listing_url)
    return browser

In [None]:
def scrape_articles_from_page_republika(driver):
    """Extract article rows from the Republika listing currently loaded.

    Sleeps five seconds, parses ``driver.page_source`` and inspects every
    ``<li>`` carrying the listing-item class string.

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.

    Returns:
        A list of ``[url, date_time, title]`` lists. An item is skipped
        when any of the three values is missing or empty.
    """
    # Fixed pause before the page source is read.
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    rows = []
    for item in soup.find_all('li', class_='list-group-item list-border conten1'):
        anchor = item.find('a', href=True)
        date_div = item.find('div', class_='date')
        heading = item.find('h3')
        headline_span = heading.find('span') if heading else None

        link = anchor['href'] if anchor else None
        # Keep only the text after the last ' - ' separator.
        published = date_div.text.split(' - ')[-1] if date_div else None
        headline = headline_span.text.strip() if headline_span else None

        if not (link and published and headline):
            continue
        rows.append([link, published, headline])

    return rows

In [None]:
from selenium.webdriver.chrome.options import Options

# Collect the Chrome command-line flags, then start the browser through
# undetected_chromedriver with those options.
options = Options()
for _chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_chrome_flag)
driver = uc.Chrome(options=options)

In [None]:
# Scrape the Republika tag listings and write the collected rows to a CSV.
all_articles = []
# Bound up front so the name exists even when nothing is scraped.
output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/republika/republika_6-15.csv'

try:
    # range() excludes its end value, so the last offset requested is 210;
    # the offset advances in steps of 15.
    for link_num in range(75, 225, 15):
        print(f"Scraping links from page {link_num}...")
        driver.get(f"https://republika.co.id/tag/palestina/{link_num}")
        page_articles = scrape_articles_from_page_republika(driver)
        all_articles.extend(page_articles)
except Exception as e:
    # Best-effort: report the failure and keep the rows gathered so far.
    print(f"Error while scraping links: {e}")
finally:
    driver.quit()

if all_articles:
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article_link", "date_time", "article_title"])
        writer.writerows(all_articles)
    # Report success only after the file has actually been written.
    print(f"Articles written to {output_file}")
else:
    print("No articles were scraped; no file was written.")

# Detik

In [None]:
def initialize_driver_detik(pagenum):
    """Open a Detik "palestina" tag listing in a new SeleniumBase driver.

    Args:
        pagenum: Value interpolated into the ``page`` query parameter of
            the time-sorted tag URL.

    Returns:
        The driver created with ``Driver(uc=True)`` after ``get`` has been
        called on the listing URL.
    """
    listing_url = f"https://www.detik.com/tag/palestina/?sortby=time&page={pagenum}"
    browser = Driver(uc=True)
    browser.get(listing_url)
    return browser

In [None]:
def scrape_articles_from_page_detik(driver):
    """Extract article rows from the Detik listing currently loaded.

    Sleeps five seconds, parses ``driver.page_source`` and inspects every
    ``<article>`` element on the page.

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.

    Returns:
        A list of ``[url, date_time, title]`` lists. An element is skipped
        when any of the three values is missing or empty.
    """
    # Fixed pause before the page source is read.
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    rows = []
    for entry in soup.find_all('article'):
        anchor = entry.find('a', href=True)
        date_node = entry.find('span', class_='date')
        heading = entry.find('h2', class_='title')

        link = anchor['href'] if anchor else None
        # Keep only the text after the last ', ' separator.
        published = date_node.text.split(', ')[-1] if date_node else None
        headline = heading.text.strip() if heading else None

        if not (link and published and headline):
            continue
        rows.append([link, published, headline])

    return rows

In [None]:
from selenium.webdriver.chrome.options import Options

# Collect the Chrome command-line flags, then start the browser through
# undetected_chromedriver with those options.
options = Options()
for _chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_chrome_flag)
driver = uc.Chrome(options=options)

In [None]:
# Scrape the Detik tag listings and write the collected rows to a CSV.
all_articles = []
# Bound up front so the name exists even when nothing is scraped.
output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/detik/detik_105-124.csv'

try:
    # range() excludes its end value, so the last page requested is 124.
    for page_num in range(105, 125):
        print(f"Scraping links from page {page_num}...")
        driver.get(f"https://www.detik.com/tag/palestina/?sortby=time&page={page_num}")
        page_articles = scrape_articles_from_page_detik(driver)
        all_articles.extend(page_articles)
except Exception as e:
    # Best-effort: report the failure and keep the rows gathered so far.
    print(f"Error while scraping links: {e}")
finally:
    driver.quit()

if all_articles:
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article_link", "date_time", "article_title"])
        writer.writerows(all_articles)
    # Report success only after the file has actually been written.
    print(f"Articles written to {output_file}")
else:
    print("No articles were scraped; no file was written.")

# Kompas

In [2]:
def initialize_driver_kompas(pagenum):
    """Open a Kompas "palestina" tag listing in a new SeleniumBase driver.

    Args:
        pagenum: Value interpolated into the ``page`` query parameter of
            the tag URL.

    Returns:
        The driver created with ``Driver(uc=True)`` after ``get`` has been
        called on the listing URL.
    """
    listing_url = f"https://www.kompas.com/tag/palestina?page={pagenum}"
    browser = Driver(uc=True)
    browser.get(listing_url)
    return browser

In [3]:
# Scraper for the lower article list ("bawah") of a Kompas tag page.
def scrape_articles_from_page_kompas_bawah(driver):
    """Extract article rows from the "latest" wrapper of the loaded page.

    Sleeps five seconds, parses ``driver.page_source``, locates the
    wrapper ``<div>`` and walks every title ``<div>`` inside it. The date
    is read from the enclosing ``article__list`` ``<div>`` of each title.

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.

    Returns:
        A list of ``[url, date_time, title]`` lists; empty when the
        wrapper is absent. A title is skipped when any of the three
        values is missing or empty.
    """
    # Fixed pause before the page source is read.
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    wrapper = soup.find('div', class_='latest ga--latest mt2 clearfix -newlayout')
    if not wrapper:
        return []

    rows = []
    for title_box in wrapper.find_all('div', class_='article__list__title'):
        anchor = title_box.find('a', class_='article__link', href=True)
        if anchor:
            link = anchor['href']
            headline = anchor.text.strip()
        else:
            link = None
            headline = None

        # The date sits beside the title, under the shared list container.
        container = title_box.find_parent('div', class_='article__list')
        date_node = container.find('div', class_='article__date') if container else None
        published = date_node.text.strip() if date_node else None

        if not (link and published and headline):
            continue
        rows.append([link, published, headline])

    return rows


In [4]:
# Scraper for the upper article columns ("atas") of a Kompas tag page.
def scrape_articles_from_page_kompas_atas(driver, column_class):
    """Extract article rows from the ``<div>`` columns of a given class.

    Sleeps five seconds, parses ``driver.page_source`` and inspects every
    ``<div>`` matching ``column_class``.

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.
        column_class: CSS class used to select the column ``<div>``
            elements (this file passes ``'col-bs9-6'`` and ``'col-bs9-3'``).

    Returns:
        A list of ``[url, date_time, title]`` lists. A column is skipped
        when any of the three values is missing or empty.
    """
    # Fixed pause before the page source is read.
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    rows = []
    for column in soup.find_all('div', class_=column_class):
        # The URL comes from the first article link that has an href; the
        # title comes from the first article link regardless of href.
        href_anchor = column.find('a', class_='article__link', href=True)
        date_node = column.find('div', class_='article__date')
        title_anchor = column.find('a', class_='article__link')

        link = href_anchor['href'] if href_anchor else None
        published = date_node.text.strip() if date_node else None
        headline = title_anchor.text.strip() if title_anchor else None

        if not (link and published and headline):
            continue
        rows.append([link, published, headline])

    return rows


In [5]:
from selenium.webdriver.chrome.options import Options

# Collect the Chrome command-line flags, then start the browser through
# undetected_chromedriver with those options.
options = Options()
for _chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_chrome_flag)
driver = uc.Chrome(options=options)

In [None]:
# Scrape the Kompas tag listings and write the collected rows to a CSV.
all_articles = []
# Bound up front so the name exists even when nothing is scraped.
output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/kompas_new_fix/kompas_310.csv'

try:
    # range() excludes its end value, so only page 310 is requested.
    for page_num in range(310, 311):
        print(f"Scraping links from page {page_num}...")
        driver.get(f"https://www.kompas.com/tag/palestina?page={page_num}")

        # Upper section: two column layouts, collected in this order.
        page_articles_atas_6 = scrape_articles_from_page_kompas_atas(driver, 'col-bs9-6')
        page_articles_atas_3 = scrape_articles_from_page_kompas_atas(driver, 'col-bs9-3')
        all_articles.extend(page_articles_atas_6)
        all_articles.extend(page_articles_atas_3)

        # Lower section: the "latest" article list.
        page_articles_bawah = scrape_articles_from_page_kompas_bawah(driver)
        all_articles.extend(page_articles_bawah)
except Exception as e:
    # Best-effort: report the failure and keep the rows gathered so far.
    print(f"Error while scraping links: {e}")
finally:
    driver.quit()

if all_articles:
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article_link", "date_time", "article_title"])
        writer.writerows(all_articles)
    # Report success only after the file has actually been written.
    print(f"Articles written to {output_file}")
else:
    print("No articles were scraped; no file was written.")

Scraping links from page 310...
Articles written to /Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/kompas_new_fix/kompas_310.csv
