# V4 Narasipal Scraping
- Revised scripts for Kompas and CNN Indonesia

In [1]:
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from seleniumbase import Driver
import csv
import time
import sys
sys.path.append("/Users/salmadanu/Desktop/Skripsi/skripsi-env/skripsienv/lib/python3.9/site-packages")
import undetected_chromedriver as uc

# [DONE] Republika

In [None]:
def initialize_driver_republika(linknum):
    """Start a ``Driver(uc=True)`` browser and open the Republika "palestina" tag listing.

    Args:
        linknum: Value interpolated as the final path segment of the tag URL.

    Returns:
        The driver, after navigating to the listing URL.
    """
    browser = Driver(uc=True)
    listing_url = f"https://republika.co.id/tag/palestina/{linknum}"
    browser.get(listing_url)
    return browser

In [None]:
def scrape_articles_from_page_republika(driver):
    """Parse the page currently loaded in ``driver`` into Republika article rows.

    Sleeps 5 seconds before reading ``driver.page_source`` so the page has
    time to render.

    Returns:
        A list of ``[url, date_time, title]`` lists. Entries missing any of
        the three values are skipped.
    """
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    rows = []
    for item in soup.find_all('li', class_='list-group-item list-border conten1'):
        anchor = item.find('a', href=True)
        url = anchor['href'] if anchor else None

        # Keep only the part of the date text after the last ' - ' separator.
        date_div = item.find('div', class_='date')
        date_time = date_div.text.split(' - ')[-1] if date_div else None

        # The headline text sits in a <span> inside the item's <h3>.
        heading = item.find('h3')
        headline_span = heading.find('span') if heading else None
        title = headline_span.text.strip() if headline_span else None

        if not (url and date_time and title):
            continue
        rows.append([url, date_time, title])

    return rows

In [None]:
from selenium.webdriver.chrome.options import Options
# Build the Chrome options (headless, no GPU, no sandbox, no /dev/shm use)
# and start the browser through undetected_chromedriver.
options = Options()
for _flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_flag)
driver = uc.Chrome(options=options)

In [None]:
# Scrape a range of Republika tag listing pages and save the rows to a CSV.
all_articles = []
# Defined up front so the status message below never hits an undefined or stale name.
output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/republika/republika_5-6.csv'
try:
    # The stop value is not included; the URL number advances in steps of 15.
    for link_num in range(60, 90, 15):
        print(f"Scraping links from page {link_num}...")
        driver.get(f"https://republika.co.id/tag/palestina/{link_num}")
        page_articles = scrape_articles_from_page_republika(driver)
        all_articles.extend(page_articles)
except Exception as e:
    # Best effort: keep whatever was collected before the failure.
    print(f"Error while scraping links: {e}")
finally:
    driver.quit()

if all_articles:
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article_link", "date_time", "article_title"])
        writer.writerows(all_articles)
    print(f"Articles written to {output_file}")
else:
    # Previously the success message was printed (or raised NameError) even
    # when nothing had been scraped.
    print(f"No articles scraped; nothing written to {output_file}")

## Combine CSV Republika

In [None]:
import os
import pandas as pd

# Gather every CSV in the Republika folder (in os.listdir order).
file_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/republika"
csv_files = [
    os.path.join(file_path, entry)
    for entry in os.listdir(file_path)
    if entry.endswith('.csv')
]

# Load each file, renaming the link column to 'url' where it is present.
dataframes = []
for f in csv_files:
    # rename() ignores labels that are absent, so other frames pass through as-is.
    df = pd.read_csv(f).rename(columns={'article_link': 'url'})
    dataframes.append(df)

# Stack all frames into one table with a fresh 0..n-1 index.
republika_master = pd.concat(dataframes, ignore_index=True)

output_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/master_csv/republika_master.csv"
republika_master.to_csv(output_path, index=False)

print(f"Master CSV saved to {output_path}")

# [DONE] Detik

In [None]:
def initialize_driver_detik(pagenum):
    """Start a ``Driver(uc=True)`` browser and open the Detik "palestina" tag listing.

    Args:
        pagenum: Value interpolated into the ``page`` query parameter.

    Returns:
        The driver, after navigating to the listing URL.
    """
    browser = Driver(uc=True)
    listing_url = f"https://www.detik.com/tag/palestina/?sortby=time&page={pagenum}"
    browser.get(listing_url)
    return browser

In [None]:
def scrape_articles_from_page_detik(driver):
    """Parse the page currently loaded in ``driver`` into Detik article rows.

    Sleeps 5 seconds before reading ``driver.page_source``.

    Returns:
        A list of ``[url, date_time, title]`` lists, one per ``<article>``
        element that yields all three values.
    """
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    rows = []
    for card in soup.find_all('article'):
        anchor = card.find('a', href=True)
        url = anchor['href'] if anchor else None

        # Keep only the part of the date text after the last ', '.
        date_node = card.find('span', class_='date')
        date_time = date_node.text.split(', ')[-1] if date_node else None

        heading = card.find('h2', class_='title')
        title = heading.text.strip() if heading else None

        if not (url and date_time and title):
            continue
        rows.append([url, date_time, title])

    return rows

In [None]:
from selenium.webdriver.chrome.options import Options
# Build the Chrome options (headless, no GPU, no sandbox, no /dev/shm use)
# and start the browser through undetected_chromedriver.
options = Options()
for _flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_flag)
driver = uc.Chrome(options=options)

In [None]:
# Scrape a range of Detik tag listing pages and save the rows to a CSV.
all_articles = []
# Defined up front so the status message below never hits an undefined or stale name.
output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/detik/detik_105-124.csv'
try:
    for page_num in range(105, 125):  # The stop value (last page) is not included
        print(f"Scraping links from page {page_num}...")
        driver.get(f"https://www.detik.com/tag/palestina/?sortby=time&page={page_num}")
        page_articles = scrape_articles_from_page_detik(driver)
        all_articles.extend(page_articles)
except Exception as e:
    # Best effort: keep whatever was collected before the failure.
    print(f"Error while scraping links: {e}")
finally:
    driver.quit()

if all_articles:
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article_link", "date_time", "article_title"])
        writer.writerows(all_articles)
    print(f"Articles written to {output_file}")
else:
    # Previously the success message was printed (or raised NameError) even
    # when nothing had been scraped.
    print(f"No articles scraped; nothing written to {output_file}")

## Combine CSV Detik

In [None]:
import os
import pandas as pd

# Gather every CSV in the Detik folder (in os.listdir order).
file_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/detik"
csv_files = [os.path.join(file_path, f) for f in os.listdir(file_path) if f.endswith('.csv')]

# Load each file, renaming the link column to 'url' where it is present.
dataframes = []
for f in csv_files:
    df = pd.read_csv(f)
    if 'article_link' in df.columns:
        df.rename(columns={'article_link': 'url'}, inplace=True)
    dataframes.append(df)

# Named detik_master (was kompas_master, a copy-paste leftover) so the Detik
# table is not stored under the Kompas name.
detik_master = pd.concat(dataframes, ignore_index=True)

output_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/master_csv/detik_master.csv"
detik_master.to_csv(output_path, index=False)

print(f"Master CSV saved to {output_path}")

# [DONE] Kompas

In [None]:
def initialize_driver_kompas(pagenum):
    """Start a ``Driver(uc=True)`` browser and open the Kompas "palestina" tag listing.

    Args:
        pagenum: Value interpolated into the ``page`` query parameter.

    Returns:
        The driver, after navigating to the listing URL.
    """
    browser = Driver(uc=True)
    listing_url = f"https://www.kompas.com/tag/palestina?page={pagenum}"
    browser.get(listing_url)
    return browser

In [None]:
# Scrapes the lower article list of a Kompas tag page (the original note says 15 articles)
def scrape_articles_from_page_kompas_bawah(driver):
    """Parse the "latest" list of the page currently loaded in ``driver``.

    Sleeps 5 seconds before reading ``driver.page_source``.

    Returns:
        A list of ``[url, date_time, title]`` lists, or ``[]`` when the
        wrapper ``<div>`` of the list is not present.
    """
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    wrapper = soup.find('div', class_='latest ga--latest mt2 clearfix -newlayout')
    if not wrapper:
        return []

    rows = []
    for title_box in wrapper.find_all('div', class_='article__list__title'):
        # The same anchor supplies both the link and the headline text.
        anchor = title_box.find('a', class_='article__link', href=True)
        if anchor:
            url = anchor['href']
            title = anchor.text.strip()
        else:
            url = None
            title = None

        # The date lives elsewhere in the enclosing 'article__list' element.
        entry = title_box.find_parent('div', class_='article__list')
        date_node = entry.find('div', class_='article__date') if entry else None
        date_time = date_node.text.strip() if date_node else None

        if not (url and date_time and title):
            continue
        rows.append([url, date_time, title])

    return rows


In [None]:
# Scrapes the top-of-page articles of a Kompas tag page (the original note says 5 articles)
def scrape_articles_from_page_kompas_atas(driver, column_class):
    """Parse the ``<div>`` elements with class ``column_class`` into article rows.

    Sleeps 5 seconds before reading ``driver.page_source``.

    Args:
        driver: Browser driver exposing ``page_source``.
        column_class: CSS class of the column ``<div>`` elements to scan.

    Returns:
        A list of ``[url, date_time, title]`` lists. Columns missing any of
        the three values are skipped.
    """
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    rows = []
    for column in soup.find_all('div', class_=column_class):
        # Link lookup requires an href; the title lookup below does not.
        anchor = column.find('a', class_='article__link', href=True)
        url = anchor['href'] if anchor else None

        date_node = column.find('div', class_='article__date')
        date_time = date_node.text.strip() if date_node else None

        headline = column.find('a', class_='article__link')
        title = headline.text.strip() if headline else None

        if not (url and date_time and title):
            continue
        rows.append([url, date_time, title])

    return rows


In [None]:
from selenium.webdriver.chrome.options import Options
# Build the Chrome options (headless, no GPU, no sandbox, no /dev/shm use)
# and start the browser through undetected_chromedriver.
options = Options()
for _flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_flag)
driver = uc.Chrome(options=options)

In [None]:
# Scrape a range of Kompas tag listing pages (top columns and bottom list)
# and save the rows to a CSV.
all_articles = []
# Defined up front so the status message below never hits an undefined or stale name.
output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/kompas/kompas_1-19.csv'

try:
    for page_num in range(1, 20):  # The stop value (last page) is not included
        print(f"Scraping links from page {page_num}...")
        driver.get(f"https://www.kompas.com/tag/palestina?page={page_num}")

        # Top section: two column classes are scanned separately.
        page_articles_atas_6 = scrape_articles_from_page_kompas_atas(driver, 'col-bs9-6')
        page_articles_atas_3 = scrape_articles_from_page_kompas_atas(driver, 'col-bs9-3')

        all_articles.extend(page_articles_atas_6)
        all_articles.extend(page_articles_atas_3)

        # Bottom section: the remaining article list.
        page_articles_bawah = scrape_articles_from_page_kompas_bawah(driver)
        all_articles.extend(page_articles_bawah)

except Exception as e:
    # Best effort: keep whatever was collected before the failure.
    print(f"Error while scraping links: {e}")
finally:
    driver.quit()

if all_articles:
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article_link", "date_time", "article_title"])
        writer.writerows(all_articles)
    print(f"Articles written to {output_file}")
else:
    # Previously the success message was printed (or raised NameError) even
    # when nothing had been scraped.
    print(f"No articles scraped; nothing written to {output_file}")

In [None]:
import os

# Signal that the preceding long-running cell has finished:
# spoken message through the macOS `say` command, then the terminal bell.
_notify_cmd = 'say "Execution Finished"'
os.system(_notify_cmd)
print('\a')

## Combine CSV Kompas

In [None]:
import os
import pandas as pd

# Gather every CSV in the Kompas folder (in os.listdir order).
file_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/kompas"
csv_files = [
    os.path.join(file_path, entry)
    for entry in os.listdir(file_path)
    if entry.endswith('.csv')
]

# Load each file, renaming the link column to 'url' where it is present.
dataframes = []
for f in csv_files:
    # rename() ignores labels that are absent, so other frames pass through as-is.
    df = pd.read_csv(f).rename(columns={'article_link': 'url'})
    dataframes.append(df)

# Stack all frames into one table with a fresh 0..n-1 index.
kompas_master = pd.concat(dataframes, ignore_index=True)

output_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/master_csv/kompas_master.csv"
kompas_master.to_csv(output_path, index=False)

print(f"Master CSV saved to {output_path}")

# [DONE] CNN Indonesia

In [None]:
def initialize_driver_cnnindonesia(pagenum):
    """Start a ``Driver(uc=True)`` browser and open the CNN Indonesia "palestina" tag listing.

    Args:
        pagenum: Value interpolated into the ``page`` query parameter.

    Returns:
        The driver, after navigating to the listing URL.
    """
    browser = Driver(uc=True)
    listing_url = f"https://www.cnnindonesia.com/tag/palestina?page={pagenum}"
    browser.get(listing_url)
    return browser

In [None]:
from bs4 import Comment

def scrape_articles_from_page_cnnindonesia(driver):
    """Parse the page currently loaded in ``driver`` into CNN Indonesia article rows.

    Sleeps 5 seconds before reading ``driver.page_source``. The date-time is
    taken from an HTML comment node inside the article's date ``<span>``.

    Returns:
        A list of ``[url, date_time, title]`` lists. Articles missing any of
        the three values are skipped.
    """
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    def _is_comment(node):
        # Matches only HTML comment nodes among the span's strings.
        return isinstance(node, Comment)

    rows = []
    for card in soup.find_all('article', class_='flex-grow'):
        anchor = card.find('a', href=True)
        url = anchor['href'] if anchor else None

        # Date-time: first comment inside <span class="text-xs text-cnn_black_light3">.
        date_time = None
        stamp_span = card.find('span', class_='text-xs text-cnn_black_light3')
        if stamp_span:
            stamp = stamp_span.find(string=_is_comment)
            if stamp:
                date_time = stamp.strip()

        heading = card.find('h2', class_='text-cnn_black_light')
        title = heading.text.strip() if heading else None

        if not (url and date_time and title):
            continue
        rows.append([url, date_time, title])

    return rows


In [None]:
from selenium.webdriver.chrome.options import Options
# Build the Chrome options (headless, no GPU, no sandbox, no /dev/shm use)
# and start the browser through undetected_chromedriver.
options = Options()
for _flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_flag)
driver = uc.Chrome(options=options)

In [None]:
# Scrape a range of CNN Indonesia tag listing pages and save the rows to a CSV.
all_articles = []
# Defined up front so the status message below never hits an undefined or stale name.
output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/cnnindonesia/cnn_1-49.csv'
try:
    for page_num in range(1, 50):  # The stop value (last page) is not included
        print(f"Scraping links from page {page_num}...")
        driver.get(f"https://www.cnnindonesia.com/tag/palestina?page={page_num}")
        page_articles = scrape_articles_from_page_cnnindonesia(driver)
        all_articles.extend(page_articles)
except Exception as e:
    # Best effort: keep whatever was collected before the failure.
    print(f"Error while scraping links: {e}")
finally:
    driver.quit()

if all_articles:
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article_link", "date_time", "article_title"])
        writer.writerows(all_articles)
    print(f"Articles written to {output_file}")
else:
    # Previously the success message was printed (or raised NameError) even
    # when nothing had been scraped.
    print(f"No articles scraped; nothing written to {output_file}")

## Combine CNN CSV

In [None]:
import os
import pandas as pd

# Gather every CSV in the CNN Indonesia folder (in os.listdir order).
file_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/cnnindonesia"
csv_files = [
    os.path.join(file_path, entry)
    for entry in os.listdir(file_path)
    if entry.endswith('.csv')
]

# Load each file, renaming the link column to 'url' where it is present.
dataframes = []
for f in csv_files:
    # rename() ignores labels that are absent, so other frames pass through as-is.
    df = pd.read_csv(f).rename(columns={'article_link': 'url'})
    dataframes.append(df)

# Stack all frames into one table with a fresh 0..n-1 index.
cnn_master = pd.concat(dataframes, ignore_index=True)

output_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/master_csv/cnn_master.csv"
cnn_master.to_csv(output_path, index=False)

print(f"Master CSV saved to {output_path}")

# [DONE] CNBC Indonesia

In [None]:
def initialize_driver_cnbcindonesia(pagenum):
    """Start a ``Driver(uc=True)`` browser and open the CNBC Indonesia "palestina" tag listing.

    Args:
        pagenum: Value interpolated into the ``page`` query parameter.

    Returns:
        The driver, after navigating to the listing URL.
    """
    browser = Driver(uc=True)
    listing_url = f"https://www.cnbcindonesia.com/tag/palestina?page={pagenum}"
    browser.get(listing_url)
    return browser

In [None]:
def wait_for_page_load(driver, timeout=10):
    """Wait until ``document.readyState`` reports "complete".

    Args:
        driver: Browser driver used to evaluate the JavaScript check.
        timeout: Value passed to ``WebDriverWait`` as its timeout.
    """
    def _document_ready(d):
        return d.execute_script("return document.readyState") == "complete"

    waiter = WebDriverWait(driver, timeout)
    waiter.until(_document_ready)

def scroll_until_no_new_content(driver, max_scrolls=10, wait_time=5):
    """Scroll to the bottom repeatedly until the page height stops changing.

    Args:
        driver: Browser driver exposing ``execute_script``.
        max_scrolls: Upper bound on the number of scroll attempts.
        wait_time: Seconds to sleep after each scroll before re-measuring.
    """
    height_script = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_script)

    scrolls_done = 0
    while scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(wait_time)  # give newly requested content time to load

        current_height = driver.execute_script(height_script)
        if current_height == last_height:
            # Height unchanged after scrolling: nothing new was appended.
            return
        last_height = current_height
        scrolls_done += 1

def scrape_articles_from_page_cnbcindonesia(driver):
    """Collect ``[url, title]`` pairs from the CNBC Indonesia page loaded in ``driver``.

    Waits for the page load and scrolls to the bottom before parsing. Any
    exception raised along the way is printed and an empty list is returned.

    Returns:
        A list of ``[url, title]`` lists. Returns ``[]`` when the listing
        container or its articles are missing, or when an error occurs.
    """
    try:
        wait_for_page_load(driver)
        scroll_until_no_new_content(driver)  # make sure all articles are loaded

        soup = BeautifulSoup(driver.page_source, "html.parser")

        listing = soup.find('div', class_='flex flex-col gap-6')
        if not listing:
            print("Container not found.")
            return []

        cards = listing.find_all('article')
        if not cards:
            print("No articles found on page.")
            return []

        site_root = "https://www.cnbcindonesia.com"
        rows = []

        for card in cards:
            anchor = card.find('a', href=True)
            url = None
            if anchor:
                url = anchor['href']
            # Links that do not start with "http" get the site root prepended.
            if url and not url.startswith("http"):
                url = site_root + url

            heading = card.find('h2', class_='font-semibold text-23 group-hover:text-cnbc-primary-blue')
            title = heading.text.strip() if heading else None

            if not (url and title):
                continue
            rows.append([url, title])

        return rows

    except Exception as e:
        print(f"Error while scraping page: {e}")
        return []

In [None]:
from selenium.webdriver.chrome.options import Options
# Build the Chrome options (headless, no GPU, no sandbox, no /dev/shm use)
# and start the browser through undetected_chromedriver.
options = Options()
for _flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_flag)
driver = uc.Chrome(options=options)

In [None]:
# Scrape a range of CNBC Indonesia tag listing pages and save the rows to a CSV.
all_articles = []
try:
    for page_num in range(12, 50):  # Last page not included
        print(f"Scraping links from page {page_num}...")
        driver.get(f"https://www.cnbcindonesia.com/tag/palestina?page={page_num}")
        # scrape_articles_from_page_cnbcindonesia() itself waits for the page
        # load and scrolls to the bottom before parsing, so those two steps
        # are not repeated here (the repeat only added an extra scroll + sleep).
        page_articles = scrape_articles_from_page_cnbcindonesia(driver)
        all_articles.extend(page_articles)

except Exception as e:
    # Best effort: keep whatever was collected before the failure.
    print(f"Error while scraping links: {e}")

finally:
    driver.quit()

# Save results to CSV
output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/cnbcindonesia/cnbc_12-49.csv'
if all_articles:
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article_link", "article_title"])
        writer.writerows(all_articles)
    print(f"Articles written to {output_file}")
else:
    # Previously the success message was printed even when no file was written.
    print(f"No articles scraped; nothing written to {output_file}")

## Combine CNBC CSV

In [None]:
import os
import pandas as pd

# Gather every CSV in the CNBC Indonesia folder (in os.listdir order).
file_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/cnbcindonesia"
csv_files = [
    os.path.join(file_path, entry)
    for entry in os.listdir(file_path)
    if entry.endswith('.csv')
]

# Load each file, renaming the link column to 'url' where it is present.
dataframes = []
for f in csv_files:
    # rename() ignores labels that are absent, so other frames pass through as-is.
    df = pd.read_csv(f).rename(columns={'article_link': 'url'})
    dataframes.append(df)

# Stack all frames into one table with a fresh 0..n-1 index.
cnbc_master = pd.concat(dataframes, ignore_index=True)

output_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/master_csv/cnbc_master.csv"
cnbc_master.to_csv(output_path, index=False)

print(f"Master CSV saved to {output_path}")

# Tempo

In [2]:
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
import csv
import time
from bs4 import BeautifulSoup

# Initialize driver: headless Chrome (no GPU, no sandbox, no /dev/shm use)
# started through undetected_chromedriver.
options = Options()
for _flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_flag)
driver = uc.Chrome(options=options)

def scrape_articles_from_page_tempo(driver, num_pages=276):
    """Walk the Tempo listing loaded in ``driver`` page by page, collecting links.

    The driver must already be on the first listing page. After parsing each
    page the "Next Page" button is clicked to advance. Every failure path
    (article wait timing out, container missing, next button unavailable)
    stops the walk and returns what has been collected so far.

    Args:
        driver: Selenium-compatible driver positioned on the first listing page.
        num_pages: Maximum number of pages to scrape.

    Returns:
        A list of ``[url, title]`` lists.
    """
    # Local import: the file does not import this name at the top level.
    from selenium.common.exceptions import TimeoutException

    all_scraped_data = []

    for page in range(num_pages):
        print(f"Scraping page {page+1}...")

        # Wait until articles are loaded. A timeout here used to propagate and
        # discard every row gathered on earlier pages; now it ends the walk.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//figure"))
            )
        except TimeoutException:
            print(f"Timed out waiting for articles on page {page+1}; stopping.")
            break

        # Get page source
        my_page = driver.page_source
        my_html = BeautifulSoup(my_page, "html.parser")

        # Find articles container
        container = my_html.find('div', class_='flex flex-col divide-y divide-neutral-500')
        if not container:
            print("Container not found. Printing page source for debugging...")
            print(my_page[:2000])
            return all_scraped_data

        # Extract articles: the link is the first anchor with an href, the
        # title is the anchor inside the <figcaption>.
        for article in container.find_all('figure'):
            link_tag = article.find('a', href=True)
            url = link_tag['href'] if link_tag else None

            figcaption = article.find('figcaption')
            title_tag = figcaption.find('a') if figcaption else None
            title = title_tag.text.strip() if title_tag else None

            if url and title:
                all_scraped_data.append([url, title])

        # Nothing left to scrape after the final requested page, so skip the
        # extra click and its 5-second sleep.
        if page + 1 >= num_pages:
            break

        # Click "Next Page" button
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Next Page']"))
            )
            ActionChains(driver).move_to_element(next_button).click().perform()
            time.sleep(5)  # Wait for new page
        except Exception as e:
            print("Next button not found or not clickable:", e)
            break  # Stop if pagination fails

    return all_scraped_data


In [3]:
# Scrape the Tempo tag listing and save the rows to a CSV.
all_articles = []
output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/tempo/tempo_1-276.csv'
try:
    driver.get("https://www.tempo.co/tag/palestina")
    page_articles = scrape_articles_from_page_tempo(driver)
    all_articles.extend(page_articles)
except Exception as e:
    # Same best-effort handling as the other outlets' run cells.
    print(f"Error while scraping links: {e}")
finally:
    # Always close the browser; it was previously left running after this cell.
    driver.quit()

if all_articles:
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["article_link", "article_title"])
        writer.writerows(all_articles)
    print(f"Articles written to {output_file}")
else:
    # Previously the success message was printed even when no file was written.
    print(f"No articles scraped; nothing written to {output_file}")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 