# V2 Narasipal Scraping
Adjustments:
- Error handling
- Reduced content loading
- Specific scraping for video articles

In [1]:
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from seleniumbase import Driver
from selenium.webdriver.chrome.options import Options
import csv
import time
import json
import sys
sys.path.append("/Users/salmadanu/Desktop/Skripsi/skripsi-env/skripsienv/lib/python3.9/site-packages")
import undetected_chromedriver as uc

# Kompas

In [2]:
def initialize_driver_kompas(pagenum):
    """Start a SeleniumBase driver (``uc=True``) on a Kompas tag listing page.

    Args:
        pagenum: Value placed in the ``page`` query parameter of the
            ``/tag/palestina`` listing URL.

    Returns:
        The driver, already navigated to the listing page. The caller is
        responsible for calling ``quit()`` on it.

    Raises:
        Exception: Whatever ``driver.get`` raised; the browser is shut down
            before the exception propagates.
    """
    driver = Driver(uc=True)
    try:
        driver.get(f"https://www.kompas.com/tag/palestina?page={pagenum}")
    except Exception:
        # Navigation failed: do not leave an orphaned browser process behind.
        driver.quit()
        raise
    return driver

In [3]:
def scrape_links_from_page_kompas(driver):
    """Collect the first link of every article card on the loaded Kompas page.

    Args:
        driver: Browser driver whose current page is a Kompas listing page.

    Returns:
        A list of ``href`` strings, one per ``div`` whose class is
        ``article__list clearfix`` and that contains an ``<a href=...>``
        with a non-empty value.
    """
    time.sleep(5)  # fixed pause before the page source is read
    soup = BeautifulSoup(driver.page_source, "html.parser")
    cards = soup.find_all('div', class_='article__list clearfix')

    # First anchor carrying an href in each card (None when a card has none).
    first_anchors = (card.find('a', href=True) for card in cards)
    return [anchor['href'] for anchor in first_anchors if anchor and anchor['href']]

In [4]:
def _text_of_first(driver, class_name, default="unknown"):
    """Return the stripped text of the first element with *class_name*.

    Uses ``find_elements`` (which yields an empty list when nothing matches)
    so that a missing element produces *default* instead of an exception.
    """
    matches = driver.find_elements(By.CLASS_NAME, class_name)
    return matches[0].text.strip() if matches else default


def _parse_datalayer_json(script_content):
    """Parse the outermost ``{...}`` span of *script_content* as JSON.

    Returns:
        The decoded dict, or ``None`` when no brace pair is present, the
        span is not valid JSON, or the decoded value is not a dict.
    """
    start = script_content.find("{")
    end = script_content.rfind("}") + 1
    if start == -1 or end <= start:
        return None
    try:
        data = json.loads(script_content[start:end])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return data if isinstance(data, dict) else None


def scrape_article_details_kompas(driver, link):
    """Load a Kompas article and extract its metadata.

    Video articles (pages containing an element with class
    ``videoKG-artikel``) are read from visible page elements. Other articles
    are read from the JSON object embedded in the ``window.dataLayer``
    script; when that script is absent or unparsable, only the title is
    taken from the page and the remaining fields are set to ``'unknown'``.

    Args:
        driver: Browser driver used to load and query the page.
        link: URL of the article.

    Returns:
        A dict that always contains ``'url'``. On success it also contains
        ``content_site``, ``content_editor``, ``content_type``,
        ``content_PublishedDate``, ``content_title``, ``content_tags`` and
        ``content_total_words``. If extraction raises, the error is printed
        and the dict holds only what was gathered before the failure.
    """
    driver.get(link)
    time.sleep(5)  # fixed pause after navigation before querying the page
    details = {'url': link}

    try:
        # A non-empty match list marks the page as a video article.
        is_video_article = bool(
            driver.find_elements(By.CLASS_NAME, 'videoKG-artikel')
        )

        if is_video_article:
            print("Processing video article:", link)

            title = _text_of_first(driver, 'read__title')
            published_date = _text_of_first(driver, 'videoKG-date')

            # The script writer is listed on a "Penulis Naskah:" line inside
            # the article body; best effort, falls back to "unknown".
            editor = "unknown"
            try:
                content_sections = driver.find_elements(By.CLASS_NAME, 'read__content')
                text_content = content_sections[0].text if content_sections else ""
                for line in text_content.split("\n"):
                    if "Penulis Naskah:" in line:
                        editor = line.replace("Penulis Naskah:", "").strip()
                        break
            except Exception:
                editor = "unknown"

            details.update({
                'content_site': 'Kompas.com',
                'content_editor': editor,
                'content_type': 'Video',
                'content_PublishedDate': published_date,
                'content_title': title,
                'content_tags': 'unknown',
                'content_total_words': 'unknown'
            })

        else:
            print("Processing standard article:", link)

            # Locate the script that initialises window.dataLayer.
            script_content = None
            for script in driver.find_elements(By.TAG_NAME, 'script'):
                # get_attribute may return None; treat that as empty text.
                inner_html = script.get_attribute('innerHTML') or ""
                if 'window.dataLayer = window.dataLayer || []' in inner_html:
                    script_content = inner_html
                    break

            data = None
            if script_content:
                data = _parse_datalayer_json(script_content)
                if data is None:
                    print(f"Could not parse dataLayer JSON for {link}, using fallback method.")
            else:
                print(f"No script tag found for {link}, using fallback method.")

            if data is not None:
                details.update({
                    'content_site': data.get('content_site', 'unknown'),
                    'content_editor': data.get('content_editor', 'unknown'),
                    'content_type': data.get('content_type', 'unknown'),
                    'content_PublishedDate': data.get('content_PublishedDate', 'unknown'),
                    'content_title': data.get('content_title', 'unknown'),
                    'content_tags': data.get('content_tags', 'unknown'),
                    'content_total_words': data.get('content_total_words', 'unknown')
                })
            else:
                details.update({
                    'content_site': 'Kompas.com',
                    'content_editor': 'unknown',
                    'content_type': 'Article',
                    'content_PublishedDate': 'unknown',
                    'content_title': _text_of_first(driver, 'read__title'),
                    'content_tags': 'unknown',
                    'content_total_words': 'unknown'
                })

    except Exception as e:
        print(f"Error extracting details from {link}: {e}")

    return details

In [5]:
from selenium.webdriver.chrome.options import Options
# Build the Chrome options and start an undetected_chromedriver session.
options = Options()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(chrome_flag)
driver = uc.Chrome(options=options)

In [6]:
# Step 1: gather article links from the tag listing pages.
all_nav_links = []
try:
    for page_num in range(225, 235):  # the end page is exclusive (pages 225..234)
        print(f"Scraping links from page {page_num}...")
        driver.get(f"https://www.kompas.com/tag/palestina?page={page_num}")
        all_nav_links += scrape_links_from_page_kompas(driver)
except Exception as e:
    print(f"Error while scraping links: {e}")
finally:
    driver.quit()

# Step 2: visit every collected link with a fresh driver and extract details.
all_data = []
driver = Driver(uc=True)
try:
    total_links = len(all_nav_links)
    for position, link in enumerate(all_nav_links, start=1):
        print(f"Scraping details from link {position}/{total_links}: {link}")
        try:
            all_data.append(scrape_article_details_kompas(driver, link))
        except Exception as e:
            print(f"Error scraping {link}: {e}")
finally:
    driver.quit()

# Step 3: dump the records to CSV; the header is the sorted union of all keys.
if not all_data:
    print("No data scraped to write to CSV.")
else:
    header = sorted({key for record in all_data for key in record})
    output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/kompas_new/kompas_225-234.csv'
    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=header)
            writer.writeheader()
            writer.writerows(all_data)
        print(f"Data has been written to {output_file}")
    except Exception as e:
        print(f"Error writing to CSV: {e}")

Scraping links from page 225...
Scraping links from page 226...
Scraping links from page 227...
Scraping links from page 228...
Scraping links from page 229...
Scraping links from page 230...
Scraping links from page 231...
Scraping links from page 232...
Scraping links from page 233...
Scraping links from page 234...
Scraping details from link 1/150: https://video.kompas.com/watch/1082719/hamas-sandera-israel-tidak-akan-kembali-kecuali-agresi-berhenti
Processing video article: https://video.kompas.com/watch/1082719/hamas-sandera-israel-tidak-akan-kembali-kecuali-agresi-berhenti
Scraping details from link 2/150: http://www.kompas.com/cekfakta/read/2023/12/08/110900582/-klarifikasi-video-pengibaran-bendera-israel-bukan-berlokasi-di-rs-al
Processing standard article: http://www.kompas.com/cekfakta/read/2023/12/08/110900582/-klarifikasi-video-pengibaran-bendera-israel-bukan-berlokasi-di-rs-al
Scraping details from link 3/150: https://video.kompas.com/watch/1082459/putin-bertemu-presiden-i

## Combine CSV

In [9]:
import os
import pandas as pd

# Merge every CSV found in file_path into a single master CSV.
file_path = "/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/kompas_new"
# os.listdir returns names in arbitrary order; sort so the row order of the
# master file is the same on every run.
csv_files = sorted(
    os.path.join(file_path, f) for f in os.listdir(file_path) if f.endswith('.csv')
)
kompas_master = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
kompas_master.to_csv('/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/master_csv/kompas_master.csv', index=False)

# CNN Indonesia
391

In [None]:
def initialize_driver_cnn(pagenum):
    """Start a SeleniumBase driver (``uc=True``) on a CNN Indonesia tag page.

    Args:
        pagenum: Value placed in the ``page`` query parameter of the
            ``/tag/palestina`` listing URL.

    Returns:
        The driver, already navigated to the listing page. The caller is
        responsible for calling ``quit()`` on it.

    Raises:
        Exception: Whatever ``driver.get`` raised; the browser is shut down
            before the exception propagates.
    """
    driver = Driver(uc=True)
    try:
        driver.get(f"https://www.cnnindonesia.com/tag/palestina?page={pagenum}")
    except Exception:
        # Navigation failed: do not leave an orphaned browser process behind.
        driver.quit()
        raise
    return driver

In [None]:
def scrape_links_from_page_cnn(driver):
    """Collect the first link of every matching container on the loaded page.

    Args:
        driver: Browser driver whose current page is a CNN Indonesia
            listing page.

    Returns:
        A list of ``href`` strings, one per ``div`` whose class is
        ``flex flex-col gap-5`` and that contains an ``<a href=...>`` with a
        non-empty value.
    """
    time.sleep(5)  # fixed pause before the page source is read
    soup = BeautifulSoup(driver.page_source, "html.parser")
    containers = soup.find_all('div', class_='flex flex-col gap-5')

    # First anchor carrying an href in each container (None when absent).
    first_anchors = (box.find('a', href=True) for box in containers)
    return [anchor['href'] for anchor in first_anchors if anchor and anchor['href']]

In [None]:
def _ld_value(node, *keys, default='unknown'):
    """Walk *keys* through a decoded JSON-LD structure without raising.

    At each step a list is replaced by its first dict entry; if the current
    node is not a dict (string, number, ``None``, empty list) *default* is
    returned. A final value of ``None`` also yields *default*.
    """
    for key in keys:
        if isinstance(node, list):
            node = next((item for item in node if isinstance(item, dict)), None)
        if not isinstance(node, dict):
            return default
        node = node.get(key)
    return default if node is None else node


def scrape_article_details_cnn(driver, link):
    """Load a CNN Indonesia article and extract metadata from its JSON-LD.

    The first ``<script type="application/ld+json">`` element on the page is
    decoded and mapped onto the same ``content_*`` field names used for the
    Kompas records.

    Args:
        driver: Browser driver used to load and query the page.
        link: URL of the article.

    Returns:
        A dict that always contains ``'url'``. On success it also contains
        ``content_site``, ``content_editor``, ``content_type``,
        ``content_PublishedDate``, ``content_title`` and ``content_tags``;
        any field absent from the JSON-LD is ``'unknown'``. If extraction
        raises, the error is printed and only ``'url'`` is returned.
    """
    driver.get(link)
    time.sleep(5)  # fixed pause after navigation before querying the page
    details = {'url': link}

    try:
        script_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//script[@type='application/ld+json']"))
        )
        script_content = script_element.get_attribute('innerHTML')

        data = json.loads(script_content)

        # _ld_value tolerates a list at the root and list/string values for
        # author/publisher/video, which would make chained .get() calls fail.
        details.update({
            'content_site': _ld_value(data, 'publisher', 'name'),
            'content_editor': _ld_value(data, 'author', 'name'),
            'content_type': _ld_value(data, '@type'),
            'content_PublishedDate': _ld_value(data, 'datePublished'),
            'content_title': _ld_value(data, 'headline'),
            'content_tags': _ld_value(data, 'video', 'keywords')
        })

    except Exception as e:
        print(f"An error occurred while extracting details: {e}")

    return details

In [None]:
driver = Driver(uc=True)

# Step 1: gather article links from the tag listing pages.
all_nav_links = []
try:
    for page_num in range(46, 55):  # the end page is exclusive (pages 46..54)
        print(f"Scraping links from page {page_num}...")
        driver.get(f"https://www.cnnindonesia.com/tag/palestina?page={page_num}")
        all_nav_links += scrape_links_from_page_cnn(driver)
except Exception as e:
    print(f"Error while scraping links: {e}")
finally:
    driver.quit()

# Step 2: visit every collected link with a fresh driver and extract details.
all_data = []
driver = Driver(uc=True)
try:
    total_links = len(all_nav_links)
    for position, link in enumerate(all_nav_links, start=1):
        print(f"Scraping details from link {position}/{total_links}: {link}")
        try:
            all_data.append(scrape_article_details_cnn(driver, link))
        except Exception as e:
            print(f"Error scraping {link}: {e}")
finally:
    driver.quit()

# Step 3: dump the records to CSV; the header is the sorted union of all keys.
if not all_data:
    print("No data scraped to write to CSV.")
else:
    header = sorted({key for record in all_data for key in record})
    output_file = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/cnn/cnn_391.csv'
    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=header)
            writer.writeheader()
            writer.writerows(all_data)
        print(f"Data has been written to {output_file}")
    except Exception as e:
        print(f"Error writing to CSV: {e}")