# PageRank Berita

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque  # Antrian (queue) yang efisien
import time
import sys
import pandas as pd

def _normalize_url(url):
    """Return *url* without its fragment and with an empty path replaced by '/'.

    'https://host' and 'https://host/' address the same resource; mapping both
    to one spelling keeps them from becoming two separate graph nodes.

    Raises:
        ValueError: if ``urlparse`` rejects the URL.
    """
    url = url.split('#', 1)[0]
    parsed = urlparse(url)
    if parsed.netloc and not parsed.path:
        return parsed._replace(path='/').geturl()
    return url


def _is_same_site(link_hostname, base_hostname):
    """Return True if *link_hostname* is *base_hostname* or one of its subdomains."""
    if not link_hostname:
        return False
    # Require a label boundary so 'notexample.com' does not match 'example.com'.
    return link_hostname == base_hostname or link_hostname.endswith('.' + base_hostname)


def build_pagerank_dataset(start_url, max_pages=50, delay=0.1):
    """Crawl a site breadth-first and build an internal-link edge list for PageRank.

    Starting at ``start_url``, pages are fetched one at a time. Every ``<a href>``
    that points to the same site (the start host without a leading 'www.', or any
    of its subdomains) is recorded as an edge ``[source page, target page]`` and
    queued for crawling.

    Args:
        start_url: Absolute http(s) URL where the crawl starts.
        max_pages: Maximum number of pages to visit. Pages that fail or turn out
            to be non-HTML still count toward this budget.
        delay: Seconds to sleep before each page is requested.

    Returns:
        A pandas DataFrame with the columns "Halaman Sumber" and
        "Link Keluar (Internal)", de-duplicated. It is empty when the start URL
        is invalid or no edge was found.
    """
    columns = ["Halaman Sumber", "Link Keluar (Internal)"]

    try:
        hostname = urlparse(start_url).hostname
        if not hostname:
            raise ValueError("Hostname tidak ditemukan. Pastikan URL diawali 'https://' atau 'http://'")
        # Strip only a leading 'www.'; str.replace would also mangle hosts that
        # merely contain that substring somewhere else.
        base_hostname = hostname[4:] if hostname.startswith('www.') else hostname
        first_url = _normalize_url(start_url)
    except (ValueError, AttributeError) as e:
        print(f"Error: URL awal tidak valid -> '{start_url}'")
        print(f"Detail: {e}")
        return pd.DataFrame(columns=columns)

    print(f"Memulai crawl dari: {start_url}")
    print(f"Domain target: {base_hostname}")
    print(f"Batas halaman: {max_pages}\n")

    IGNORED_EXTENSIONS = (
        '.png', '.jpg', '.jpeg', '.gif', '.svg', '.bmp', '.tiff', '.webp',
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
        '.zip', '.rar', '.gz', '.tar', '.7z',
        '.mp4', '.mkv', '.avi', '.mov', '.mp3', '.wav', '.ogg',
        '.css', '.js', '.xml', '.json', '.csv',
    )

    # URLs containing any of these path segments (e.g. '/berita/iklan/123') are skipped.
    IGNORED_PATH_SEGMENTS = frozenset(('iklan',))

    pages_to_visit = deque([first_url])
    # Every URL ever enqueued, so the same URL is not appended again and again.
    queued = {first_url}
    visited_pages = set()
    edge_list = []

    while pages_to_visit and len(visited_pages) < max_pages:
        current_url = pages_to_visit.popleft()

        if current_url in visited_pages:
            continue

        try:
            cleaned_url_path = urlparse(current_url).path
        except ValueError:
            print(f"  -> [SKIP] URL tidak valid: {current_url}")
            continue

        if cleaned_url_path.lower().endswith(IGNORED_EXTENSIONS):
            print(f"  -> [SKIP] Mengabaikan file (dari ekstensi): {current_url}")
            continue

        if not IGNORED_PATH_SEGMENTS.isdisjoint(cleaned_url_path.split('/')):
            print(f"  -> [SKIP] Mengabaikan path terlarang: {current_url}")
            continue

        visited_pages.add(current_url)
        print(f"[{len(visited_pages)}/{max_pages}] Crawling: {current_url}")

        try:
            time.sleep(delay)

            # Cheap HEAD probe first, so non-HTML resources are not downloaded.
            try:
                head_response = requests.head(current_url, timeout=3, allow_redirects=True)
                head_response.raise_for_status()
            except requests.exceptions.HTTPError as head_err:
                # The server answered with 4xx/5xx: give up on this page.
                print(f"  -> Gagal (HEAD): {current_url}: {head_err}")
                continue
            except requests.exceptions.RequestException:
                # Transport-level failure (timeout, connection reset, ...):
                # there is no response to inspect, so let the GET decide.
                head_response = None

            if head_response is not None:
                content_type = head_response.headers.get('Content-Type', '')
                if 'text/html' not in content_type.lower():
                    print(f"  -> [SKIP] Mengabaikan tipe konten non-HTML: {content_type}")
                    continue

            response = requests.get(current_url, timeout=5)
            response.raise_for_status()

            final_content_type = response.headers.get('Content-Type', '')
            if 'text/html' not in final_content_type.lower():
                print(f"  -> [SKIP] Mengabaikan tipe konten non-HTML (setelah GET): {final_content_type}")
                continue
        except requests.exceptions.RequestException as e:
            print(f"  -> Gagal mengakses (GET) {current_url}: {str(e)}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        for link in soup.find_all('a', href=True):
            href = link['href'].strip()

            # Skip empty hrefs, in-page anchors and non-navigational pseudo-schemes.
            if not href or href.lower().startswith(('#', 'javascript:', 'mailto:', 'tel:')):
                continue

            try:
                absolute_url = _normalize_url(urljoin(current_url, href))
                parsed_link = urlparse(absolute_url)
            except ValueError:
                # The discovered URL is malformed.
                continue

            # Only http(s) targets can be crawled.
            if parsed_link.scheme not in ('http', 'https'):
                continue

            # Links into ignored sections are neither recorded nor queued.
            if not IGNORED_PATH_SEGMENTS.isdisjoint(parsed_link.path.split('/')):
                continue

            if not _is_same_site(parsed_link.hostname, base_hostname):
                continue

            edge_list.append([current_url, absolute_url])
            if absolute_url not in queued:
                queued.add(absolute_url)
                pages_to_visit.append(absolute_url)

    print(f"\nCrawl selesai. Total {len(visited_pages)} halaman di-visit.")
    print(f"Total {len(edge_list)} link (edge) ditemukan.")

    if not edge_list:
        print("Tidak ada edge yang ditemukan. DataFrame akan kosong.")
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(edge_list, columns=columns)
    df = df.drop_duplicates().reset_index(drop=True)

    print(f"Dataset akhir memiliki {len(df)} edge.")

    return df

# --- Main entry point: run the crawl and export the edge list ---
if __name__ == "__main__":

    # Crawl configuration: seed URL and page budget.
    target_url = "https://bangsaonline.com/"
    batas_halaman = 500

    # Time the whole crawl with wall-clock timestamps.
    start_time = time.time()
    pagerank_df = build_pagerank_dataset(target_url, max_pages=batas_halaman, delay=0.1)
    end_time = time.time()
    duration = end_time - start_time

    if pagerank_df.empty:
        print("\nDataset akhir kosong, tidak ada file CSV yang disimpan.")
    else:
        # Show a preview, then persist the edge list as CSV.
        print("\n--- Contoh Hasil Dataset (Head) ---")
        print(pagerank_df.head(10))

        output_filename = "pagerank500_edges_berita_new.csv"
        pagerank_df.to_csv(output_filename, index=False)
        print(f"\nFile berhasil disimpan ke {output_filename}")

    print("\n--- Selesai ---")
    # Split the elapsed time into whole minutes and leftover seconds.
    whole_minutes, leftover_seconds = divmod(duration, 60)
    minutes = int(whole_minutes)
    seconds = int(leftover_seconds)
    print(f"Total waktu eksekusi: {minutes} menit {seconds} detik ({duration:.2f} detik)")

Memulai crawl dari: https://bangsaonline.com/
Domain target: bangsaonline.com
Batas halaman: 500

[1/500] Crawling: https://bangsaonline.com/
[2/500] Crawling: https://bangsaonline.com
[3/500] Crawling: https://bangsaonline.com/user/registration
  -> Gagal (HEAD): https://bangsaonline.com/user/registration: 500 Server Error: Internal Server Error for url: https://bangsaonline.com/user/registration
[4/500] Crawling: https://bangsaonline.com/feed/
  -> [SKIP] Mengabaikan tipe konten non-HTML: application/xml
[5/500] Crawling: https://bangsaonline.com/live
[6/500] Crawling: https://bangsaonline.com/kanal/jawa-timur
[7/500] Crawling: https://bangsaonline.com/kanal/jatim-metro
[8/500] Crawling: https://bangsaonline.com/kanal/jatim-tengah
[9/500] Crawling: https://bangsaonline.com/kanal/jatim-utara
[10/500] Crawling: https://bangsaonline.com/kanal/jatim-selatan
[11/500] Crawling: https://bangsaonline.com/kanal/jatim-timur
[12/500] Crawling: https://bangsaonline.com/kanal/jatim-barat
[13/500]