In [2]:
import requests
from bs4 import BeautifulSoup

def crawl_website(url, timeout=10):
    """Fetch a page and print its h1-h3 headings followed by its links.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        None. Headings are printed as ``<tag>: <text>`` and links as
        ``URL: <href> | Teks: <text>``. A failed request prints an error
        message instead of raising.
    """
    try:
        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions

        soup = BeautifulSoup(response.content, 'html.parser')

        # All h1/h2/h3 headings, in document order.
        for heading in soup.find_all(['h1', 'h2', 'h3']):
            # Collapse newlines/indentation inherited from the markup so each
            # heading stays on a single output line.
            heading_text = " ".join(heading.get_text().split())
            print(f"{heading.name}: {heading_text}")

        # All anchors that actually carry an href attribute.
        for link in soup.find_all('a', href=True):
            link_text = " ".join(link.get_text().split())
            print(f"URL: {link['href']} | Teks: {link_text}")

    except requests.exceptions.RequestException as e:
        print(f"Terjadi kesalahan saat mengakses {url}: {e}")

# Use the function: crawl https://its.ac.id and print its headings and links.
crawl_website("https://its.ac.id")

h3: Welcome To ITS
h2: ITS Wins Entrepreneurial Marketing Award
h3: Study at ITS
h2: ITS Student Achieves Top 3 in CoC Season 2
h3: Research
h2: ITS Professor Explores Plant Adaptation on Suboptimal Land
h1: 
            Experience more on institut teknologi sepuluh nopember         
h3: Scholarship
h3: PPID
h3: Admission
h3: Selection Path
h3: Tuition Fees
h3: Discover Passion
h3: 
ITS News

h3: 
ITS MULTI-CAMPUS

h2: SUKOLILO
h2: MANYAR
h2: COKROAMINOTO
h2: BUNCITAN
h3: 
Public Services

h3: 
AGENDAS

h3: 25
h3: 05
h3: 22
URL: https://www.its.ac.id/ | Teks: 


URL: https://www.its.ac.id/admission/ | Teks: Prospective Students
URL: https://www.its.ac.id/current-student/ | Teks: Current Students
URL: https://www.its.ac.id/fresher/ | Teks: New Student
URL: https://www.its.ac.id/lecturer-and-staff/ | Teks: Lecturers & Staffs
URL: https://www.its.ac.id/parents/ | Teks: Parents
URL: https://www.its.ac.id/alumni/ | Teks: Alumni
URL: http://danaabadi.its.ac.id/web/ | Teks: Donate
URL: https:

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def crawl_website(url, timeout=10):
    """Print every link on a page as ``<source page> | <absolute link>``.

    Args:
        url: Address of the page to download; it is also printed as the
            source page on every output line.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        None. A header, an 80-character separator and one line per
        ``<a href>`` element are printed. A failed request prints an error
        message instead of raising.
    """
    try:
        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions

        soup = BeautifulSoup(response.content, 'html.parser')

        print("Halaman Sumber | Link Keluar")
        print("-" * 80)  # separator line to keep the listing readable

        # Relative hrefs are relative to the page that was actually served,
        # which differs from `url` whenever the request was redirected.
        base_url = response.url or url

        # Only anchors that carry an href attribute.
        for link in soup.find_all('a', href=True):
            # urljoin turns a relative href (e.g. /admission) into an
            # absolute URL and leaves already-absolute hrefs untouched.
            absolute_url = urljoin(base_url, link['href'])
            print(f"{url} | {absolute_url}")

    except requests.exceptions.RequestException as e:
        print(f"Terjadi kesalahan saat mengakses {url}: {e}")

# --- Use the function: list the links found on https://its.ac.id ---
crawl_website("https://its.ac.id")

Halaman Sumber | Link Keluar
--------------------------------------------------------------------------------
https://its.ac.id | https://www.its.ac.id/
https://its.ac.id | https://www.its.ac.id/admission/
https://its.ac.id | https://www.its.ac.id/current-student/
https://its.ac.id | https://www.its.ac.id/fresher/
https://its.ac.id | https://www.its.ac.id/lecturer-and-staff/
https://its.ac.id | https://www.its.ac.id/parents/
https://its.ac.id | https://www.its.ac.id/alumni/
https://its.ac.id | http://danaabadi.its.ac.id/web/
https://its.ac.id | https://www.youtube.com/user/itseurekatv/live
https://its.ac.id | https://its.ac.id
https://its.ac.id | https://www.its.ac.id/id/beranda/
https://its.ac.id | javascript:;
https://its.ac.id | https://my.its.ac.id/
https://its.ac.id | https://its.ac.id
https://its.ac.id | https://www.its.ac.id/about-its/
https://its.ac.id | https://www.its.ac.id/about-its/facts-and-history/
https://its.ac.id | https://www.its.ac.id/about-its/ranking/
https://its.a

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

def crawl_and_save_to_csv(url, filename="output.csv", timeout=10):
    """Collect every link on a page and write them to a CSV file.

    The file receives a header row followed by one
    ``[url, absolute link]`` row per ``<a href>`` element, in document order.

    Args:
        url: Address of the page to download; also stored in the first
            column of every data row.
        filename: Path of the CSV file to create (overwritten if present).
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        None. Progress messages are printed. A failed request prints an
        error message instead of raising; errors while writing the file
        are not caught here.
    """
    try:
        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions

        soup = BeautifulSoup(response.content, 'html.parser')

        # Only anchors that carry an href attribute.
        links = soup.find_all('a', href=True)

        # Relative hrefs are relative to the page that was actually served,
        # which differs from `url` whenever the request was redirected.
        base_url = response.url or url

        # Resolve every row before touching the file so that a failure while
        # processing links cannot leave a half-written CSV behind.
        rows = [[url, urljoin(base_url, link['href'])] for link in links]

        print(f"Menemukan {len(links)} link. Menyimpan ke {filename}...")

        # newline='' lets the csv module control line endings (no blank rows);
        # utf-8 keeps non-ASCII characters in URLs intact.
        with open(filename, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Halaman Sumber", "Link Keluar"])  # column headers
            writer.writerows(rows)

        print(f"✅ Selesai! Data berhasil disimpan ke file '{filename}'")

    except requests.exceptions.RequestException as e:
        print(f"Terjadi kesalahan saat mengakses {url}: {e}")

# --- Use the function ---
# Choose the target URL and the name of the output file.
target_url = "https://its.ac.id"
output_filename = "hasil_scraping_its.csv"
crawl_and_save_to_csv(target_url, output_filename)

Menemukan 154 link. Menyimpan ke hasil_scraping_its.csv...
✅ Selesai! Data berhasil disimpan ke file 'hasil_scraping_its.csv'


In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd

def crawl_filter_display_and_save(url, filename="hasil_scraping.csv", timeout=10):
    """Crawl a page, keep only its internal links, show and save them.

    Steps:
    1. Download the page.
    2. Keep ONLY links whose host is the page's own domain (ignoring a
       leading ``www.``) or one of its subdomains.
    3. Print the result as a pandas DataFrame.
    4. Save the DataFrame to a CSV file.

    Args:
        url: Address of the page to download; also stored in the first
            column of every row.
        filename: Path of the CSV file to create.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        None. Results and status messages are printed. A failed request or
        a failed CSV write prints an error message instead of raising.
    """
    try:
        # Without a timeout, requests waits indefinitely on a silent server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Terjadi kesalahan saat mengakses {url}: {e}")
        return

    # Domain used for the "internal" comparison. hostname is None for URLs
    # without a network location, so fall back to an empty string. Only a
    # leading "www." is dropped; the same text elsewhere in the host is kept.
    base_hostname = urlparse(url).hostname or ""
    if base_hostname.startswith("www."):
        base_hostname = base_hostname[len("www."):]

    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)

    # Relative hrefs are relative to the page that was actually served,
    # which differs from `url` whenever the request was redirected.
    base_url = response.url or url

    print(f"Memeriksa {len(links)} total link yang ditemukan di {url}...")

    scraped_data = []
    for link in links:
        href = link['href']
        # Skip in-page anchors (#...) and javascript: pseudo-links.
        if href.startswith(('#', 'javascript:')):
            continue

        # Both urljoin and urlparse raise ValueError on malformed URLs
        # (e.g. an unbalanced IPv6 bracket), so both sit inside the try.
        try:
            absolute_url = urljoin(base_url, href)
            link_hostname = urlparse(absolute_url).hostname
        except ValueError:
            continue

        if not link_hostname or not base_hostname:
            continue  # mailto:, tel:, ... have no host to compare

        # Exact match or a true subdomain. The explicit "." boundary keeps
        # look-alike hosts such as "notits.ac.id" from matching "its.ac.id".
        if link_hostname == base_hostname or link_hostname.endswith("." + base_hostname):
            scraped_data.append([url, absolute_url])

    # --- DataFrame section ---
    df = pd.DataFrame(scraped_data, columns=["Halaman Sumber", "Link Keluar (Internal)"])

    print("\n--- Hasil Scraping (Filter Link Internal) ---")
    if df.empty:
        print("Tidak ada link internal yang ditemukan.")
        return  # nothing to save

    print(df)

    # --- CSV section ---
    # Best effort: report a failed write instead of aborting the notebook.
    try:
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"\n✅ Sukses! Data telah disimpan ke file '{filename}'")
    except Exception as e:
        print(f"\n❌ Gagal menyimpan file CSV: {e}")

# --- Use the function: keep only the internal links of https://its.ac.id ---
target_url = "https://its.ac.id"
output_filename = "link_internal_its.csv"
crawl_filter_display_and_save(target_url, output_filename)

Memeriksa 154 total link yang ditemukan di https://its.ac.id...

--- Hasil Scraping (Filter Link Internal) ---
        Halaman Sumber                             Link Keluar (Internal)
0    https://its.ac.id                             https://www.its.ac.id/
1    https://its.ac.id                   https://www.its.ac.id/admission/
2    https://its.ac.id             https://www.its.ac.id/current-student/
3    https://its.ac.id                     https://www.its.ac.id/fresher/
4    https://its.ac.id          https://www.its.ac.id/lecturer-and-staff/
..                 ...                                                ...
119  https://its.ac.id         https://www.its.ac.id/about-its/visit-its/
120  https://its.ac.id        https://www.its.ac.id/about-its/its-awards/
121  https://its.ac.id               https://www.its.ac.id/?page_id=11494
122  https://its.ac.id   https://www.its.ac.id/about-its/executive-board/
123  https://its.ac.id  https://www.its.ac.id/about-its/vision-and-mis...

