# Crawling Link Dalam Page

In [1]:
import requests
from bs4 import BeautifulSoup

def crawl_website(url, timeout=10):
    """Fetch a page and print its headings and hyperlinks.

    The page is downloaded with ``requests``, parsed with BeautifulSoup's
    ``html.parser`` backend, and then every ``h1``/``h2``/``h3`` heading and
    every ``<a>`` tag that carries an ``href`` attribute is printed.

    Args:
        url: Address of the page to fetch.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument. Without a timeout an unresponsive server would block
            the call (and the notebook cell) indefinitely.

    Returns:
        None. Results are written to standard output. Request failures
        (including HTTP error status codes) are caught and reported with a
        printed message instead of being raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

        soup = BeautifulSoup(response.content, 'html.parser')

        # Print every h1, h2 and h3 heading in document order
        for heading in soup.find_all(['h1', 'h2', 'h3']):
            print(f"{heading.name}: {heading.get_text()}")

        # Print every anchor that actually has an href attribute
        for link in soup.find_all('a', href=True):
            print(f"URL: {link['href']} | Teks: {link.get_text()}")

    except requests.exceptions.RequestException as e:
        print(f"Terjadi kesalahan saat mengakses {url}: {e}")

# Use the function: crawl https://its.ac.id and print its headings and links
crawl_website("https://its.ac.id")

h3: Welcome To ITS
h2: ITS Promotes Indonesian Startups in Dubai
h3: Study at ITS
h2: ITS Spektronics Team Wins Two National Championships
h3: Research
h2: Drainage Blockage Detection Robot Developed by ITS Students
h1: 
            Experience more on institut teknologi sepuluh nopember         
h3: Scholarship
h3: PPID
h3: Admission
h3: Selection Path
h3: Tuition Fees
h3: Discover Passion
h3: 
ITS News

h3: 
ITS MULTI-CAMPUS

h2: SUKOLILO
h2: MANYAR
h2: COKROAMINOTO
h2: BUNCITAN
h3: 
Public Services

h3: 
AGENDAS

h3: 25
h3: 05
h3: 03
URL: https://www.its.ac.id/ | Teks: 


URL: https://www.its.ac.id/admission/ | Teks: Prospective Students
URL: https://www.its.ac.id/current-student/ | Teks: Current Students
URL: https://www.its.ac.id/fresher/ | Teks: New Student
URL: https://www.its.ac.id/lecturer-and-staff/ | Teks: Lecturers & Staffs
URL: https://www.its.ac.id/parents/ | Teks: Parents
URL: https://www.its.ac.id/alumni/ | Teks: Alumni
URL: http://danaabadi.its.ac.id/web/ | Teks: Donate

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
# Kita tidak butuh 'tabulate' lagi di Jupyter

def _strip_www(hostname):
    """Return ``hostname`` without a leading ``www.`` label, if it has one."""
    return hostname[len('www.'):] if hostname.startswith('www.') else hostname


def _is_internal_host(link_hostname, base_hostname):
    """Tell whether ``link_hostname`` is ``base_hostname`` or a subdomain of it."""
    # Require a dot boundary so that e.g. "notits.ac.id" does not match "its.ac.id".
    return (
        link_hostname == base_hostname
        or link_hostname.endswith('.' + base_hostname)
    )


def get_internal_links_df(url, timeout=10):
    """Crawl one page and return its internal links as a DataFrame.

    A link is internal when its hostname equals the page's hostname (with a
    leading ``www.`` ignored) or is a subdomain of it. Fragment-only links
    (``#...``) and ``javascript:`` links are skipped, as are hrefs that cannot
    be parsed as URLs or have no hostname (e.g. ``mailto:``).

    Args:
        url: Absolute URL of the page to crawl.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument so an unresponsive server cannot block forever.

    Returns:
        A ``pandas.DataFrame`` with the columns ``"Halaman Sumber"`` (the
        crawled page) and ``"Link Keluar (Internal)"`` (the absolute link),
        one row per internal link found. If ``url`` has no hostname or the
        request fails, a message is printed and an empty DataFrame with the
        same two columns is returned.
    """
    columns = ["Halaman Sumber", "Link Keluar (Internal)"]

    # A URL without scheme/host (e.g. "its.ac.id") yields hostname None;
    # report it the same way as a failed request instead of crashing.
    try:
        source_hostname = urlparse(url).hostname
    except ValueError:
        source_hostname = None
    if not source_hostname:
        print(f"Terjadi kesalahan saat mengakses {url}: hostname tidak ditemukan")
        return pd.DataFrame(columns=columns)
    base_hostname = _strip_www(source_hostname)

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Terjadi kesalahan saat mengakses {url}: {e}")
        return pd.DataFrame(columns=columns)  # Empty frame, same schema as success

    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', href=True)
    print(f"Memeriksa {len(links)} total link yang ditemukan di {url}...")

    scraped_data = []
    for link in links:
        href = link['href']
        if href.startswith(('#', 'javascript:')):
            continue
        try:
            # Both urljoin and urlparse can raise ValueError on malformed
            # hrefs (e.g. an unterminated IPv6 literal), so guard both.
            absolute_url = urljoin(url, href)
            link_hostname = urlparse(absolute_url).hostname
        except ValueError:
            continue
        if link_hostname and _is_internal_host(link_hostname, base_hostname):
            scraped_data.append([url, absolute_url])

    df = pd.DataFrame(scraped_data, columns=columns)
    print("DataFrame berhasil dibuat.")
    return df

In [3]:
# Page to crawl; the internal links found on it are stored in `hasil_df`
target_url = "https://its.ac.id"
hasil_df = get_internal_links_df(target_url)

Memeriksa 154 total link yang ditemukan di https://its.ac.id...
DataFrame berhasil dibuat.


In [4]:
# Show the collected links as this cell's output
hasil_df

Unnamed: 0,Halaman Sumber,Link Keluar (Internal)
0,https://its.ac.id,https://www.its.ac.id/
1,https://its.ac.id,https://www.its.ac.id/admission/
2,https://its.ac.id,https://www.its.ac.id/current-student/
3,https://its.ac.id,https://www.its.ac.id/fresher/
4,https://its.ac.id,https://www.its.ac.id/lecturer-and-staff/
...,...,...
119,https://its.ac.id,https://www.its.ac.id/about-its/visit-its/
120,https://its.ac.id,https://www.its.ac.id/about-its/its-awards/
121,https://its.ac.id,https://www.its.ac.id/?page_id=11494
122,https://its.ac.id,https://www.its.ac.id/about-its/executive-board/


In [5]:
# Write the collected links to CSV without the index column. The file name is
# kept in one variable so the confirmation message always names the file that
# was actually written (the old message said "link_internal.csv").
output_csv = "link_internal_its.csv"
hasil_df.to_csv(output_csv, index=False)
print(f"File berhasil disimpan ke {output_csv}")

File berhasil disimpan ke link_internal.csv
