# Link Scraping PTA Teknik Informatika

In [1]:
from collections import deque

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Root of the site being crawled; site-relative links are resolved against it.
BASE_URL = "https://pta.trunojoyo.ac.id"
# Entry point of the crawl: the listing page for program 10 (Teknik Informatika).
START_URL = f"{BASE_URL}/c_search/byprod/10"
# Request headers sent with every page fetch.
headers = {
    "User-Agent": "Mozilla/5.0",
}

In [3]:
def get_internal_links(url):
    """Return the de-duplicated internal links found on a single page.

    Fetches ``url`` using the module-level ``headers`` and collects every
    ``<a href>`` that is either site-relative (starts with a single "/") or
    an absolute URL located on ``BASE_URL``. Site-relative links are made
    absolute by prefixing ``BASE_URL``, and any "#fragment" is stripped.

    Args:
        url: Address of the page to fetch.

    Returns:
        list[str]: Absolute internal URLs, without duplicates, in the order
        they first appear on the page. An empty list when the request fails
        (the error is printed instead of raised).
    """
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        # Best-effort crawl: a page that cannot be fetched contributes no
        # links. Only request failures are handled here so that programming
        # errors are not silently reported as an empty page.
        print(f"Error akses {url}: {e}")
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("//"):
            # Protocol-relative URL: it names its own host, so prefixing
            # BASE_URL would produce a malformed address. Skip it.
            continue
        if href.startswith("/"):
            href = BASE_URL + href
        elif href.startswith(BASE_URL):
            # Require a real boundary after the host so that look-alike
            # hosts such as "<BASE_URL>.example.com" are not treated as
            # internal.
            rest = href[len(BASE_URL):]
            if rest and rest[0] not in "/?#":
                continue
        else:
            continue
        links.append(href.split("#")[0])

    # dict.fromkeys de-duplicates while keeping first-seen order, so repeated
    # runs return the links in the same order (a set would not).
    return list(dict.fromkeys(links))

def crawl_graph(start_url, max_pages=20):
    """Crawl internal links breadth-first and return the link graph's edges.

    Starting at ``start_url``, pages are fetched in first-in, first-out
    order. Every internal link found on a crawled page is recorded as an
    edge, and links not encountered before are queued for crawling. The
    crawl stops when the queue is empty or ``max_pages`` pages have been
    crawled.

    Args:
        start_url: URL of the first page to crawl.
        max_pages: Maximum number of pages to fetch.

    Returns:
        list[dict]: One ``{"source": ..., "target": ...}`` dict per link
        found, in crawl order. Links to pages that were already seen (and
        self-links) are still recorded as edges.
    """
    queue = deque([start_url])
    # Every URL that has ever been queued. Set membership is O(1), and a URL
    # is only queued once, so no page is crawled twice.
    seen = {start_url}
    edges = []
    pages_crawled = 0

    while queue and pages_crawled < max_pages:
        current = queue.popleft()
        pages_crawled += 1
        print(f"Crawling: {current}")

        for link in get_internal_links(current):
            edges.append({"source": current, "target": link})
            if link not in seen:
                seen.add(link)
                queue.append(link)

    return edges

# Run the graph crawler, starting from the Teknik Informatika listing page.
edges = crawl_graph(START_URL, max_pages=10)

# Collect the edges into a DataFrame and save it. The columns are named
# explicitly so the frame (and the spreadsheet) keeps its "source"/"target"
# header even when the crawl returns no edges.
df = pd.DataFrame(edges, columns=["source", "target"])
df.to_excel("pta_informatika.xlsx", index=False)

Crawling: https://pta.trunojoyo.ac.id/c_search/byprod/10
Crawling: https://pta.trunojoyo.ac.id/c_search/byprod/41
Crawling: https://pta.trunojoyo.ac.id/c_search/
Crawling: https://pta.trunojoyo.ac.id/c_contact/
Crawling: https://pta.trunojoyo.ac.id/c_search/byprod/11
Crawling: https://pta.trunojoyo.ac.id/welcome/detail/070411100070
Crawling: https://pta.trunojoyo.ac.id/c_search/byfac/99
Crawling: https://pta.trunojoyo.ac.id/c_search/byfac/100
Crawling: https://pta.trunojoyo.ac.id/c_search/byprod/35
Crawling: https://pta.trunojoyo.ac.id/c_search/byfac/6


In [4]:
# Display the collected edge list (one row per source -> target link).
df

Unnamed: 0,source,target
0,https://pta.trunojoyo.ac.id/c_search/byprod/10,https://pta.trunojoyo.ac.id/c_search/byprod/41
1,https://pta.trunojoyo.ac.id/c_search/byprod/10,https://pta.trunojoyo.ac.id/c_search/
2,https://pta.trunojoyo.ac.id/c_search/byprod/10,https://pta.trunojoyo.ac.id/c_contact/
3,https://pta.trunojoyo.ac.id/c_search/byprod/10,https://pta.trunojoyo.ac.id/c_search/byprod/11
4,https://pta.trunojoyo.ac.id/c_search/byprod/10,https://pta.trunojoyo.ac.id/welcome/detail/070...
...,...,...
575,https://pta.trunojoyo.ac.id/c_search/byfac/6,https://pta.trunojoyo.ac.id/c_search/byprod/26
576,https://pta.trunojoyo.ac.id/c_search/byfac/6,https://pta.trunojoyo.ac.id/welcome/detail/110...
577,https://pta.trunojoyo.ac.id/c_search/byfac/6,https://pta.trunojoyo.ac.id/c_search/byprod/14
578,https://pta.trunojoyo.ac.id/c_search/byfac/6,https://pta.trunojoyo.ac.id/c_search/byprod/27
