# Crawling PTA Trunojoyo

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re, sys, time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [8]:
# Base URL of the per-study-program listing; callers append "/<prodi_id>/<page>".
BASE_URL = "https://pta.trunojoyo.ac.id/c_search/byprod"

# Fungsi untuk menentukan jumlah halaman maksimal pada daftar skripsi setiap prodi di website

In [9]:
def get_max_page(prodi_id):
    """Return the number of listing pages available for one study program.

    Fetches page 1 of the listing for ``prodi_id`` and reads the page number
    from the last path segment of the pagination link whose text contains "»".

    Args:
        prodi_id: Identifier of the study program, used as a URL path segment.

    Returns:
        The page number found in the "»" link, or 1 when there is no such
        link, it has no ``href``, or its last path segment is not an integer.

    Raises:
        requests.RequestException: if the listing page cannot be fetched
            (including when the 10 second timeout expires).
    """
    url = f"{BASE_URL}/{prodi_id}/1"
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.content, "html.parser")

    # Match the "»" anchor in Python instead of the deprecated ``:contains()``
    # CSS pseudo-class, so the lookup does not depend on the soupsieve version.
    last_page = next(
        (a for a in soup.select("ol.pagination a") if "»" in a.get_text()),
        None,
    )
    if last_page is None or not last_page.get("href"):
        return 1

    # Tolerate a trailing slash before taking the last path segment.
    tail = last_page["href"].rstrip("/").rsplit("/", 1)[-1]
    try:
        return int(tail)
    except ValueError:
        return 1

# Fungsi untuk menampilkan progress bar pada terminal atau console

In [10]:
def print_progress(prodi_id, prodi, current_page, total_pages):
    """Redraw a single-line progress bar on standard output.

    The line starts with a carriage return so each call overwrites the
    previous one. A newline is written only once the last page is reached.

    Args:
        prodi_id: Identifier shown in square brackets at the start of the line.
        prodi: Study program name shown after the identifier.
        current_page: Page that has just been processed.
        total_pages: Total number of pages for this study program.
    """
    ratio = (current_page / total_pages) * 100
    width = 20
    done = int(width * current_page // total_pages)
    blocks = "".join(['█' * done, '-' * (width - done)])

    out = sys.stdout
    out.write(f'\r[{prodi_id}] {prodi} - Page {current_page}/{total_pages} [{blocks}] {ratio:.2f}%')
    out.flush()
    if current_page == total_pages:
        out.write('\n')

# Fungsi untuk mengambil detail isi skripsi dan fungsi untuk crawling seluruh data skripsi

In [11]:
# Helper: read one "Label : value" field from the <span> elements of a page.
def _extract_field(container, label):
    """Return the value part of the first ``<span>`` labelled ``label``.

    A span matches when the text before its first colon, with whitespace
    collapsed, equals ``label`` exactly, so "Dosen Pembimbing I" can never be
    confused with "Dosen Pembimbing II".

    Args:
        container: Parsed element whose descendant spans are searched.
        label: Field label expected before the colon.

    Returns:
        The text after the first colon, stripped of surrounding whitespace.

    Raises:
        LookupError: if no span carries that label.
    """
    for span in container.select("span"):
        head, sep, value = span.get_text().partition(":")
        if sep and " ".join(head.split()) == label:
            return value.strip()
    raise LookupError(f"field {label!r} not found")


# Fetch the details of a single thesis.
def crawl_detail(link_keluar, prodi):
    """Download one thesis detail page and extract its metadata.

    Args:
        link_keluar: URL of the detail page; the digits after ``/detail/``
            become the record id when present.
        prodi: Study program name stored with the record unchanged.

    Returns:
        A dict with the keys ``id``, ``penulis``, ``judul``, ``abstrak_id``,
        ``abstrak_en``, ``pembimbing_pertama``, ``pembimbing_kedua`` and
        ``prodi``; or ``None`` when the page cannot be fetched or parsed.
        Failures are reported on standard error rather than raised, so one
        bad page does not stop the surrounding crawl.
    """
    try:
        id_match = re.search(r"/detail/(\d+)", link_keluar)
        pta_id = id_match.group(1) if id_match else None

        response = requests.get(link_keluar, timeout=10)
        response.raise_for_status()
        soup1 = BeautifulSoup(response.content, "html.parser")
        isi = soup1.select_one('div#content_journal')
        if isi is None:
            raise LookupError("div#content_journal not found")

        title = isi.select_one('a.title')
        if title is None:
            raise LookupError("a.title not found")
        judul = title.text.strip()

        # All three fields share one parser, so the value is stripped the same
        # way whether or not the page puts a space after the colon.
        penulis = _extract_field(isi, "Penulis")
        pembimbing_pertama = _extract_field(isi, "Dosen Pembimbing I")
        pembimbing_kedua = _extract_field(isi, "Dosen Pembimbing II")

        # First justified paragraph is stored as the Indonesian abstract,
        # the second as the English one; either may be absent.
        paragraf = isi.select('p[align="justify"]')
        abstrak_id = paragraf[0].get_text(strip=True) if len(paragraf) > 0 else "N/A"
        abstrak_en = paragraf[1].get_text(strip=True) if len(paragraf) > 1 else "N/A"

        return {
            "id": pta_id,
            "penulis": penulis,
            "judul": judul,
            "abstrak_id": abstrak_id,
            "abstrak_en": abstrak_en,
            "pembimbing_pertama": pembimbing_pertama,
            "pembimbing_kedua": pembimbing_kedua,
            "prodi": prodi,
        }
    except Exception as e:
        # Worker boundary: keep the best-effort contract (return None) but say
        # which page was dropped and why instead of failing silently.
        sys.stderr.write(f"\n[skip] {link_keluar}: {e!r}\n")
        return None

def pta_all():
    """Crawl every study program listing and collect all thesis records.

    For study program ids 1 through 41 the function first asks
    ``get_max_page`` for the page count, then walks every listing page,
    fetches the linked detail pages concurrently through ``crawl_detail`` and
    accumulates the returned dicts. The result is written to ``pta_all.csv``
    and a short summary with the elapsed time is printed.

    A listing page that cannot be downloaded, or that lacks the expected
    heading, is skipped so a single failure does not discard the records
    already collected.

    Returns:
        pandas.DataFrame: one row per successfully crawled thesis.
    """
    start_time = time.time()
    results = []

    total_prodi = 41

    # First pass: find the page count of every study program.
    max_pages_dict = {
        prodi_id: get_max_page(prodi_id)
        for prodi_id in range(1, total_prodi + 1)
    }

    # One pool for the whole run instead of a new pool per listing page.
    with ThreadPoolExecutor(max_workers=10) as executor:
        for prodi_id, max_page in max_pages_dict.items():
            for page in range(1, max_page + 1):
                url = f"{BASE_URL}/{prodi_id}/{page}"
                try:
                    r = requests.get(url, timeout=10)
                except requests.RequestException as exc:
                    sys.stderr.write(f"\n[{prodi_id}] page {page} skipped: {exc!r}\n")
                    continue
                soup = BeautifulSoup(r.content, "html.parser")

                isii = soup.select_one('div#begin')
                heading = isii.select_one('h2') if isii else None
                if heading is None:
                    continue
                prodi = heading.text.strip().replace("Journal Jurusan ", "")

                # Keep only entries that actually expose a detail link.
                links = []
                for jurnal in soup.select('li[data-cat="#luxury"]'):
                    anchor = jurnal.select_one('a.gray.button')
                    if anchor is not None and anchor.get('href'):
                        links.append(anchor['href'])

                # Fetch the detail pages of this listing page in parallel.
                futures = [executor.submit(crawl_detail, link, prodi) for link in links]
                for future in as_completed(futures):
                    data = future.result()
                    if data:
                        results.append(data)

                # Progress bar.
                print_progress(prodi_id, prodi, page, max_page)

            sys.stdout.write("\n")

    df = pd.DataFrame(results)
    df.to_csv("pta_all.csv", index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    return df

In [12]:
# Run the full crawl: writes pta_all.csv and returns the collected DataFrame.
pta_all()

[1] Ilmu Hukum - Page 284/284 [████████████████████] 100.00%

[2] Teknologi Industri Pertanian - Page 114/114 [████████████████████] 100.00%

[3] Agribisnis - Page 110/110 [████████████████████] 100.00%

[4] Agroteknologi - Page 116/116 [████████████████████] 100.00%

[5] Ilmu Kelautan - Page 96/96 [████████████████████] 100.00%

[6] Ekonomi Pembangunan - Page 118/118 [████████████████████] 100.00%

[7] Manajemen - Page 207/207 [████████████████████] 100.00%

[8] Akuntansi - Page 177/177 [████████████████████] 100.00%

[9] Teknik Industri - Page 143/143 [████████████████████] 100.00%

[10] Teknik Informatika - Page 172/172 [████████████████████] 100.00%

[11] Manajemen Informatika - Page 56/56 [████████████████████] 100.00%

[12] Sosiologi - Page 136/136 [████████████████████] 100.00%

[13] Ilmu Komunikasi - Page 135/135 [████████████████████] 100.00%

[14] Psikologi - Page 104/104 [████████████████████] 100.00%

[15] Sastra Inggris - Page 133/133 [████████████████████] 100.00%

[16] E

Unnamed: 0,id,penulis,judul,abstrak_id,abstrak_en,pembimbing_pertama,pembimbing_kedua,prodi
0,080111100012,Dyah Ayu Citra Seza,Implementasi Fungsi Legislasi Dewan Perwakilan...,ABSTRAK\r\n\r\n Implementasi Fungsi Legi...,ABSTRACT\r\n Implementation of Legislati...,"Yudi Widagdo Harimurti, SH., MH","Safi', SH., MH",Ilmu Hukum
1,090111100077,TOMMY ADITYA PARLINDUNGAN MARBUN,PERLINDUNGAN HUKUM BAGI KONSUMEN ATAS PRODUK E...,Produk elektronik adalah suatu benda bergerak ...,Electronic products is an object moves through...,"DR. DJULAEKA, S.H., M.HUM","DR.USWATUN HASANAH, S.H., M. HUM",Ilmu Hukum
2,070111100060,Moh. Samsul Hidayat,Analisis Terhadap Kekosongan Hukum dalam Penga...,Kasus narkoba tidak henti-hentinya terdengar d...,"Drug cases endlessly heard on television, radi...","Tolib Effendi, SH., MH.","Agus Ramdlany, SH., MH.",Ilmu Hukum
3,080111100002,Maulina Nurlaily,Pertanggungjawaban Pidana Direksi BUMN (Perser...,Badan Usaha Milik Negara (BUMN) adalah Badan u...,State Owned Enterprises (SOEs) are business en...,"Tolib Effendi, SH., MH.","Dr. Eni Suastuti, SH., Mhum.",Ilmu Hukum
4,070111200007,RICA YENA IMADHORA,TELAAH KRITIS TENTANG ALASAN HUKUM YANG DIGUN...,,,"Dr. DENI SBY, S. H., M. S.","SAIFUL ABDULLAH, S. H., M. H.",Ilmu Hukum
...,...,...,...,...,...,...,...,...
14623,160281100006,Sri Rohyatiningsih,PENGARUH TUNJANGAN KINERJA TERHADAP PRODUKTIVI...,Seiring dengan adanya tunjangan kinerja bagi p...,Along with the performance allowance for emplo...,"Dr. Mohtar Rasyid, S.E., M.Sc","Dr. Anita Kristina, S.E., M.Si",Magister Ilmu Ekonomi
14624,160281100007,MUSOFFAN,Analisis Daya Dukung Dermaga Branta Pesisir S...,Dermaga Branta di Desa Branta Pesisir Kecamat...,Branta Port in Branta Pesisir Tlanakan Pamekas...,"Dr. Kurniyati Indahsari, M.Si","Dr. Agus Romadhon, SP. M.Si",Magister Ilmu Ekonomi
14625,170361100010,ahmad syaiful umam,KARAKTERISASI DAN KOLEKSI PLASMA NUTFAH UNTUK ...,Madura merupakan salah satu wilayah pemasok ko...,Madura is one of the regions supplying horticu...,"Dr. Ir. Gita Pawana, M.Si","Dr. Ir. Hj. SIti Fatimah, M.Si",Magister Pengelolaan Sumber Daya Alam
14626,170361100001,Siti Holifah,PENGOLAHAN LIMBAH AIR REBUSAN IKAN TERI MENJAD...,Ikan Teri perlu penanganan serius pasca panen ...,Anchovy needs serious handling after harvest b...,"Dr.Apri Arisandi,S.Pi.,M.Si.","Dr.Ir.H.Asfan,MP.",Magister Pengelolaan Sumber Daya Alam
