# tugas 2


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
import concurrent.futures
import random # Impor modul random untuk jeda

# --- BALANCE TUNING ---
# Keep the number of worker threads low so the scraper is not too aggressive.
MAX_WORKERS = 2

# HTTP headers sent with every request made by this script.
# NOTE(review): the User-Agent value looks like a placeholder — presumably it
# is meant to be replaced with a real User-Agent string before running.
HEADERS = {"User-Agent": "ganti lek mu wak"}

# ... (the get_fakultas_prodi_list function is unchanged) ...
def get_fakultas_prodi_list():
    """Fetch the list of study programmes (prodi) grouped by faculty.

    Requests the "by faculty" search page and walks its sidebar navigation:
    every direct ``<li>`` of the first ``<ul>`` is read as a faculty, and every
    link inside that item's nested ``<ul>`` as a study programme.  The
    programme ID is the last path segment of the link's ``href``; links
    without an ``href`` or without a numeric last segment are skipped.

    Returns:
        list[dict]: One dict per programme, in page order, with the keys
        ``id_prodi`` (int), ``nama_prodi`` (str) and ``nama_fakultas`` (str).
        An empty list is returned when the request fails or when the sidebar
        (or its top-level list) is not present in the page.
    """
    url_nav = "https://pta.trunojoyo.ac.id/c_search/byfac"

    # Only the network call can raise RequestException, so keep the try small.
    try:
        r = requests.get(url_nav, timeout=10, headers=HEADERS)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Gagal mengambil daftar prodi: {e}")
        return []

    soup = BeautifulSoup(r.content, "html.parser")
    sidebar_nav = soup.select_one('div.box.sidebar_nav')
    if sidebar_nav is None:
        return []

    # The sidebar may exist without a list in it; treat that as "no data"
    # instead of failing on None.
    top_level_list = sidebar_nav.select_one('ul')
    if top_level_list is None:
        return []

    prodi_list = []
    for item_fakultas in top_level_list.find_all('li', recursive=False):
        anchor_fakultas = item_fakultas.find('a', recursive=False)
        if anchor_fakultas is None:
            continue
        nama_fakultas = anchor_fakultas.get_text(strip=True)

        ul_prodi = item_fakultas.find('ul')
        if ul_prodi is None:
            continue

        for link_prodi in ul_prodi.select('li a'):
            href = link_prodi.get('href')
            if not href:
                # An anchor without a target carries no programme ID.
                continue
            prodi_id = href.strip('/').split('/')[-1]
            if prodi_id.isdigit():
                prodi_list.append({
                    "id_prodi": int(prodi_id),
                    "nama_prodi": link_prodi.get_text(strip=True),
                    "nama_fakultas": nama_fakultas,
                })
    return prodi_list


def _extract_labeled_value(container, label):
    """Return the text after the first ':' of the ``<span>`` labelled *label*.

    A span whose text before the colon equals *label* exactly is preferred, so
    that "Dosen Pembimbing I" is not confused with "Dosen Pembimbing II".  If
    no exact match exists, the first span whose text merely contains *label*
    is used.  Returns ``None`` when no span matches.
    """
    fallback = None
    for span in container.find_all('span'):
        text = span.get_text()
        head, sep, value = text.partition(':')
        if not sep:
            # No "label : value" structure in this span.
            continue
        if head.strip() == label:
            return value.strip()
        if fallback is None and label in text:
            fallback = value.strip()
    return fallback


def scrape_jurnal_detail(jurnal_url):
    """Download one journal detail page and extract its metadata.

    Args:
        jurnal_url: URL of the journal detail page.

    Returns:
        dict | None: A dict with the keys ``penulis``, ``judul``,
        ``pembimbing_pertama``, ``pembimbing_kedua``, ``abstrak`` and
        ``abstrak_inggris``.  A field that cannot be found on the page is
        ``None`` (title, author, supervisors) or ``""`` (abstracts), so one
        malformed page does not raise and abort the caller.  ``None`` is
        returned when the request fails or the ``div#content_journal``
        container is missing.
    """
    # Small random pause before every detail request.
    time.sleep(random.uniform(0.5, 1.5))

    try:
        response = requests.get(jurnal_url, timeout=15, headers=HEADERS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Gagal mengambil detail jurnal {jurnal_url}: {e}")
        return None

    isi = BeautifulSoup(response.content, "html.parser").select_one('div#content_journal')
    if isi is None:
        return None

    title_link = isi.select_one('a.title')
    judul = title_link.get_text().strip() if title_link is not None else None

    # Spans are matched in Python rather than with the ':contains()' CSS
    # pseudo-class, which Soup Sieve has deprecated.
    penulis = _extract_labeled_value(isi, "Penulis")
    pembimbing_pertama = _extract_labeled_value(isi, "Dosen Pembimbing I")
    pembimbing_kedua = _extract_labeled_value(isi, "Dosen Pembimbing II")

    # The first justified paragraph is used as the Indonesian abstract and the
    # second as the English one; whitespace runs are collapsed to one space.
    abstract_paragraphs = isi.select('p[align="justify"]')
    abstracts = [re.sub(r'\s+', ' ', p.text).strip() for p in abstract_paragraphs[:2]]
    abstracts += [""] * (2 - len(abstracts))
    abstrak_indonesia, abstrak_inggris = abstracts

    return {
        "penulis": penulis, "judul": judul, "pembimbing_pertama": pembimbing_pertama,
        "pembimbing_kedua": pembimbing_kedua, "abstrak": abstrak_indonesia,
        "abstrak_inggris": abstrak_inggris
    }

# ... (The scrape_prodi and main functions remain the same as in the previous multithreading version) ...
def _collect_jurnal_urls(prodi):
    """Walk the paginated listing of one programme and return its detail URLs.

    Pages are requested starting at 1 until a page yields no link that has not
    been seen before, or until a request fails (the failure is printed).
    """
    jurnal_urls = []
    seen = set()
    page = 1
    while True:
        url = f"https://pta.trunojoyo.ac.id/c_search/byprod/{prodi['id_prodi']}/{page}"
        try:
            r = requests.get(url, timeout=10, headers=HEADERS)
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Report the failure so an interrupted listing is not mistaken for
            # a programme that simply has fewer (or no) journals.
            print(f"Gagal mengambil halaman {page} prodi {prodi['nama_prodi']} (ID: {prodi['id_prodi']}): {e}")
            break

        soup = BeautifulSoup(r.content, "html.parser")
        new_urls = []
        for a_tag in soup.select('li[data-cat="#luxury"] a.gray.button'):
            href = a_tag.get('href')
            if href and href not in seen:
                seen.add(href)
                new_urls.append(href)

        # Stop on an empty page, and also when a page only repeats links that
        # were already collected; otherwise a server that keeps answering
        # out-of-range page numbers with old content would loop forever.
        if not new_urls:
            break
        jurnal_urls.extend(new_urls)
        page += 1
    return jurnal_urls


def scrape_prodi(prodi):
    """Scrape every journal of one study programme.

    Args:
        prodi: Dict with the keys ``id_prodi``, ``nama_prodi`` and
            ``nama_fakultas``.

    Returns:
        list[dict]: One dict per successfully scraped journal (the fields
        returned by ``scrape_jurnal_detail`` plus the three programme keys).
        When no journal URL is found, a single placeholder row with
        ``judul`` set to ``'Tidak ada jurnal'`` and the remaining journal
        fields set to ``None`` is returned instead.
    """
    prodi_info = {
        'id_prodi': prodi['id_prodi'],
        'nama_prodi': prodi['nama_prodi'],
        'nama_fakultas': prodi['nama_fakultas'],
    }

    jurnal_urls = _collect_jurnal_urls(prodi)
    if not jurnal_urls:
        print(f"\n✔️ Selesai: {prodi['nama_fakultas']} - {prodi['nama_prodi']} (ID: {prodi['id_prodi']}) | Ditemukan 0 jurnal.")
        return [{
            **prodi_info,
            'judul': 'Tidak ada jurnal',
            'penulis': None,
            'pembimbing_pertama': None,
            'pembimbing_kedua': None,
            'abstrak': None,
            'abstrak_inggris': None,
        }]

    all_jurnal_data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # executor.map yields results in the order of jurnal_urls.
        for hasil in executor.map(scrape_jurnal_detail, jurnal_urls):
            if hasil:
                hasil.update(prodi_info)
                all_jurnal_data.append(hasil)

    print(f"\n✔️ Selesai: {prodi['nama_fakultas']} - {prodi['nama_prodi']} (ID: {prodi['id_prodi']}) | Berhasil mengambil {len(all_jurnal_data)} dari {len(jurnal_urls)} jurnal yang ditemukan.")
    return all_jurnal_data

def main():
    """Scrape all programmes, save the result as CSV and return it.

    Fetches the programme list, scrapes every programme on a thread pool of
    ``MAX_WORKERS`` threads, and assembles the rows into a DataFrame ordered
    by the programmes' order in the fetched list.  The DataFrame is written to
    ``pta_semua_jurnal_concurrent.csv`` and a short summary is printed.

    Returns:
        pandas.DataFrame: The scraped rows.  An empty DataFrame is returned,
        and no CSV is written, when the programme list is empty.
    """
    start_time = time.time()
    daftar_prodi = get_fakultas_prodi_list()
    if not daftar_prodi:
        return pd.DataFrame()

    final_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit everything up front and consume in submission order, so the
        # rows stay in programme order just like executor.map would give.
        futures = [(prodi, executor.submit(scrape_prodi, prodi)) for prodi in daftar_prodi]
        for prodi, future in futures:
            try:
                final_results.extend(future.result())
            except Exception as e:
                # Top-level boundary: one failing programme must not discard
                # the results already collected for all the others.
                print(f"Gagal memproses prodi {prodi['nama_prodi']} (ID: {prodi['id_prodi']}): {e}")

    df = pd.DataFrame(final_results)
    if not df.empty:
        # Categorical categories must be unique; dict.fromkeys removes
        # duplicate IDs while keeping first-seen order.
        original_prodi_order = list(dict.fromkeys(p['id_prodi'] for p in daftar_prodi))
        df['id_prodi'] = pd.Categorical(df['id_prodi'], categories=original_prodi_order, ordered=True)
        # A stable sort keeps the scraped order of journals within a programme.
        df = df.sort_values('id_prodi', kind='mergesort').reset_index(drop=True)
        df = df[['nama_fakultas', 'id_prodi', 'nama_prodi', 'judul', 'penulis', 'pembimbing_pertama', 'pembimbing_kedua', 'abstrak', 'abstrak_inggris']]

    df.to_csv("pta_semua_jurnal_concurrent.csv", index=False)

    end_time = time.time()
    print("\n\n✅ Proses scraping selesai.")
    print(f"Total baris data yang dihasilkan: {len(df)}.")
    print(f"Total waktu eksekusi: {end_time - start_time:.2f} detik.")

    return df

# if __name__ == "__main__":
#     df_hasil_semua = main()
#     df_hasil_semua


In [None]:
# Run the full scrape: writes pta_semua_jurnal_concurrent.csv and returns the DataFrame.
main()