In [1]:
from bs4 import BeautifulSoup
from os import makedirs
from os.path import exists
from polars import DataFrame
from requests import get
from urllib3 import disable_warnings

# Silence urllib3's warnings for the whole session. The requests in this
# notebook are sent with verify=False (no TLS certificate verification), which
# urllib3 would otherwise report with a warning on each unverified request.
disable_warnings()


def scrape_link_eprint(urls, years, timeout=30):
    """Collect eprint links from the per-year listing page of every division.

    Each entry of ``urls`` is a URL template containing the literal text
    ``year``; every occurrence of that text is replaced by each value of
    ``years`` in turn and the resulting page is downloaded (TLS verification
    is disabled).  The anchors inside the
    ``ep_view_page ep_view_page_view_divisions`` container are harvested,
    skipping the first two.

    Args:
        urls: Iterable of URL templates containing the placeholder ``year``.
        years: Iterable of years (strings or ints) to substitute.
        timeout: Seconds to wait for the server before giving up on a page.

    Returns:
        list[str]: The non-empty, stripped ``href`` values in the order they
        were found.  A page that cannot be fetched, answers with an HTTP error
        status, or fails to parse is reported with ``print`` and skipped; this
        function does not raise for such pages.
    """
    # The first anchors of the container are skipped, as in the original
    # slice; presumably they are navigation links -- TODO confirm on a live page.
    leading_links_to_skip = 2
    container_class = "ep_view_page ep_view_page_view_divisions"
    link_eprint = []

    for url in urls:
        for year in years:
            # str() lets callers pass ints as well as strings.
            modified_url = url.replace("year", str(year))

            try:
                # Without a timeout a stalled server would block the run forever.
                response = get(modified_url, verify=False, timeout=timeout)
                # Report 4xx/5xx answers instead of silently parsing an error page.
                response.raise_for_status()

                soup = BeautifulSoup(response.content, "html.parser")
                target_div = soup.find("div", class_=container_class)

                if target_div is not None:
                    hrefs = [
                        link.get("href", "").strip()
                        for link in target_div.find_all("a")
                    ][leading_links_to_skip:]
                    # Anchors without an href produce "", which is not a URL.
                    link_eprint.extend(href for href in hrefs if href)

            except Exception as e:
                # Best effort: one broken listing must not stop the others.
                print(f"Error fetching data from {modified_url}: {e}")

    return link_eprint


def clean_text(text):
    """Collapse every run of whitespace (newlines included) into one space.

    Leading and trailing whitespace is dropped as a consequence of splitting
    on whitespace and re-joining the pieces.
    """
    # str.split() with no argument already treats "\n" as a separator.
    words = text.split()
    return " ".join(words)


def scrape_eprint(url, timeout=30):
    """Download one eprint page and extract its metadata.

    The page is fetched with TLS verification disabled and parsed with
    ``html.parser``.  Every extracted value is whitespace-normalised with
    ``clean_text``; a field whose element is absent from the page is returned
    as an empty string, including the title.

    Args:
        url: Address of the eprint page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict: Keys, in this order: ``url``, ``title``, ``abstract``,
        ``document_type``, ``subject``, ``unit_field``, ``user_id``,
        ``date_deposited``, ``last_modified``.

    Raises:
        Whatever ``get`` raises for network failures, including a timeout.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    def text_of(tag):
        """Return the cleaned text of ``tag``, or "" when the tag is missing."""
        return clean_text(tag.text) if tag is not None else ""

    def get_field(th_text):
        """Return the text of the <td> that follows the <th> labelled ``th_text``."""
        header = soup.find("th", string=th_text)
        cell = header.find_next("td") if header is not None else None
        return text_of(cell)

    return {
        "url": url,
        # A page without the title heading yields "" like any other missing
        # field instead of raising AttributeError.
        "title": text_of(soup.find("h1", class_="ep_tm_pagetitle")),
        # Looked up once; the first matching paragraph is used as the abstract.
        "abstract": text_of(soup.find("p", class_="ep_field_para")),
        "document_type": get_field("Tipe Dokumen:"),
        "subject": get_field("Subjek:"),
        "unit_field": get_field("Bidang, Unit, atau Jurusan Yang Ditujukan:"),
        "user_id": get_field("User ID Pengunggah:"),
        "date_deposited": get_field("Date Deposited:"),
        "last_modified": get_field("Last Modified:"),
    }


def scrape_and_save_data(url_divisions, years):
    """Scrape every eprint of the given divisions and years into one CSV file.

    The links are gathered with ``scrape_link_eprint``, each non-empty link is
    scraped with ``scrape_eprint``, and the resulting records are written to
    ``temp/repository_pnj_<first year><last year>.csv`` (the ``temp`` folder
    is created when missing).  The final path is printed.

    Args:
        url_divisions: URL templates passed on to ``scrape_link_eprint``.
        years: Non-empty sequence of years; its first and last items name the
            output file.

    Raises:
        IndexError: If ``years`` is empty.
    """
    folder_path = "temp"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    makedirs(folder_path, exist_ok=True)

    file_path = f"{folder_path}/repository_pnj_{years[0]}{years[-1]}.csv"

    records = []
    for url in scrape_link_eprint(url_divisions, years):
        if not url:
            continue
        try:
            records.append(scrape_eprint(url))
        except Exception as e:
            # One unreachable or malformed page must not discard the records
            # already collected; report it and carry on, as the listing
            # scraper does.
            print(f"Error fetching data from {url}: {e}")

    DataFrame(records).write_csv(file_path)

    print(f"Data has been saved to ./{file_path}")

In [2]:
scrape_and_save_data(
    # Every division listing shares one URL shape; only the id that follows
    # "prod=5F" differs.  The literal "year" in the file name is the
    # placeholder that scrape_link_eprint replaces with each requested year.
    url_divisions=[
        f"https://repository.pnj.ac.id/view/divisions/prod=5F{division}/year.default.html"
        for division in (
            "ab=5Fd3",  # Administrasi Bisnis D3
            "ab=5Fd4",  # Administrasi Bisnis D4
            "mice=5Fd4",  # MICE D4
            "ak=5Fd3",  # Akuntansi D3
            "ak=5Fd4",  # Akuntansi D4
            "kpb=5Fd3",  # Keuangan dan Perbankan D3
            "kpb=5Fd4",  # Keuangan dan Perbankan D4
            "kpbs=5Fd4",  # Keuangan dan Perbankan Syariah D4
            "mk=5Fd4",  # Manajemen Keuangan D4
            "mp=5Fd3",  # Manajemen Pemasaran D3
            "bispro=5Fd4",  # Bahasa Inggris untuk Komunikasi Bisnis dan Profesional
            "bm=5Fd4",  # Broadband Multimedia D4
            "iki=5Fd4",  # Instrumentasi dan Kontrol Industri D4
            "tei=5Fd3",  # Teknik Elektronika Industri D3
            # NOTE(review): unlike the other ids there is no "=5F" before
            # "d3" here -- confirm this id against the repository.
            "tld3",  # Teknik Listrik D3
            "toi=5Fd4",  # Teknik Otomasi Listrik Industri D4
            "tt=5Fd3",  # Teknik Telekomunikasi D3
            "dg=5Fd4",  # Desain Grafis D4
            "pj=5Fd3",  # Penerbitan / Jurnalistik D3
            "tg=5Fd3",  # Teknik Grafika D3
            "tick=5Fd4",  # Teknologi Industri Cetak Kemasan D4
            "ti=5Fd4",  # Teknik Informatika D4
            "tmd=5Fd4",  # Teknik Multimedia Digital D4
            "tmj=5Fd4",  # Teknik Multimedia dan Jaringan D4
            "man=5Fd4",  # Manufaktur D4
            "ptl=5Fd4",  # Pembangkit Tenaga Listrik D4
            "tmesin=5Fd3",  # Teknik Mesin D3
            # NOTE(review): the next two programmes are labelled D4 but their
            # ids end in "d3" -- confirm which of the two is correct.
            "ke=5Fd3",  # Teknologi Rekayasa Konversi Energi D4
            "alber=5Fd3",  # Teknologi Rekayasa Pemeliharaan Alat Berat D4
            "kg=5Fd3",  # Konstruksi Gedung D3
            "kg=5Fd4",  # Konstruksi Gedung D4
            "ks=5Fd3",  # Konstruksi Sipil D3
            "tpjj=5Fd4",  # Teknik Perancangan Jalan dan Jembatan D4
        )
    ],
    # "2021", "2022", "2023"
    years=[str(year) for year in range(2021, 2024)],
)

Data has been saved to ./temp/repository_pnj_20212023.csv
