In [None]:
from collections import defaultdict
from lxml import html
import pandas as pd
import re

In [None]:
# Load the public beacon export into a DataFrame.
beacon = pd.read_csv("data/jonas/beacon-public.csv")

In [None]:
# Keep only "active" rows: those reporting at least 5 records for 2020.
is_active = beacon["record_count_2020"] >= 5
beacon = beacon.loc[is_active].reset_index(drop=True)

In [None]:
# Narrow the frame down to these five columns.
kept_columns = ["oai_url", "set_spec", "context_name", "issn", "country_consolidated"]
beacon = beacon[kept_columns]

In [None]:
# The "issn" cell can hold several values separated by newlines; split once
# and expose the first two pieces as separate columns (missing pieces -> NaN).
issn_parts = beacon["issn"].str.split(r"\n")
beacon["issn_1"] = issn_parts.str.get(0)
beacon["issn_2"] = issn_parts.str.get(1)

In [None]:
# Normalise each OAI endpoint URL: keep only the part before the first
# whitespace character or "?" (which drops any query string), then strip
# trailing dots and slashes.
beacon["oai_url"] = (
    beacon["oai_url"]
    .str.split(r"\s|\?")
    .str.get(0)
    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    .str.replace(r"[\./]+$", "", regex=True)
)

In [None]:
# Matches the trailing OAI endpoint segment ("index/oai" or just "oai").
pattern = re.compile("index/oai$|oai$")


def create_url(row):
    """Return the journal URL for one row.

    The trailing OAI endpoint segment is removed from ``row["oai_url"]`` and
    ``row["set_spec"]`` is appended to what remains.
    """
    oai_endpoint = row["oai_url"]
    site_root = pattern.sub("", oai_endpoint)
    journal_path = row["set_spec"]
    return site_root + journal_path


# Derive each journal's URL row by row with create_url.
beacon["journal_url"] = beacon.apply(create_url, axis=1)

In [None]:
# Collect every known ISSN from both columns as strings. `set.update` adds
# the individual values; `set.add` would insert the generator object itself
# as a single element. Missing values are dropped first so the literal string
# "nan" never ends up in the set as an "ISSN".
issn_set = set(str(issn) for issn in beacon["issn_1"].dropna().tolist())
issn_set.update(str(issn) for issn in beacon["issn_2"].dropna().tolist())

In [None]:
# Number of entries collected in issn_set.
len(issn_set)

In [None]:
# issn_set is meant to hold only ISSN strings. Gather every entry that is not
# a string first, so the set is never modified while it is being iterated,
# then report and drop all of them.
non_string_entries = [entry for entry in issn_set if not isinstance(entry, str)]
for entry in non_string_entries:
    print(entry)
issn_set.difference_update(non_string_entries)

In [None]:
# Size of issn_set after the non-string clean-up.
len(issn_set)

In [None]:
# Regex for an embedded <metadata>...</metadata> record. `.+` is greedy, so a
# match runs from the first opening tag to the last closing tag in the text
# that is searched.
metadata_pattern = "<metadata>.+</metadata>"

In [None]:
# Harvest containers:
#   metaData     -- ISSN -> field name -> list of harvested values
#   record_count -- ISSN -> number of records harvested so far
metaData = defaultdict(lambda: defaultdict(list))

# Seed an explicit zero for every known ISSN.
record_count = defaultdict(int, dict.fromkeys(issn_set, 0))

In [None]:
# Number of ISSNs being tracked in record_count.
len(record_count)

In [None]:
# Probe the dump: stop at the first line containing a <metadata> record and
# leave its match object in `content` for inspection.
# NOTE(review): this reads "pkpbeacon.txt" from the working directory, while
# the full-processing cell reads "Data/pkpbeacon.txt" -- confirm which path is
# intended.
first_record_re = re.compile(metadata_pattern, re.MULTILINE | re.DOTALL)
with open("pkpbeacon.txt", "r") as f:
    for line in f:
        content = first_record_re.search(line)
        if content is not None:
            break

In [None]:
# Show the full text matched by metadata_pattern.
content.group(0)

In [None]:
import lxml.etree as etree
from io import StringIO

In [None]:
# Test string: one sample <metadata> record, used by the parsing experiments in the next cells.
metastr = "<metadata>\\n<oai_dc:dc xmlns:oai_dc=\\'http://www.openarchives.org/OAI/2.0/oai_dc/\\' xmlns:dc=\\'http://purl.org/dc/elements/1.1/\\' xmlns:xsi=\\'http://www.w3.org/2001/XMLSchema-instance\\' xsi:schemaLocation=\\'http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd\\'>\\n<dc:title xml:lang=\\'id-ID\\'>PEMBUKTIAN PERJANJIAN PIUTANG YANG TIDAK TERTULIS DALAM PERKARA KEPAILITAN DI PENGADILAN</dc:title>\\n<dc:creator>SUSILO, ANTONY FERNANDO</dc:creator>\\n<dc:subject xml:lang=\\'id-ID\\'/>\\n<dc:subject xml:lang=\\'id-ID\\'>Kepailitan; Debitur; Kreditur; Pengadilan Niaga</dc:subject>\\n<dc:description xml:lang=\\'id-ID\\'>Penulisan ini bertujuan untuk mengetahui bagaimana pembuktian perjanjian hutang tidak tertulis dalam perkara pailit di pengadilan dengan melakukan analisis yuridis berdasarkan perkara pailit perorangan Leo Wijaya Kusuma. Kasus pailit perorangan ini terjadi antara anggota keluarga,dimana debitur dan kreditor membuat perjanjian hutang tidak tertulis, dan pada waktu yang ditentukan debitur tidak melunasi pinjaman dengan baik, sehingga kreditor mengajukan permohonan pailit ke Pengadilan Niaga untuk menyelesaikan masalah tersebut. Penulisan ini dikaji dengan pendekatan studi yuridis normatif berdasarkan literatur-literatur dan data kepustakaan. 
Berdasarkan hasil penelitian ditemukan bahwa debitur akan dikenakan sanksi karena lalai membayar hutang dan Pengadilan Niaga juga mengabulkan permohonan pailit dengan mengirimkan somasi terhadap debitur.</dc:description>\\n<dc:publisher xml:lang=\\'en-US\\'>PMIH Untag Semarang</dc:publisher>\\n<dc:contributor xml:lang=\\'id-ID\\'/>\\n<dc:date>2009-07-10</dc:date>\\n<dc:type>info:eu-repo/semantics/article</dc:type>\\n<dc:type>info:eu-repo/semantics/publishedVersion</dc:type>\\n<dc:type xml:lang=\\'id-ID\\'/>\\n<dc:type xml:lang=\\'id-ID\\'/>\\n<dc:format>application/pdf</dc:format>\\n<dc:identifier>http://jurnal.untagsmg.ac.id/index.php/malrev/article/view/2080</dc:identifier>\\n<dc:identifier>10.35973/malrev.v2i1.2080</dc:identifier>\\n<dc:source xml:lang=\\'en-US\\'>MAGISTRA Law Review; Vol 2, No 01 (2021): MAGISTRA Law Review; 78-92</dc:source>\\n<dc:source xml:lang=\\'id-ID\\'>MAGISTRA Law Review; Vol 2, No 01 (2021): MAGISTRA Law Review; 78-92</dc:source>\\n<dc:source>2715-2502</dc:source>\\n<dc:source>10.35973/malrev.v2i1</dc:source>\\n<dc:language>ind</dc:language>\\n<dc:relation>http://jurnal.untagsmg.ac.id/index.php/malrev/article/view/2080/1423</dc:relation>\\n<dc:rights xml:lang=\\'en-US\\'>Copyright (c) 2021 MAGISTRA Law Review</dc:rights>\\n<dc:rights xml:lang=\\'en-US\\'>http://creativecommons.org/licenses/by/4.0</dc:rights>\\n</oai_dc:dc>\\n</metadata>"

In [None]:
# Parse the sample record with lxml's XML parser.
tree = etree.XML(metastr)

In [None]:
# Parse the sample record from an in-memory file object and serialise it
# back out, pretty-printed. `etree.parse` accepts a file-like source and
# returns an ElementTree, which provides the `getroot()` used below.
parser = etree.XMLParser()
tree = etree.parse(StringIO(metastr), parser)
result = etree.tostring(tree.getroot(), pretty_print=True, method="html")

In [None]:
# Inspect the serialised sample record.
print(result)  # test

In [None]:
# Full processing: walk the beacon dump line by line, parse each <metadata>
# record and harvest up to MAX_RECORDS_PER_ISSN records per known ISSN into
# `metaData`, tracking the per-ISSN totals in `record_count`.

MAX_RECORDS_PER_ISSN = 100  # stop harvesting an ISSN once it has this many records
HARVESTED_FIELDS = ("title", "description", "subject", "language")


def _find_field(article, field):
    """Return the descendant elements of `article` for one Dublin Core field.

    Both the bare tag name (e.g. ``source``) and the prefixed one
    (``dc:source``) are matched.
    NOTE(review): lxml's HTML parser appears to keep the ``dc:`` prefix as
    part of the tag name, in which case a plain ``.//source`` lookup finds
    nothing -- confirm against the real data.
    """
    return article.xpath(
        ".//*[name() = $bare or name() = $prefixed]",
        bare=field,
        prefixed="dc:" + field,
    )


with open("Data/pkpbeacon.txt", "r") as f:
    count = 0  # number of <metadata> records seen
    for line in f:
        content = re.search(metadata_pattern, line, re.MULTILINE | re.DOTALL)
        if content is None:
            continue

        # Every line is parsed into its own small tree, which is released as
        # soon as `tree` is rebound on the next iteration.
        tree = html.fromstring(content.group(0))
        for article in tree.xpath("//metadata"):
            count += 1

            # A record is filed under each of its <source> values that is a
            # known ISSN and has not reached the per-ISSN cap yet.
            for source in _find_field(article, "source"):
                issn = source.text
                if issn not in issn_set:
                    continue
                if record_count[issn] >= MAX_RECORDS_PER_ISSN:
                    continue
                record_count[issn] += 1

                # Creators are stored as one list per record.
                authors = [creator.text for creator in _find_field(article, "creator")]
                if authors:
                    metaData[issn]["creator"].append(authors)

                # The remaining fields are flattened: one entry per element.
                for field in HARVESTED_FIELDS:
                    for node in _find_field(article, field):
                        metaData[issn][field].append(node.text)

In [None]:
# Number of ISSNs for which at least one value was harvested.
len(metaData)

In [None]:
# import json
# with open('Data/beacon_metadata.json', 'w') as f:
# json.dump(metaData, f)

In [None]:
# metaDF = pd.DataFrame.from_dict(metaData, orient='index')

In [None]:
# metaDF.head()