In [1]:
import spacy
import random
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import json
# --- Configuration: everything a reader might want to tune -----------------
TRAIN_PATH = 'data_full.json'   # JSON list of {"text": ..., "entities": [[start, end, label], ...]}
MODEL_DIR = "model"             # directory the trained pipeline is written to
N_ITER = 300                    # number of full passes over the training data
DROPOUT = 0.5                   # dropout rate handed to nlp.update
SEED = 0                        # fixed seed so shuffling / initialisation are repeatable

# Seed Python's RNG (used by random.shuffle below) and the RNGs spaCy/thinc use.
random.seed(SEED)
spacy.util.fix_random_seed(SEED)

# --- Load the annotated data ------------------------------------------------
with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)
if not TRAIN_DATA:
    raise ValueError(f"{TRAIN_PATH} contains no training examples")

# --- Build a blank Indonesian pipeline with a single NER component ----------
nlp = spacy.blank("id")
ner = nlp.add_pipe("ner")
# Register every label that occurs in the annotations (add_label ignores repeats).
for item in TRAIN_DATA:
    for _start, _end, label in item['entities']:
        ner.add_label(label)

# Tokenise each text and attach its gold entities once, up front, instead of
# rebuilding identical Example objects on every epoch.
train_examples = [
    Example.from_dict(nlp.make_doc(item['text']), {"entities": item['entities']})
    for item in TRAIN_DATA
]

# --- Train -------------------------------------------------------------------
# Only the listed components are trained; anything else in the pipeline is
# switched off for the duration of the loop (a no-op for this blank pipeline).
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
# select_pipes / initialize are the spaCy v3 names for the deprecated
# disable_pipes / begin_training.
with nlp.select_pipes(disable=unaffected_pipes):
    optimizer = nlp.initialize(lambda: train_examples)
    for itn in range(N_ITER):
        # Shuffle the prepared examples, leaving TRAIN_DATA in its loaded order.
        random.shuffle(train_examples)
        losses = {}
        # Batch size starts at 4 and compounds by 1.001 per batch up to 32.
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Pass the optimizer explicitly so it is clear which one is stepping.
            nlp.update(batch, sgd=optimizer, drop=DROPOUT, losses=losses)
        print(f"Losses at iteration {itn}: {losses}")

# --- Persist -----------------------------------------------------------------
nlp.to_disk(MODEL_DIR)
print(f"Model saved to '{MODEL_DIR}'")




Losses at iteration 0: {'ner': 13967.142716713588}
Losses at iteration 1: {'ner': 10546.598069934958}
Losses at iteration 2: {'ner': 8770.66391412523}
Losses at iteration 3: {'ner': 7673.624405408091}
Losses at iteration 4: {'ner': 6981.402071063684}
Losses at iteration 5: {'ner': 6455.0341658454}
Losses at iteration 6: {'ner': 6132.434623955842}
Losses at iteration 7: {'ner': 5896.302248239558}
Losses at iteration 8: {'ner': 5588.071745699919}
Losses at iteration 9: {'ner': 5446.760442979574}
Losses at iteration 10: {'ner': 5193.543653198398}
Losses at iteration 11: {'ner': 5051.165867038846}
Losses at iteration 12: {'ner': 4928.54749017089}
Losses at iteration 13: {'ner': 4814.373721709857}
Losses at iteration 14: {'ner': 4713.375847702382}
Losses at iteration 15: {'ner': 4533.689125210166}
Losses at iteration 16: {'ner': 4379.505027068939}
Losses at iteration 17: {'ner': 4272.329638794174}
Losses at iteration 18: {'ner': 4304.534464695865}
Losses at iteration 19: {'ner': 4155.747701

In [5]:
# Sample Indonesian news article used as a smoke test for the trained pipeline.
text = """
Keamanan dan perlindungan data menjadi isu hangat usai Pusat Data Nasional (PDN) diretas beberapa waktu lalu. Sehingga, semua pihak termasuk perusahaan Badan Usaha Milik Negara (BUMN) perlu mewaspadai serangan ransomware.
Staf Khusus Menteri BUMN Arya Sinulingga mengatakan, Menteri BUMN Erick Thohir meminta semua perusahaan pelat merah memperkuat sistem keamanannya.

"Kita dengan kejadian PDN Ini memang Pak Menteri juga meminta kita semua memperkuat keamanan sistemnya semua," ujarnya di Jakarta, Kamis (4/7/2024).
Perusahaan negara di semua sektor mulai dari sektor perbankan, PT Pertamina (Persero), Telkom, dan yang lainnya diminta untuk tidak meremehkan keamanan sistem data.

"Apa pun ceritanya keamanan siber ini penting dan ini kejadian walaupun kemarin kita tidak kena di BUMN tapi harus jadi peringatan bagi semua untuk memperkuat," sebutnya.

Menurutnya, backup data perlu dilakukan untuk mengantisipasi hal yang tidak diinginkan di kemudian hari. "Backup mandatori lah itu yang harus dilakukan BUMN supaya kalau ada apa apa bisa menggantikan dengan cepat," pungkasnya.

"""

# Run the pipeline over the article and collect (surface text, label) pairs
# for every entity span it predicts, in document order.
doc = nlp(text)
found_entities = []
for span in doc.ents:
    found_entities.append((span.text, span.label_))
print("Entities", found_entities)


Entities [('Badan Usaha Milik Negara', 'ORGANIZATION'), ('BUMN', 'ORGANIZATION'), ('Arya Sinulingga', 'PERSON'), ('BUMN', 'ORGANIZATION'), ('Erick Thohir', 'PERSON'), ('Jakarta', 'LOCATION'), ('PT Pertamina (Persero)', 'ORGANIZATION'), ('Telkom', 'ORGANIZATION'), ('BUMN', 'ORGANIZATION'), ('BUMN', 'ORGANIZATION')]
