In [29]:
import spacy
from spacy.training.example import Example
import random
import json
import os
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report, f1_score
from seqeval.scheme import IOB2

In [30]:
# 1. Create a blank spaCy pipeline for Indonesian (language code "id").
#    A blank pipeline has no components yet; the NER component is added later.
nlp = spacy.blank("id")
print("Model kosong bahasa Indonesia berhasil dibuat.")

Model kosong bahasa Indonesia berhasil dibuat.


In [31]:
# 2. Attach the NER component, reusing the existing one when the cell is
#    re-run against a pipeline that already contains it.
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

In [32]:
# 3. Load the annotated samples from JSON; the list of samples is stored
#    under the top-level "annotations" key.
with open("annotations.json", encoding="utf-8") as f:
    data_json = json.load(f)

ALL_DATA = data_json["annotations"]

In [33]:
# Split the samples into training and validation sets (70% / 30%).
# random_state is fixed so the same split is produced on every run.
TRAIN_DATA, VALID_DATA = train_test_split(ALL_DATA, test_size=0.3, random_state=42)
print(f"Data training: {len(TRAIN_DATA)}, Data validasi: {len(VALID_DATA)}")

Data training: 308, Data validasi: 133


In [34]:
# 4. Register every entity label that occurs in the training data with the
#    NER component. Each entity is a (start, end, label) triple, so the label
#    sits at index 2.
for text, annotations in TRAIN_DATA:
    # A sample without an "entities" key (or with a null value) simply has no
    # entities; fall back to an empty tuple instead of iterating over None.
    for ent in annotations.get("entities") or ():
        ner.add_label(ent[2])

print(f"Label yang ditambahkan: {ner.labels}")

Label yang ditambahkan: ('APP', 'DESC')


In [35]:
# Number of passes over the training data.
n_iter = 100

# Fix spaCy's random seed so weight initialisation, shuffling and dropout are
# repeatable between runs (same value as the train/validation split seed).
spacy.util.fix_random_seed(42)

# `Language.initialize` is the spaCy v3 name for the deprecated
# `begin_training`; it initialises the pipeline and returns the optimizer.
optimizer = nlp.initialize()

# Best seqeval F1 seen so far on the validation set; used to decide when to
# save a checkpoint during training.
best_f1_tagging = 0.0
print("Pelatihan dimulai...\n")

# Approach 1: exact span matching
def evaluate_span(nlp, data):
    """Score predicted entities against gold spans by exact match.

    A prediction counts as correct only when its (start_char, end_char, label)
    triple is identical to a gold triple.

    Args:
        nlp: Callable that takes a text and returns an object whose ``ents``
            items expose ``start_char``, ``end_char`` and ``label_``.
        data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose optional ``"entities"`` value is a sequence of
            ``(start, end, label)`` triples.

    Returns:
        Tuple ``(precision, recall, f1)``. Each value is 0 when its
        denominator would be zero.
    """
    tp = fp = fn = 0
    for text, annotations in data:
        doc = nlp(text)
        # A missing or null "entities" entry means the sample has no gold
        # entities; iterating over None would raise a TypeError.
        gold = {
            (start, end, label)
            for start, end, label in annotations.get("entities") or ()
        }
        pred = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
        tp += len(gold & pred)
        fp += len(pred - gold)
        fn += len(gold - pred)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1


Pelatihan dimulai...



In [36]:
# Approach 2: BIO tagging scored with seqeval
def evaluate_bio(nlp, data):
    """Score the model at token level by comparing BIO tag sequences.

    For every sample the text is run through ``nlp``; the predicted tags are
    read from each token's ``ent_iob_`` / ``ent_type_`` and the gold tags are
    built by projecting the annotated character spans onto the same tokens.

    Args:
        nlp: Callable that takes a text and returns an iterable, sized
            sequence of tokens exposing ``idx``, ``ent_iob_`` and
            ``ent_type_``.
        data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose optional ``"entities"`` value is a sequence of
            ``(start, end, label)`` character-offset triples.

    Returns:
        Tuple ``(report, f1)``: the seqeval classification report as a dict
        and the seqeval F1 score.
    """
    true_entities_list = []
    pred_entities_list = []

    for text, annotations in data:
        doc = nlp(text)

        # Model predictions: tokens without an entity type are "O"; otherwise
        # the tag is "B-" for a token marked as entity start and "I-" for the rest.
        pred_tags = []
        for token in doc:
            if not token.ent_type_:
                pred_tags.append("O")
                continue
            prefix = "B" if token.ent_iob_ == "B" else "I"
            pred_tags.append(f"{prefix}-{token.ent_type_}")

        # Ground truth: a token belongs to a span when its start offset lies
        # inside [start, end). The first such token gets "B-", later ones "I-".
        # A missing or null "entities" entry is treated as no gold entities.
        true_tags = ["O"] * len(doc)
        for start, end, label in annotations.get("entities") or ():
            seen_first = False
            for token_idx, token in enumerate(doc):
                if start <= token.idx < end:
                    prefix = "I" if seen_first else "B"
                    true_tags[token_idx] = f"{prefix}-{label}"
                    seen_first = True

        pred_entities_list.append(pred_tags)
        true_entities_list.append(true_tags)

    # seqeval only applies `scheme` when mode="strict"; without it the IOB2
    # argument is ignored and the default (lenient) evaluation is used.
    report = classification_report(
        true_entities_list,
        pred_entities_list,
        mode="strict",
        scheme=IOB2,
        output_dict=True,
    )
    f1 = f1_score(true_entities_list, pred_entities_list, mode="strict", scheme=IOB2)
    return report, f1

In [37]:
# 7. Training loop: one shuffled pass over TRAIN_DATA per iteration, with a
#    validation run (and a checkpoint on improvement) every 10th iteration.
for itn in range(n_iter):
    print(f"Iterasi {itn + 1}/{n_iter}")
    random.shuffle(TRAIN_DATA)
    losses = {}

    # Update the model on mini-batches of four samples each.
    for offset in range(0, len(TRAIN_DATA), 4):
        examples = [
            Example.from_dict(nlp.make_doc(text), annotations)
            for text, annotations in TRAIN_DATA[offset:offset + 4]
        ]
        nlp.update(examples, drop=0.2, losses=losses)

    # Non-evaluation iterations only report the accumulated loss.
    if (itn + 1) % 10 != 0:
        print(f"Losses: {losses}\n")
        continue

    print(f"Losses: {losses}")

    # Evaluation 1: exact span matching
    p1, r1, f1_span = evaluate_span(nlp, VALID_DATA)
    print(f"[Span Matching] Precision: {p1:.2f}, Recall: {r1:.2f}, F1: {f1_span:.2f}")

    # Evaluation 2: BIO tagging
    report, f1_tagging = evaluate_bio(nlp, VALID_DATA)
    print(f"[BIO Tagging] F1: {f1_tagging:.2f}")
    print("Detail Classification Report:")
    for label, scores in report.items():
        # Skip the aggregate rows; only per-label scores are listed here.
        if label in ("micro avg", "macro avg", "weighted avg"):
            continue
        print(f"{label}: precision={scores['precision']:.2f}, recall={scores['recall']:.2f}, f1-score={scores['f1-score']:.2f}")

    # Checkpoint the pipeline whenever the BIO F1 beats the best seen so far.
    if f1_tagging > best_f1_tagging:
        best_f1_tagging = f1_tagging
        os.makedirs("./models5", exist_ok=True)
        model_dir = f"./models5/model_iter{itn+1}"
        nlp.to_disk(model_dir)
        print(f"Model disimpan pada iterasi {itn+1} dengan F1 BIO: {f1_tagging:.2f}")
    print("\n")

Iterasi 1/100
Losses: {'ner': 10773.590295147227}

Iterasi 2/100
Losses: {'ner': 1753.7035875022386}

Iterasi 3/100
Losses: {'ner': 1519.4452119455345}

Iterasi 4/100
Losses: {'ner': 1419.1594352971583}

Iterasi 5/100
Losses: {'ner': 1188.8073368972784}

Iterasi 6/100
Losses: {'ner': 1144.0515705734822}

Iterasi 7/100
Losses: {'ner': 1011.1055907290774}

Iterasi 8/100
Losses: {'ner': 897.9154077345764}

Iterasi 9/100
Losses: {'ner': 854.6104767243393}

Iterasi 10/100
Losses: {'ner': 791.3761571799857}
[Span Matching] Precision: 0.52, Recall: 0.40, F1: 0.45
[BIO Tagging] F1: 0.45
Detail Classification Report:
APP: precision=0.62, recall=0.50, f1-score=0.55
DESC: precision=0.24, recall=0.17, f1-score=0.20
Model disimpan pada iterasi 10 dengan F1 BIO: 0.45


Iterasi 11/100
Losses: {'ner': 858.5273676703397}

Iterasi 12/100
Losses: {'ner': 697.1362087055364}

Iterasi 13/100
Losses: {'ner': 666.6477624059769}

Iterasi 14/100
Losses: {'ner': 613.0548585387511}

Iterasi 15/100
Losses: {'ner':

In [38]:
# Ensure the output directory exists, then persist the pipeline in the state
# it has after the last training iteration.
os.makedirs("./models5", exist_ok=True)
nlp.to_disk("./models5/model_final")
print("Pelatihan selesai. Model final disimpan.")

Pelatihan selesai. Model final disimpan.
