In [9]:
import spacy
from spacy.training.example import Example
import random
import json
import os
from sklearn.model_selection import train_test_split

In [10]:
# 1. Create a blank spaCy pipeline for the Indonesian language code ("id").
#    The NER component is attached to this object in a later cell.
nlp = spacy.blank("id")
print("Model kosong bahasa Indonesia berhasil dibuat.")

Model kosong bahasa Indonesia berhasil dibuat.


In [11]:
# 2. Obtain the NER component: reuse the one already registered in the
#    pipeline if present, otherwise add a fresh one.
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner")

In [12]:
# 3. Load the annotated data from the JSON file; the records live under the
#    top-level 'annotations' key.
with open("annotations.json", "r", encoding="utf-8") as annotations_file:
    data_json = json.loads(annotations_file.read())
ALL_DATA = data_json["annotations"]

In [13]:
# Split the data into training and validation sets (70% / 30%).
# random_state=42 fixes the shuffle so the same split is produced on every run.
TRAIN_DATA, VALID_DATA = train_test_split(ALL_DATA, test_size=0.3, random_state=42)
print(f"Data training: {len(TRAIN_DATA)}, Data validasi: {len(VALID_DATA)}")

Data training: 308, Data validasi: 133


In [14]:
# 4. Register every entity label that occurs in the training data with the
#    NER component.
for text, annotations in TRAIN_DATA:
    # A text without annotated entities may lack the "entities" key (or hold
    # None); treat both as "no entities" instead of failing on iteration.
    for ent in annotations.get("entities") or []:
        # Each entity is (start_char, end_char, label); only the label is needed.
        ner.add_label(ent[2])

print(f"Label yang ditambahkan: {ner.labels}")

Label yang ditambahkan: ('APP', 'DESC')


In [15]:
# 5. Train the model
n_iter = 100  # number of passes over the training data (epochs)
# Language.initialize() is the spaCy v3 name for the deprecated
# begin_training(); it initializes the pipeline and returns the optimizer.
optimizer = nlp.initialize()
best_f1 = 0.0  # best validation F1 seen so far; used to decide when to checkpoint
print("Pelatihan dimulai...\n")

# Simple evaluation function
def evaluate(nlp, data):
    """Compute entity-level precision, recall and F1 of ``nlp`` on ``data``.

    An entity counts as correct only when its start offset, end offset and
    label all match a gold entity exactly.

    Args:
        nlp: Callable that takes a text and returns an object whose ``ents``
            items expose ``start_char``, ``end_char`` and ``label_``.
        data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose optional ``"entities"`` value is an iterable of
            ``(start, end, label)`` triples. A missing key or ``None`` value
            is treated as "no gold entities".

    Returns:
        A ``(precision, recall, f1)`` tuple. Each value is ``0`` when its
        denominator is zero.
    """
    tp, fp, fn = 0, 0, 0
    for text, annotations in data:
        doc = nlp(text)
        # Normalise gold spans to tuples so they are hashable and comparable
        # with the predicted spans (JSON-loaded spans arrive as lists).
        gold_entities = {
            (start, end, label)
            for start, end, label in (annotations.get("entities") or [])
        }
        pred_entities = {
            (ent.start_char, ent.end_char, ent.label_) for ent in doc.ents
        }

        # Accumulate true positives, false positives and false negatives.
        tp += len(gold_entities & pred_entities)
        fp += len(pred_entities - gold_entities)
        fn += len(gold_entities - pred_entities)

    # Derive precision, recall and F1, guarding against division by zero.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

BATCH_SIZE = 4          # training examples per nlp.update() call
EVAL_EVERY = 10         # evaluate on VALID_DATA every this many iterations
MODEL_DIR = "./models2"  # directory that receives the checkpoints

for itn in range(n_iter):
    print(f"Iterasi {itn + 1}/{n_iter}")
    # Shuffle in place so each iteration sees the batches in a different order.
    random.shuffle(TRAIN_DATA)
    losses = {}

    # Update the model one mini-batch at a time.
    for batch_start in range(0, len(TRAIN_DATA), BATCH_SIZE):
        batch = TRAIN_DATA[batch_start:batch_start + BATCH_SIZE]
        examples = [
            Example.from_dict(nlp.make_doc(text), annotations)
            for text, annotations in batch
        ]
        # Pass the optimizer explicitly so it is clear which one drives the update.
        nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)

    # On non-evaluation iterations only the accumulated losses are reported.
    if (itn + 1) % EVAL_EVERY != 0:
        print(f"Losses: {losses}\n")
        continue

    # Evaluate on the validation set every EVAL_EVERY iterations.
    precision, recall, f1 = evaluate(nlp, VALID_DATA)
    print(f"Losses: {losses}")
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}\n")

    # Checkpoint the model only when validation F1 improves.
    if f1 > best_f1:
        best_f1 = f1
        # exist_ok=True avoids a separate check-then-create step.
        os.makedirs(MODEL_DIR, exist_ok=True)
        nlp.to_disk(f"{MODEL_DIR}/model_iter{itn+1}")
        print(f"Model disimpan pada iterasi {itn+1} dengan F1: {f1:.2f}")

Pelatihan dimulai...

Iterasi 1/100
Losses: {'ner': 10745.615066535403}

Iterasi 2/100
Losses: {'ner': 1827.2099767432207}

Iterasi 3/100
Losses: {'ner': 1534.5164348025128}

Iterasi 4/100
Losses: {'ner': 1309.395052368078}

Iterasi 5/100
Losses: {'ner': 1183.3702026741676}

Iterasi 6/100
Losses: {'ner': 1080.025978149087}

Iterasi 7/100
Losses: {'ner': 1033.196965510554}

Iterasi 8/100
Losses: {'ner': 944.6049244388108}

Iterasi 9/100
Losses: {'ner': 935.0724369803164}

Iterasi 10/100
Losses: {'ner': 819.2675656119445}
Precision: 0.51, Recall: 0.43, F1: 0.47

Model disimpan pada iterasi 10 dengan F1: 0.47
Iterasi 11/100
Losses: {'ner': 836.0087114968453}

Iterasi 12/100
Losses: {'ner': 773.426868588115}

Iterasi 13/100
Losses: {'ner': 631.4979940824516}

Iterasi 14/100
Losses: {'ner': 621.6796030196904}

Iterasi 15/100
Losses: {'ner': 625.3259441290302}

Iterasi 16/100
Losses: {'ner': 549.4975115280412}

Iterasi 17/100
Losses: {'ner': 521.0060689207726}

Iterasi 18/100
Losses: {'ner':

In [16]:
# Save the pipeline in its current state (after the last training iteration).
# NOTE(review): this is not necessarily the best-F1 checkpoint written during
# training; confirm which one should be used downstream.
# exist_ok=True makes directory creation a no-op when it already exists,
# without a separate check-then-create step.
os.makedirs("./models2", exist_ok=True)
nlp.to_disk("./models2/model_final")
print("Pelatihan selesai. Model final disimpan.")

Pelatihan selesai. Model final disimpan.
