In [None]:
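# Import the required libraries
# NOTE: `from datasets import Dataset` below rebinds the name `Dataset` (shadowing
# torch.utils.data.Dataset), and `import tqdm` binds the module, not the progress-bar class.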

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.optim import AdamW
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import Dataset
import tqdm
import pandas as pd



In [None]:
# Load the sentences and their labels
def load_tsv_ner(path):
    """Read a tab-separated NER file into parallel token and tag lists.

    Every non-blank line is expected to hold exactly two tab-separated
    fields, ``token<TAB>tag``. A blank line ends the current sentence.
    Lines that do not split into exactly two fields are reported on stdout
    and skipped.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, tags)`` tuple. ``sentences[i]`` is the list of tokens
        of the i-th sentence and ``tags[i]`` is the list of tags aligned with
        it, one tag per token.

    Raises:
        OSError: If the file cannot be opened.
    """
    sentences, tags = [], []
    tokens, labels = [], []

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise be glued to the first token of the file.
    with open(path, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()

            # A blank line is the sentence separator
            if not line:
                if tokens:
                    sentences.append(tokens)
                    tags.append(labels)
                    tokens, labels = [], []
                continue

            parts = line.split("\t")
            if len(parts) != 2:
                print("WARNING: format salah →", line)
                continue

            token, tag = parts
            tokens.append(token)
            labels.append(tag)

    # Flush the last sentence when the file does not end with a blank line
    if tokens:
        sentences.append(tokens)
        tags.append(labels)

    return sentences, tags

# Parse the corpus file, then report how many sentences were read.
corpus = load_tsv_ner("singgalang.tsv")
sentences, tags = corpus
print("Total sentences:", len(sentences))



In [None]:
# label Encoding
unique_tags = sorted(list(set(tag for seq in tags for tag in seq)))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

print("Label:", tag2id)


In [None]:
# Dataset class
# The tokenizer is created once at notebook level and looked up as a global.
TOKENIZER_CHECKPOINT = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

class NERDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes pre-split sentences and aligns tags.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    the bare name ``Dataset`` is rebound by ``from datasets import Dataset``
    in the import cell, and this class must be a plain torch dataset so that
    ``DataLoader`` indexes it one integer at a time.

    Args:
        sentences: List of sentences, each a list of word strings.
        tags: List of tag sequences aligned with ``sentences``.
        tokenizer: Optional tokenizer callable. When ``None`` the
            notebook-level ``tokenizer`` global is used at item-access time.
        tag2id: Optional tag-to-id mapping. When ``None`` the notebook-level
            ``tag2id`` global is used at item-access time.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, sentences, tags, tokenizer=None, tag2id=None, max_length=128):
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.tag2id = tag2id
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Encode sentence ``idx`` and return a dict of tensors.

        The dict holds every field produced by the tokenizer plus
        ``"labels"``: one id per position, where positions that map to no
        word (``word_ids()`` yields ``None``) are set to -100 and every
        other position carries the id of its word's tag.
        """
        words = self.sentences[idx]
        labels = self.tags[idx]

        # Fall back to the notebook-level globals when nothing was injected.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        label_map = self.tag2id if self.tag2id is not None else tag2id

        encoding = active_tokenizer(words,
                                    is_split_into_words=True,
                                    return_offsets_mapping=True,
                                    padding="max_length",
                                    truncation=True,
                                    max_length=self.max_length)

        # Every position belonging to a word inherits that word's tag.
        label_ids = [
            -100 if word_id is None else label_map[labels[word_id]]
            for word_id in encoding.word_ids()
        ]

        item = {k: torch.tensor(v) for k, v in encoding.items()}
        # Explicit int64: a platform-default NumPy int can be 32-bit, which
        # the loss function does not accept as class-index targets.
        item["labels"] = torch.tensor(label_ids, dtype=torch.long)

        return item


In [None]:
# Data loaders
TRAIN_FRACTION = 0.8  # leading share of the sentences used for training; the rest is validation
BATCH_SIZE = 16
SEED = 42

split = int(TRAIN_FRACTION * len(sentences))

train_dataset = NERDataset(sentences[:split], tags[:split])
val_dataset = NERDataset(sentences[split:], tags[split:])

# Seed the global RNG and hand the training loader its own seeded generator,
# so the shuffle order is the same on every Restart & Run All.
torch.manual_seed(SEED)
train_loader = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          generator=torch.Generator().manual_seed(SEED))
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
# Load IndoBERT for token classification
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One output class per distinct tag found in the corpus.
model = AutoModelForTokenClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=len(unique_tags),
)
model = model.to(device)

optimizer = AdamW(params=model.parameters(), lr=3e-5)

In [None]:
# Training loop
def _run_epoch(model, loader, desc, training):
    """Run one pass over ``loader`` and return the mean batch loss.

    When ``training`` is true, gradients are reset, back-propagated and
    applied through the notebook-level ``optimizer`` for every batch;
    otherwise only the forward pass is run. Returns NaN for an empty loader.
    """
    # `import tqdm` binds the module, which is not callable; resolve the
    # progress-bar class explicitly. The fallback keeps this working if the
    # name `tqdm` is ever rebound to the class itself.
    progress_bar = getattr(tqdm, "tqdm", tqdm)

    total_loss = 0.0
    num_batches = 0
    pbar = progress_bar(loader, desc=desc)

    for batch in pbar:
        if training:
            optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss

        if training:
            loss.backward()
            optimizer.step()

        # Read the scalar once; it is reused for the total and the display.
        loss_value = loss.item()
        total_loss += loss_value
        num_batches += 1
        pbar.set_postfix({"loss": loss_value})

    return total_loss / num_batches if num_batches else float("nan")


def train(model, train_loader, val_loader, epochs=3):
    """Fine-tune ``model`` and print train/validation loss after each epoch.

    Uses the notebook-level ``optimizer`` and ``device`` globals.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``.
        train_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors, used for optimisation.
        val_loader: Iterable of the same batch format, used for validation only.
        epochs: Number of full passes over ``train_loader``.
    """
    for epoch in range(epochs):
        print(f"\n===== Epoch {epoch+1}/{epochs} =====")

        # TRAINING
        model.train()
        avg_train_loss = _run_epoch(model, train_loader, "Training", training=True)

        # VALIDATION
        model.eval()
        with torch.no_grad():
            avg_val_loss = _run_epoch(model, val_loader, "Validating", training=False)

        print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

train(model, train_loader, val_loader, epochs=12)


In [None]:
# Model evaluation
def evaluate(model, loader):
    """Print a seqeval classification report and a confusion matrix.

    Uses the notebook-level ``device``, ``id2tag`` and ``unique_tags``
    globals. Positions whose label is -100 are excluded from both outputs.

    NOTE(review): sequences are compared position by position as delivered by
    ``loader``; if the dataset labels every sub-token of a word, the report is
    computed at sub-token level rather than word level.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        loader: Iterable of batches with ``"input_ids"``, ``"attention_mask"``
            and ``"labels"`` tensors.
    """
    model.eval()

    true_tags = []
    pred_tags = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # Labels are only compared on the host, so keep them on the CPU.
            labels = batch["labels"].cpu()

            outputs = model(input_ids,
                            attention_mask=attention_mask)

            # Move predictions to the CPU once per batch instead of reading
            # single elements from the device inside a Python loop.
            predictions = torch.argmax(outputs.logits, dim=-1).cpu()

            for label_row, pred_row in zip(labels, predictions):
                keep = label_row != -100
                true_tags.append([id2tag[i] for i in label_row[keep].tolist()])
                pred_tags.append([id2tag[i] for i in pred_row[keep].tolist()])

    print("\n===== Classification Report =====")
    print(classification_report(true_tags, pred_tags))

    # Flatten for the confusion matrix
    y_true_flat = [t for seq in true_tags for t in seq]
    y_pred_flat = [p for seq in pred_tags for p in seq]

    # Pin the label set: without `labels=` the matrix only covers tags that
    # actually occur, and would not match the `unique_tags` index/columns.
    matrix = confusion_matrix(y_true_flat, y_pred_flat, labels=unique_tags)

    print("\n===== Confusion Matrix =====")
    print(pd.DataFrame(matrix,
                       index=unique_tags,
                       columns=unique_tags))

evaluate(model, val_loader)