In [50]:
# Import library yang dibutuhkan

import torch
from torch.utils.data import DataLoader, Dataset as TorchDataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.optim import AdamW
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm



In [None]:
# GPU / CUDA smoke test: report the PyTorch build, list the visible CUDA
# devices, and run one small matmul on the GPU to prove it actually computes.
import torch

print("=" * 70)
print("GPU/CUDA TEST")
print("=" * 70)
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA Devices: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"  Device {i}: {torch.cuda.get_device_name(i)}")
    print(f"Current Device: {torch.cuda.current_device()}")
    print(f"CUDA Version: {torch.version.cuda}")

    # Test GPU computation
    x = torch.randn(1000, 1000).cuda()
    y = torch.randn(1000, 1000).cuda()
    z = torch.mm(x, y)
    # CUDA kernels are launched asynchronously; wait for the matmul to finish
    # so that a failing kernel raises here instead of after "successful".
    torch.cuda.synchronize()
    print(f"GPU Memory Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print("✓ GPU test successful!")
    # The scratch tensors are only needed for the check above; drop them so
    # they do not keep GPU memory allocated for the rest of the session.
    del x, y, z
else:
    print("✗ GPU not available - using CPU")
    print("Run this in terminal to install CUDA support:")
    print("pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")

print("=" * 70)

# PyTorch dengan CUDA Setup

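**PENTING**: PyTorch dengan dukungan CUDA harus diinstal lewat terminal (lihat perintah `pip install` pada sel uji GPU di atas).
Setelah instalasi selesai (sekitar 5–10 menit), restart kernel notebook: tekan `Ctrl+Shift+P`, pilih "Jupyter: Restart Kernel", lalu jalankan ulang semua sel dari atas.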

In [51]:
# Load the tab-separated token/label file into a two-column frame.
# `names` supplies the column names, and with `names` given pandas treats the
# first row as data (made explicit here with header=None).
# keep_default_na=False keeps empty cells (and tokens such as "NA") as strings
# instead of converting them to NaN.
# NOTE(review): pandas' default quote handling is active; if the file can
# contain a bare `"` token, confirm rows are not being merged on read.
df = pd.read_csv(
    "data/SINGGALANG_oversampled.tsv",
    sep="\t",
    header=None,
    names=["word", "label"],
    keep_default_na=False,
)

In [53]:
# Group the token-per-row frame into sentences and normalise the raw labels.
#
# Produces:
#   sentences    - list of sentences, each a list of word strings
#   labels_list  - raw labels parallel to `sentences`
#   clean_labels - labels normalised to O / Person / Place / Organisation
#   flat_labels  - `clean_labels` encoded as ints through `label_map`
sentences = []
labels_list = []

temp_w, temp_l = [], []
end_tokens = [".", "?", "!"]

for w, l in zip(df["word"], df["label"]):

    # Skip rows without a token (NaN floats or empty strings).
    if isinstance(w, float) or w == "":
        continue

    temp_w.append(w)
    temp_l.append(l)

    # A terminator closes the sentence only when it follows at least one
    # token; a terminator at the start of the buffer just opens the next one.
    if w in end_tokens and len(temp_w) > 1:
        sentences.append(temp_w)
        labels_list.append(temp_l)
        temp_w, temp_l = [], []

# Flush a trailing sentence that has no terminator.
if temp_w:
    sentences.append(temp_w)
    labels_list.append(temp_l)

print("Total sentences:", len(sentences))

# Raw label spellings -> canonical tag name. An empty label is treated as
# "O": mapping it to "" produced a tag that is missing from `label_map`
# (KeyError below) and would add a spurious fifth class to the tag set.
label_fix = {
    "O": "O", "0": "O", ".": "O", "": "O",
    "Person": "Person", "1": "Person",
    "Place": "Place", "2": "Place",
    "Organisation": "Organisation", "3": "Organisation",
}

# NOTE(review): these ids follow a different order than the sorted-tag
# encoding (`tag2id`) used by the dataset class; do not decode `flat_labels`
# with `id2tag`.
label_map = {"O": 0, "Person": 1, "Place": 2, "Organisation": 3}

# Any spelling not listed in `label_fix` falls back to "O".
clean_labels = [
    [label_fix.get(lab, "O") for lab in seq]
    for seq in labels_list
]

flat_labels = [[label_map[x] for x in seq] for seq in clean_labels]

Total sentences: 48981


In [54]:
# label Encoding
unique_tags = sorted(list(set(tag for seq in clean_labels for tag in seq)))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

print("Label:", tag2id)
print("Unique tags:", unique_tags)


Label: {'O': 0, 'Organisation': 1, 'Person': 2, 'Place': 3}
Unique tags: ['O', 'Organisation', 'Person', 'Place']


In [55]:
# Dataset class
# Module-level IndoBERT tokenizer; NERDataset.__getitem__ reads this name.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="indobenchmark/indobert-base-p1",
)

class NERDataset(TorchDataset):
    """Token-classification dataset over pre-split sentences.

    Every item is tokenized on access with the module-level ``tokenizer``,
    padded/truncated to ``max_length``, and returned together with a
    ``labels`` tensor aligned to the sub-word tokens. Tag names are converted
    to ids through the module-level ``tag2id``.
    """

    def __init__(self, sentences, tags, max_length=128):
        """
        Args:
            sentences: sequence of sentences, each a list of word strings.
            tags: sequence of tag-name lists, parallel to ``sentences``.
            max_length: length every encoded sequence is padded or truncated
                to (default 128, the value previously hard-coded).
        """
        self.sentences = sentences
        self.tags = tags
        self.max_length = max_length

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Encode sentence ``idx``.

        Returns:
            dict of tensors: every field produced by the tokenizer plus
            ``labels`` (int64), where each sub-word token carries the id of
            its word's tag and special/padding tokens carry -100.
        """
        words = self.sentences[idx]
        labels = self.tags[idx]

        encoding = tokenizer(
            words,
            is_split_into_words=True,
            return_offsets_mapping=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

        # word_ids() yields one entry per token: the index of the source word,
        # or None for special and padding tokens. Those stay at -100, the
        # default ignore_index of torch.nn.CrossEntropyLoss.
        label_ids = [
            -100 if word_id is None else tag2id[labels[word_id]]
            for word_id in encoding.word_ids()
        ]

        item = {k: torch.tensor(v) for k, v in encoding.items()}
        # Force int64: a NumPy array created with dtype=int is int32 on some
        # platforms, and the cross-entropy loss requires Long targets.
        item["labels"] = torch.tensor(label_ids, dtype=torch.long)

        return item


In [56]:
# Data loaders
# The first 80% of the sentences (in file order) are used for training and
# the remaining 20% for validation.
split = int(len(sentences) * 0.8)

train_dataset = NERDataset(sentences=sentences[:split], tags=clean_labels[:split])
val_dataset = NERDataset(sentences=sentences[split:], tags=clean_labels[split:])

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [60]:
# Load IndoBERT for token classification: report the device, load the
# pretrained model, freeze the encoder and build the optimizer.
banner = "=" * 60
has_cuda = torch.cuda.is_available()

print(banner)
print("DEVICE INFORMATION")
print(banner)
print(f"CUDA Available: {has_cuda}")
if not has_cuda:
    print("GPU not available, using CPU")
else:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Memory: {gpu_props.total_memory / 1024**3:.2f} GB")

device = "cuda" if has_cuda else "cpu"
print(f"Using device: {device.upper()}")
print(banner)

# One output unit per tag found in the data.
model = AutoModelForTokenClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=len(unique_tags),
)
model = model.to(device)

# Freeze every encoder (BERT) weight ...
for frozen_param in model.bert.parameters():
    frozen_param.requires_grad = False

# ... and keep only the classification head trainable.
for head_param in model.classifier.parameters():
    head_param.requires_grad = True

# Count trainable vs. total parameters.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)
total_params = sum(size for size, _ in param_sizes)
print(f"Trainable params: {trainable_params:,} / Total params: {total_params:,}")
print(f"Trainable: {100 * trainable_params / total_params:.2f}%")

# Only parameters that still require gradients are handed to the optimizer.
# NOTE(review): 3e-5 is a fine-tuning-scale learning rate; with only the
# classifier head trainable a larger value may be needed — confirm.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=3e-5)

DEVICE INFORMATION
CUDA Available: False
GPU not available, using CPU
Using device: CPU


Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 3,076 / Total params: 123,853,828
Trainable: 0.00%


In [58]:
# Training loop
def _batch_loss(model, batch):
    """Move one batch to the module-level `device`, run the forward pass and
    return the loss tensor computed by the model."""
    outputs = model(
        batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["labels"].to(device),
    )
    return outputs.loss


def train(model, train_loader, val_loader, epochs=3):
    """Train `model` for `epochs` epochs, validating after each one.

    Uses the module-level `optimizer` for the updates and the module-level
    `device` for tensor placement. After every epoch the mean training and
    validation batch losses are printed.

    Args:
        model: token-classification model that returns an object with a
            `.loss` attribute when called with `labels`.
        train_loader: iterable of batches (dicts with "input_ids",
            "attention_mask" and "labels") used for optimisation.
        val_loader: iterable of batches used for validation only.
        epochs: number of passes over `train_loader`.

    Returns:
        None.
    """
    for epoch in range(epochs):
        print(f"\n===== Epoch {epoch+1}/{epochs} =====")

        # TRAINING
        model.train()
        train_loss, train_batches = 0.0, 0
        pbar = tqdm(train_loader, desc="Training")

        for batch in pbar:
            optimizer.zero_grad()

            loss = _batch_loss(model, batch)
            loss.backward()
            optimizer.step()

            # Read the scalar once; each .item() call forces a device sync.
            batch_loss = loss.item()
            train_loss += batch_loss
            train_batches += 1
            pbar.set_postfix({"loss": batch_loss})

        # An empty loader yields NaN instead of raising ZeroDivisionError.
        avg_train_loss = train_loss / train_batches if train_batches else float("nan")

        # VALIDATION
        model.eval()
        val_loss, val_batches = 0.0, 0
        pbar = tqdm(val_loader, desc="Validating")

        with torch.no_grad():
            for batch in pbar:
                batch_loss = _batch_loss(model, batch).item()
                val_loss += batch_loss
                val_batches += 1
                pbar.set_postfix({"loss": batch_loss})

        avg_val_loss = val_loss / val_batches if val_batches else float("nan")

        print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

# Run a single training epoch followed by one validation pass.
train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=1,
)



===== Epoch 1/1 =====


Training:   0%|          | 8/2449 [00:21<1:47:14,  2.64s/it, loss=1.39]



===== Epoch 1/1 =====


Training:   0%|          | 8/2449 [00:21<1:47:14,  2.64s/it, loss=1.39]


KeyboardInterrupt: 

In [None]:
# Model evaluation
def _to_iob2(tags):
    """Convert a plain tag sequence (e.g. "Person", "O") to IOB2.

    The first token of every run of identical non-"O" tags gets a "B-"
    prefix and the following ones "I-"; "O" is left unchanged.
    """
    converted = []
    previous = "O"
    for tag in tags:
        if tag == "O":
            converted.append("O")
        elif tag == previous:
            converted.append("I-" + tag)
        else:
            converted.append("B-" + tag)
        previous = tag
    return converted


def evaluate(model, loader):
    """Print a seqeval classification report and a token-level confusion
    matrix for `model` on `loader`.

    Uses the module-level `device`, `id2tag` and `unique_tags`. Positions
    whose label is -100 are excluded from both outputs.

    Args:
        model: token-classification model whose output exposes `.logits`.
        loader: iterable of batches (dicts with "input_ids",
            "attention_mask" and "labels").

    Returns:
        None.
    """
    model.eval()

    true_tags = []
    pred_tags = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids,
                            attention_mask=attention_mask)

            # Convert to Python lists once per batch instead of comparing
            # tensor elements one by one inside the loops below.
            predictions = torch.argmax(outputs.logits, dim=-1).tolist()
            labels = batch["labels"].tolist()

            for label_row, pred_row in zip(labels, predictions):
                true_seq = []
                pred_seq = []

                for gold_id, pred_id in zip(label_row, pred_row):
                    if gold_id == -100:
                        continue

                    true_seq.append(id2tag[gold_id])
                    pred_seq.append(id2tag[pred_id])

                true_tags.append(true_seq)
                pred_tags.append(pred_seq)

    # seqeval reads each tag as "<prefix>-<type>". Given an un-prefixed tag
    # it takes the first character as the prefix, so "Person" is reported as
    # "erson". Converting to IOB2 keeps the same entity spans (runs of equal
    # tags) while giving seqeval well-formed tags.
    print("\n===== Classification Report =====")
    print(classification_report([_to_iob2(seq) for seq in true_tags],
                                [_to_iob2(seq) for seq in pred_tags]))

    # Flatten for the confusion matrix
    y_true_flat = [t for seq in true_tags for t in seq]
    y_pred_flat = [p for seq in pred_tags for p in seq]

    # labels=unique_tags pins the row/column order and keeps the matrix
    # len(unique_tags) square even when a class is absent from both gold and
    # predicted tags; otherwise the DataFrame index below would not match.
    print("\n===== Confusion Matrix =====")
    print(pd.DataFrame(confusion_matrix(y_true_flat, y_pred_flat, labels=unique_tags),
                       index=unique_tags,
                       columns=unique_tags))

# Report metrics on the validation split.
evaluate(model=model, loader=val_loader)