In [1]:
!pip install flask pyngrok flask-cors torch transformers scikit-learn pandas numpy

Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nv

In [4]:
# Step 1: Load & Preprocess NER Dataset
def read_ner_dataset(file_path):
    sentences = []
    labels = []
    with open(file_path, encoding='utf-8') as f:
        tokens = []
        tags = []
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens = []
                    tags = []
            else:
                splits = line.split()
                if len(splits) >= 2:
                    tokens.append(splits[0])
                    tags.append(splits[1])
        if tokens:
            sentences.append(tokens)
            labels.append(tags)
    return sentences, labels

ner_sentences, ner_labels = read_ner_dataset('dataset NER.txt')
print("NER samples:", len(ner_sentences))
print("Contoh:", ner_sentences[0], ner_labels[0])

NER samples: 1095
Contoh: ['Berapa', 'harga', 'kursi', 'rapat'] ['O', 'B-price', 'B-item', 'I-item']


In [5]:
# Step 2: Label Mapping NER
# unique_labels = sorted(list(set([l for label_seq in ner_labels for l in label_seq])))
# label2id = {label: i for i, label in enumerate(unique_labels)}
# id2label = {i: label for label, i in label2id.items()}
# print("Label2id:", label2id)

unique_labels = sorted(list(set([l for label_seq in ner_labels for l in label_seq])))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
print("Label2id:", label2id)

import json
with open('ner_label_mapping.json', 'w', encoding='utf-8') as f:
    json.dump({
        'label2id': label2id,
        'id2label': id2label
    }, f, ensure_ascii=False, indent=2)
print('✅ Mapping label NER berhasil disimpan ke ner_label_mapping.json')

Label2id: {'B-date': 0, 'B-item': 1, 'B-jabatan': 2, 'B-karyawan': 3, 'B-location': 4, 'B-price': 5, 'B-quantity': 6, 'B-sapaan': 7, 'B-status': 8, 'B-terimakasih': 9, 'I-item': 10, 'I-jabatan': 11, 'I-karyawan': 12, 'I-location': 13, 'I-price': 14, 'O': 15, 'O-item': 16}
✅ Mapping label NER berhasil disimpan ke ner_label_mapping.json


In [6]:
# Step 3: Dataset & DataLoader NER (DIPERBAIKI)
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

tokenizer_ner = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2')

class NERDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, label2id, max_length=128):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = self.sentences[idx]
        tags = self.labels[idx]

        encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        word_ids = encoding.word_ids(batch_index=0)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(self.label2id[tags[word_idx]])
            else:
                label = tags[word_idx]
                if label.startswith("B-"):
                    label = "I-" + label[2:]
                label_ids.append(self.label2id.get(label, self.label2id["O"]))
            previous_word_idx = word_idx

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label_ids)
        }

from sklearn.model_selection import train_test_split
train_sents, test_sents, train_tags, test_tags = train_test_split(
    ner_sentences, ner_labels, test_size=0.2, random_state=42
)

train_dataset_ner = NERDataset(train_sents, train_tags, tokenizer_ner, label2id)
test_dataset_ner = NERDataset(test_sents, test_tags, tokenizer_ner, label2id)
train_loader_ner = DataLoader(train_dataset_ner, batch_size=8, shuffle=True)
test_loader_ner = DataLoader(test_dataset_ner, batch_size=8, shuffle=False)

print("Train samples:", len(train_dataset_ner))
print("Test samples:", len(test_dataset_ner))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Train samples: 876
Test samples: 219


In [7]:
# Step 4: Model NER IndoBERT
from transformers import BertForTokenClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

import torch

model_ner = BertForTokenClassification.from_pretrained(
    'indobenchmark/indobert-base-p2',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_ner.to(device)
print(f"Model loaded on device: {device}")

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model loaded on device: cuda


In [8]:
# Step 5: Training NER IndoBERT
optimizer_ner = AdamW(model_ner.parameters(), lr=2e-5)
epochs = 3
total_steps = len(train_loader_ner) * epochs
scheduler_ner = get_linear_schedule_with_warmup(
    optimizer_ner, num_warmup_steps=0, num_training_steps=total_steps
)

model_ner.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader_ner:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model_ner(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        optimizer_ner.zero_grad()
        loss.backward()
        optimizer_ner.step()
        scheduler_ner.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader_ner)}")

# Save model
torch.save(model_ner.state_dict(), 'ner_model.pth')
print("NER model saved successfully!")

  return forward_call(*args, **kwargs)


Epoch 1, Loss: 0.4054729362103072
Epoch 2, Loss: 0.08455804219808091
Epoch 3, Loss: 0.046419860799373554
NER model saved successfully!


In [9]:
# Step 6: Evaluasi NER
from sklearn.metrics import classification_report
import numpy as np

model_ner.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader_ner:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model_ner(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        labels = labels.cpu().numpy()
        for p, l in zip(preds, labels):
            for pred_id, label_id in zip(p, l):
                if label_id != -100:
                    all_preds.append(pred_id)
                    all_labels.append(label_id)

unique_test_labels = sorted(list(set(all_labels)))
print(f"Unique labels in test data: {unique_test_labels}")
print(f"Total predictions: {len(all_preds)}")

target_names = [id2label[i] for i in unique_test_labels]

print("\n=== NER Classification Report ===")
print(classification_report(
    all_labels,
    all_preds,
    target_names=target_names,
    zero_division=0
))

accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
print(f"\nOverall Accuracy: {accuracy:.4f}")

from collections import Counter
label_counts = Counter(all_labels)
print(f"\nLabel distribution in test data:")
for label_id, count in label_counts.items():
    print(f"  {id2label[label_id]}: {count}")

Unique labels in test data: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15)]
Total predictions: 1507

=== NER Classification Report ===
               precision    recall  f1-score   support

       B-date       0.70      0.70      0.70        10
       B-item       0.98      0.99      0.99       196
    B-jabatan       1.00      1.00      1.00         1
   B-karyawan       0.93      0.98      0.95        41
   B-location       0.93      0.98      0.95        41
      B-price       0.98      0.98      0.98        57
   B-quantity       0.94      0.94      0.94        50
     B-sapaan       1.00      0.20      0.33         5
     B-status       0.93      1.00      0.96        27
B-terimakasih       1.00      0.67      0.80         3
       I-item       1.00      1.00      1.00       607
    I-jabatan       1.00      1.00   