In [None]:
!pip install flask pyngrok flask-cors torch transformers scikit-learn pandas numpy

Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nv

In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

def load_intent(file_path):
    """Parse a plain-text intent file into an intents dictionary.

    Each non-empty line is expected to look like ``<pattern> <tag>``: the last
    whitespace-separated token is the intent tag and everything before it is
    the example utterance. Any run of whitespace (spaces or tabs) is accepted
    as the separator, so a tab-delimited tag is not glued onto the last word
    of the pattern and the pattern never keeps trailing blanks.

    Blank lines and lines made of a single token (no pattern/tag pair) are
    skipped.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        ``{'intents': [...]}`` where every entry is
        ``{'tag': str, 'patterns': [str, ...], 'responses': [str]}``, listed
        in order of first appearance of each tag.
    """
    intent_dict = {}

    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            # strip() first: rsplit(None, 1) would otherwise keep leading
            # blanks on the pattern part.
            parts = line.strip().rsplit(None, 1)
            if len(parts) != 2:
                continue

            pattern, tag = parts
            # Create the entry on first sight of a tag, with a placeholder response.
            entry = intent_dict.setdefault(
                tag,
                {'tag': tag, 'patterns': [], 'responses': [f"Response untuk {tag}"]},
            )
            entry['patterns'].append(pattern)

    return {'intents': list(intent_dict.values())}

# Parse the intent file once; `intent_data` is reused by the following cells.
# The relative path is resolved against the notebook's working directory.
intent_data = load_intent('intent.txt')
# Report how many distinct intent tags were found.
print("Intent dataset loaded:", len(intent_data['intents']), "intents")

Intent dataset loaded: 9 intents


In [None]:
def prepare_intent_data(intent_data):
    """Flatten an intents dictionary into one sample per example utterance.

    Args:
        intent_data: Dict with an ``'intents'`` list; each entry provides a
            ``'tag'`` and a list of ``'patterns'``.

    Returns:
        A list of dicts with keys ``'text'`` (the pattern), ``'intent'`` (the
        tag) and ``'response'`` (a one-element list holding a default reply),
        ordered by intent and then by pattern.
    """
    def _samples_for(entry):
        # Read both fields up front so a malformed entry fails even when it
        # has no patterns.
        label = entry['tag']
        utterances = entry['patterns']
        return [
            {
                'text': utterance,
                'intent': label,
                'response': [f"Ini response untuk {label}"],  # default response
            }
            for utterance in utterances
        ]

    samples = []
    for entry in intent_data['intents']:
        samples.extend(_samples_for(entry))
    return samples

# Flatten the intents into one training row per example utterance.
intent_training_data = prepare_intent_data(intent_data)
print("Intent training samples:", len(intent_training_data))

# Hold out 20% as the final test set. Stratify on the intent tag so every
# intent keeps the same proportion in both parts, matching how the validation
# split is drawn from `train_intent` later in this notebook.
# (Stratification needs at least two examples per intent.)
train_intent, test_intent = train_test_split(
    intent_training_data,
    test_size=0.2,
    random_state=42,
    stratify=[item['intent'] for item in intent_training_data],
)
print("Train:", len(train_intent), "Test:", len(test_intent))

Intent training samples: 1087
Train: 869 Test: 218


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Load the tokenizer for the IndoBERT checkpoint.
# NOTE(review): the original inline note said "switch to p1", but the p2
# checkpoint is what is loaded here and for the model at the end of this cell;
# confirm which one is intended.
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')

# Intent tags in the order they appear in `intent_data`; a tag's position in
# this list is used as its integer class id by IntentDataset.
intent_labels = [intent['tag'] for intent in intent_data['intents']]
print("Intent labels:", intent_labels)

class IntentDataset(Dataset):
    """Torch dataset that tokenizes intent examples on access.

    Args:
        data: Sequence of example dicts, each with a ``'text'`` string and an
            ``'intent'`` tag.
        tokenizer: Callable tokenizer. It is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments and must return a mapping holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded or truncated to.
        labels: Ordered label names; a label's index in this sequence is its
            class id. When omitted, the notebook-level ``intent_labels`` list
            is used, which keeps existing three-argument calls working.
    """

    def __init__(self, data, tokenizer, max_length=128, labels=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        if labels is None:
            # Backward-compatible default: fall back to the global label list.
            labels = intent_labels
        self.label2id = {label: idx for idx, label in enumerate(labels)}

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (flattened to
            one dimension) and ``'intent_label'`` (scalar ``torch.long``).

        Raises:
            KeyError: If the example's intent is not among the known labels.
        """
        item = self.data[idx]
        text = item['text']
        intent = item['intent']

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        intent_label = self.label2id[intent]

        return {
            # The tokenizer output carries a leading batch dimension; flatten
            # it away so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'intent_label': torch.tensor(intent_label, dtype=torch.long)
        }

# Wrap the train/test splits so examples are tokenized on access.
train_dataset = IntentDataset(train_intent, tokenizer)
test_dataset = IntentDataset(test_intent, tokenizer)

# Batch the datasets; only the training loader is shuffled.
# NOTE(review): in the cells visible here, training iterates a separate
# `train_loader_split` built later, so `train_loader` appears unused; confirm.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load the pretrained encoder with a classification head sized to the number
# of intent labels.
intent_model = BertForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-base-p2',
    num_labels=len(intent_labels)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Intent labels: ['status_barang', 'range_harga', 'lelang_barang', 'jumlah_barang', 'sapaan', 'lokasi_barang', 'kepemilikan_barang', 'harga_barang', 'ucapan_terima_kasih']


pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
intent_model.to(device)

# Carve a validation set (20%) out of the training split, stratified by
# intent; the test split is left untouched.
train_intent_split, val_intent = train_test_split(
    train_intent, test_size=0.2, random_state=42,
    stratify=[item['intent'] for item in train_intent]
)

# Validation data is evaluated in a fixed order.
val_dataset = IntentDataset(val_intent, tokenizer, max_length=128)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# The reduced training split is what the training loop actually iterates over.
train_dataset_split = IntentDataset(train_intent_split, tokenizer, max_length=128)
train_loader_split = DataLoader(train_dataset_split, batch_size=16, shuffle=True)

optimizer = AdamW(intent_model.parameters(), lr=3e-5)
# One scheduler step is taken per batch, so the schedule length is
# batches-per-epoch times the epoch count. The literal 5 must stay in sync
# with `range(5)` in the training loop below.
total_steps = len(train_loader_split) * 5
# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def validate_model(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    The model is switched to evaluation mode and left in that mode; callers
    that continue training must call ``model.train()`` again.

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returning an object with ``loss``
            and ``logits`` attributes.
        val_loader: Iterable with a length, yielding dicts that hold
            ``'input_ids'``, ``'attention_mask'`` and ``'intent_label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_val_loss, val_accuracy)``: the mean of the per-batch losses and
        the fraction of correctly classified examples.
    """
    model.eval()

    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in val_loader:
            ids, mask, targets = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'intent_label')
            )

            result = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss_sum += result.loss.item()

            # The predicted class is the index of the largest logit per row.
            hits = result.logits.argmax(dim=1).eq(targets)
            n_correct += hits.sum().item()
            n_seen += targets.size(0)

    return loss_sum / len(val_loader), n_correct / n_seen

# Fine-tune the classifier, recording per-epoch training loss plus validation
# loss and accuracy.
intent_model.train()
train_losses = []
val_losses = []
val_accuracies = []

# The epoch count must match the factor used for `total_steps` above.
for epoch in range(5):
    # validate_model() leaves the model in eval mode, so switch back to
    # training mode at the start of every epoch.
    intent_model.train()
    total_train_loss = 0

    for batch in train_loader_split:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['intent_label'].to(device)

        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = intent_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Clear old gradients, backpropagate, update the weights, then advance
        # the learning-rate schedule (one scheduler step per batch).
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_train_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = total_train_loss / len(train_loader_split)
    train_losses.append(avg_train_loss)

    avg_val_loss, val_accuracy = validate_model(intent_model, val_loader, device)
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)

    print(f'Epoch {epoch+1}:')
    print(f'  Train Loss: {avg_train_loss:.4f}')
    print(f'  Val Loss: {avg_val_loss:.4f}')
    print(f'  Val Accuracy: {val_accuracy:.4f}')
    print('-' * 50)

# Persist the weights as they are after the final epoch (state dict only).
torch.save(intent_model.state_dict(), 'intent_model.pth')

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


Epoch 1:
  Train Loss: 0.8329
  Val Loss: 0.0735
  Val Accuracy: 1.0000
--------------------------------------------------
Epoch 2:
  Train Loss: 0.0721
  Val Loss: 0.0344
  Val Accuracy: 0.9943
--------------------------------------------------
Epoch 3:
  Train Loss: 0.0290
  Val Loss: 0.0217
  Val Accuracy: 1.0000
--------------------------------------------------
Epoch 4:
  Train Loss: 0.0217
  Val Loss: 0.0191
  Val Accuracy: 1.0000
--------------------------------------------------
Epoch 5:
  Train Loss: 0.0193
  Val Loss: 0.0187
  Val Accuracy: 0.9943
--------------------------------------------------


In [None]:
# Evaluate the fine-tuned model on the held-out test set.
intent_model.eval()  # evaluation mode
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['intent_label'].to(device)

        # No `labels` argument here: only the logits are needed.
        outputs = intent_model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Pin the full set of class ids. Without this, scikit-learn infers the classes
# from the data, so an intent missing from both the test split and the
# predictions would shrink the matrix and make `target_names` mismatch
# (raising a ValueError in classification_report).
label_ids = list(range(len(intent_labels)))

print("Confusion Matrix:")
print(confusion_matrix(true_labels, predictions, labels=label_ids))
print("\nClassification Report:")
print(classification_report(true_labels, predictions, labels=label_ids, target_names=intent_labels))

Confusion Matrix:
[[29  0  0  0  0  0  0  0  0]
 [ 0 18  0  0  0  0  0  0  0]
 [ 0  0 31  0  0  0  0  0  0]
 [ 0  0  0 28  0  0  0  0  0]
 [ 0  0  1  0 15  0  0  0  0]
 [ 0  0  0  0  0 21  0  0  0]
 [ 0  0  0  0  0  1 29  0  0]
 [ 0  0  0  0  0  0  0 25  0]
 [ 0  0  0  0  0  0  0  0 20]]

Classification Report:
                     precision    recall  f1-score   support

      status_barang       1.00      1.00      1.00        29
        range_harga       1.00      1.00      1.00        18
      lelang_barang       0.97      1.00      0.98        31
      jumlah_barang       1.00      1.00      1.00        28
             sapaan       1.00      0.94      0.97        16
      lokasi_barang       0.95      1.00      0.98        21
 kepemilikan_barang       1.00      0.97      0.98        30
       harga_barang       1.00      1.00      1.00        25
ucapan_terima_kasih       1.00      1.00      1.00        20

           accuracy                           0.99       218
          macr