In [1]:
import torch
import numpy as np
from transformers import BertTokenizer, BertConfig
from transformers import BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import random

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same value is applied to Python's ``random`` module, NumPy's global
    generator, and PyTorch (CPU generator plus all CUDA devices).

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU rather than only the current device.
    torch.cuda.manual_seed_all(seed)

set_seed(1)

Preprocess

In [3]:
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Building the Sastrawi stemmer and stop-word remover does not depend on the
# input text, so each object is created on first use and then reused instead
# of being rebuilt on every call to preprocessed().
_SASTRAWI_CACHE = {}


def _get_stemmer():
    """Return the shared Sastrawi stemmer, creating it on first use."""
    if "stemmer" not in _SASTRAWI_CACHE:
        _SASTRAWI_CACHE["stemmer"] = StemmerFactory().create_stemmer()
    return _SASTRAWI_CACHE["stemmer"]


def _get_stopword_remover():
    """Return the shared Sastrawi stop-word remover, creating it on first use."""
    if "stopword_remover" not in _SASTRAWI_CACHE:
        _SASTRAWI_CACHE["stopword_remover"] = (
            StopWordRemoverFactory().create_stop_word_remover()
        )
    return _SASTRAWI_CACHE["stopword_remover"]


def preprocessed(data):
    """Normalize a raw text string before tokenization.

    Steps, in order: lowercase, drop digits, drop ASCII punctuation, drop
    non-ASCII characters, collapse whitespace, stem with Sastrawi, then remove
    stop words with Sastrawi.

    Args:
        data: Raw input text (str).

    Returns:
        The cleaned, stemmed text with stop words removed.
    """
    # Lowercase the text
    data = data.lower()
    # Remove digits
    data = re.sub(r"\d+", "", data)
    # Remove punctuation
    data = re.sub(f"[{re.escape(string.punctuation)}]", "", data)
    # Remove special (non-ASCII) characters
    data = re.sub(r"[^\x00-\x7f]", "", data)
    # Collapse repeated whitespace and trim the ends
    data = re.sub(r"\s+", " ", data).strip()
    # Stemming
    data = _get_stemmer().stem(data)
    # Stop-word removal
    data = _get_stopword_remover().remove(data)

    return data

Fungsi Load Model

In [4]:
import os

def load_model(model, load_model_name="model", load_model_dir="E:/code/project-list/bert-hfacs/models/model_trained/"):
    """Load saved weights from ``<load_model_dir>/<load_model_name>.pth`` into ``model``.

    The state dict is mapped onto CUDA when it is available and onto the CPU
    otherwise. Loading never creates directories: a missing checkpoint is
    reported as an error instead of leaving an empty folder behind.

    Args:
        model: Module whose ``load_state_dict`` receives the saved weights.
        load_model_name: Checkpoint file name without the ``.pth`` extension.
        load_model_dir: Directory that contains the checkpoint file.

    Returns:
        The same ``model`` object, with the loaded weights.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """
    # Build the full checkpoint path: directory + name + ".pth"
    load_path = os.path.join(load_model_dir, load_model_name + ".pth")
    if not os.path.isfile(load_path):
        raise FileNotFoundError(f"Model checkpoint not found: {load_path}")

    # Map tensors to the GPU when available, otherwise to the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the weights (weights_only=True restricts unpickling to tensor data)
    state_dict = torch.load(
        load_path,
        weights_only=True,
        map_location=device,
    )
    model.load_state_dict(state_dict)
    print("Model Weight Loaded")
    return model

Load Model Fine-Tuned IndoBERT

In [5]:
# Local directory holding the base IndoBERT-large files (config, vocab, weights)
model_path = "E:/code/project-list/bert-hfacs/models/indobert_large"
config = BertConfig.from_pretrained(
    model_path,
    num_hidden_layers=24,
    num_attention_heads=16,
    hidden_size=1024,
    num_labels= 2
)

tokenizer = BertTokenizer.from_pretrained(model_path)

# from_pretrained warns that the classifier head is newly initialized; the
# load_model call below then loads the fine-tuned state dict into the model.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    config=config,
)

model = load_model(model, load_model_name="indobert_large_E10_LR1e-4_BS16")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Switch to evaluation mode so the dropout layers are disabled and repeated
# predictions on the same input are deterministic. eval() returns the model,
# so the cell still displays its architecture.
model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at E:/code/project-list/bert-hfacs/models/indobert_large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Weight Loaded


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [6]:
# Example input text
input_text = "Dari penelitian dan wawancara yang dilakukan di ketahui bahwa pengemudi yang bersangkutan kurang memiliki pengalaman dan keterampilan dalam menghadapi kondisi darurat terutama kendaraan yang terbakar"

In [7]:
# Clean the example text with preprocessed() and show the result
input_text_preprocessed = preprocessed(input_text)
print(input_text_preprocessed)

teliti wawancara laku tahu kemudi sangkut kurang milik alam terampil hadap kondisi darurat utama kendara bakar


In [8]:
# Class labels and their integer ids. The reverse lookup is derived from the
# forward one so the two mappings always agree.
LABEL2INDEX = {"UA": 0, "PRE": 1}
INDEX2LABEL = {index: label for label, index in LABEL2INDEX.items()}

# Short aliases
w2i = LABEL2INDEX
i2w = INDEX2LABEL
print(w2i)
print(i2w)

{'UA': 0, 'PRE': 1}
{0: 'UA', 1: 'PRE'}


In [13]:
# Tokenize the cleaned text. Truncate to the model's position-embedding limit
# so an overly long input cannot overflow the encoder.
subword_ids = tokenizer.encode(
    input_text_preprocessed,
    truncation=True,
    max_length=model.config.max_position_embeddings,
)
# Wrap the ids in a batch of size 1, created directly on the model's device
subwords = torch.tensor([subword_ids], dtype=torch.long, device=model.device)
print(subwords)

tensor([[    2,  8989,  5267,  5427,   899, 22110,  9940,    63,  1057,  2318,
           668, 14375, 26384,  1186,  8491,  1256,  1596,    85,  3456,     3]],
       device='cuda:0')


In [14]:
# Inference only: make sure dropout is off (eval() is idempotent) and skip
# autograd bookkeeping, which is not needed for a prediction.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
print(logits)

tensor([[ 0.2064, -0.1592]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [15]:
# Take the index of the single highest logit as the predicted class id
_, top_indices = torch.topk(logits, k=1, dim=-1)
label = top_indices.squeeze().item()
print(label)

0


In [16]:
import torch.nn.functional as F

# Softmax converts the logits to class probabilities; report the predicted
# class together with its probability as a percentage.
probabilities = F.softmax(logits, dim=-1).squeeze()
confidence_pct = probabilities[label] * 100
print(f'Text: {input_text} | Label : {i2w[label]} ({confidence_pct:.3f}%)')

Text: Dari penelitian dan wawancara yang dilakukan di ketahui bahwa pengemudi yang bersangkutan kurang memiliki pengalaman dan keterampilan dalam menghadapi kondisi darurat terutama kendaraan yang terbakar | Label : UA (59.041%)
