In [14]:
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer, BertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch

In [15]:
def clean_text(text):
    """Strip non-alphanumeric characters from ``text`` and flatten line breaks.

    Every character other than ASCII letters, digits and whitespace is
    removed; newlines are then replaced by a single space and carriage
    returns are dropped.

    Args:
        text: Raw text. ``None`` and float NaN (the value pandas uses for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str`` before cleaning.

    Returns:
        The cleaned string (``''`` for missing input).
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    # ``text != text`` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Remove everything except ASCII letters, digits and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Replace newlines with spaces and drop carriage returns.
    text = text.replace('\n', ' ').replace('\r', '')
    return text

In [16]:
# Initialise the mBERT tokenizer from the 'bert-base-multilingual-cased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [17]:
def sliding_window_tokenize(text, max_length=512, stride=50):
    """Tokenize ``text`` into one or more overlapping windows of token ids.

    The text is encoded once without special tokens. If it fits in a single
    window it is returned as one segment; otherwise a window of
    ``max_length - 2`` tokens is moved over the ids in steps of ``stride``.
    Every segment is wrapped as ``[CLS] ... [SEP]`` so that no segment is
    longer than ``max_length``.

    Args:
        text: The string to tokenize with the module-level ``tokenizer``.
        max_length: Maximum segment length, including the two special tokens.
        stride: Number of tokens the window start advances between
            consecutive segments (a step size, not an overlap size).

    Returns:
        A list of segments, each a list of token ids. The list always holds
        at least one segment.

    Raises:
        ValueError: If ``stride`` is not positive or ``max_length`` leaves no
            room for content between the special tokens.
    """
    if stride <= 0:
        raise ValueError("stride must be a positive integer")
    # Two positions of every window are reserved for [CLS] and [SEP].
    window_size = max_length - 2
    if window_size <= 0:
        raise ValueError("max_length must be at least 3")

    # Encoding the full text may log a "sequence length is longer than the
    # specified maximum" warning; that is expected here because the ids are
    # split into capped windows below before they reach the model.
    initial_tokens = tokenizer.encode(text, add_special_tokens=False)
    total_tokens = len(initial_tokens)
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id

    if total_tokens <= window_size:
        # Fits in one window: no sliding needed, and no second tokenization.
        return [[cls_id] + initial_tokens + [sep_id]]

    window_segments = []
    for i in range(0, total_tokens, stride):
        window_segments.append([cls_id] + initial_tokens[i:i + window_size] + [sep_id])
        # Once a window reaches the last token, every later window would be a
        # strict suffix of it and would only duplicate the tail of the text.
        if i + window_size >= total_tokens:
            break

    return window_segments

In [18]:
# Helper that joins token ids back into a readable string
def tokens_to_string(tokens):
    """Decode a sequence of token ids into one string via the module-level tokenizer."""
    token_strings = tokenizer.convert_ids_to_tokens(tokens)
    return tokenizer.convert_tokens_to_string(token_strings)

In [19]:
# Load the dataset from the CSV file
df = pd.read_csv('Merged_data.csv')

In [20]:
# Text cleaning: apply clean_text to every value of the 'OCR_Text' column
df['cleaned_OCR_text'] = df['OCR_Text'].apply(clean_text)
# Sliding-window tokenization: each cleaned text becomes a list of token-id segments
df['tokenized_segments'] = df['cleaned_OCR_text'].apply(sliding_window_tokenize)

Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors


In [21]:
# Padding and attention mask
# Length that pad_segments pads or truncates every segment to
MAX_LEN = 512

def pad_segments(segments):
    """Pad or truncate every segment to ``MAX_LEN`` token ids.

    Both padding and truncation are applied at the end of a segment
    ('post'); the fill value is the ``pad_sequences`` default.
    """
    padded = pad_sequences(
        segments,
        maxlen=MAX_LEN,
        padding='post',
        truncating='post',
    )
    return padded

def create_attention_masks(padded_segments, pad_token_id=0):
    """Build attention masks for padded token-id segments.

    Args:
        padded_segments: Array-like of token ids (NumPy array or nested
            lists). Nested lists are converted to an array first; comparing a
            plain list against a scalar would otherwise yield a single
            boolean instead of an element-wise mask.
        pad_token_id: Id that marks padding positions. Defaults to 0.

    Returns:
        Integer NumPy array of the same shape as ``padded_segments`` holding
        1 where the id differs from ``pad_token_id`` and 0 elsewhere.
    """
    padded_segments = np.asarray(padded_segments)
    return np.where(padded_segments != pad_token_id, 1, 0)

In [22]:
# Pad each row's segments to MAX_LEN, then derive the matching attention masks
df['padded_segments'] = df['tokenized_segments'].apply(pad_segments)
df['attention_masks'] = df['padded_segments'].apply(create_attention_masks)

In [23]:
# Load the pretrained mBERT model and place it on the CPU
model = BertModel.from_pretrained('bert-base-multilingual-cased').to('cpu')

In [24]:
# Encode padded segments with mBERT
def bert_encode(segments, attention_mask, batch_size=None):
    """Return the position-0 hidden state of every segment.

    When the segments come from ``sliding_window_tokenize`` position 0 holds
    the [CLS] token, so each returned row is that segment's [CLS] embedding.

    Args:
        segments: Array-like of token ids with one row per segment.
        attention_mask: Array-like with the same shape as ``segments``
            (1 for real tokens, 0 for padding).
        batch_size: Optional number of segments per forward pass. ``None``
            (the default) sends all segments through the model at once;
            a smaller value bounds peak memory for rows with many segments.

    Returns:
        NumPy array with one row per segment, taken from the first output of
        the module-level ``model`` at sequence position 0.

    Raises:
        ValueError: If ``batch_size`` is given but smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    input_ids = torch.as_tensor(segments, dtype=torch.long)
    attention_mask = torch.as_tensor(attention_mask, dtype=torch.long)

    # Follow the model's device so the function keeps working if the model is
    # moved off the CPU.
    device = next(model.parameters()).device
    total = input_ids.shape[0]
    step = batch_size or max(total, 1)

    cls_vectors = []
    with torch.no_grad():
        # max(total, 1) keeps a single forward pass for empty input, as before.
        for start in range(0, max(total, 1), step):
            stop = start + step
            outputs = model(
                input_ids[start:stop].to(device),
                attention_mask=attention_mask[start:stop].to(device),
            )
            # outputs[0] is the last hidden state; keep only position 0.
            cls_vectors.append(outputs[0][:, 0, :].cpu())

    return torch.cat(cls_vectors, dim=0).numpy()

In [None]:
# Encode every row with mBERT: bert_encode yields one feature vector per segment
df['bert_features'] = df.apply(lambda row: bert_encode(row['padded_segments'], row['attention_masks']), axis=1)
# Average the per-segment vectors (axis 0) into a single feature vector per row
df['average_bert_features'] = df['bert_features'].apply(lambda x: np.mean(x, axis=0))

In [None]:
# Prepare the binary target: 1 for rows labelled 'Fraud', 0 for everything else.
# Vectorised comparison instead of a per-row Python lambda.
df['encoded_label'] = df['Label'].eq('Fraud').astype('int64')