In [22]:
import pandas as pd
import re
from transformers import BertTokenizer

In [23]:
def clean_text(text):
    """Normalize raw OCR text before tokenization.

    Every character that is not an ASCII letter, an ASCII digit, or
    whitespace is removed (not replaced by a space). Newlines are then
    turned into single spaces and carriage returns are dropped.

    Args:
        text: The raw text. Non-string values (for example ``None`` or the
            float NaN that pandas uses for empty CSV cells) are treated as
            missing text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-string input, which would abort a whole
    # Series.apply() on the first missing cell; map missing values to ''.
    if not isinstance(text, str):
        return ''
    # Strip everything except ASCII alphanumerics and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Newlines become spaces; carriage returns are removed outright so that
    # a '\r\n' pair collapses to a single space.
    return text.replace('\n', ' ').replace('\r', '')

In [24]:
# Initialize the multilingual BERT (mBERT) tokenizer from the
# 'bert-base-multilingual-cased' checkpoint; it is used as a module-level
# global by the tokenization helpers defined in later cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [30]:
def sliding_window_tokenize(text, max_length=512, stride=50):
    """Split ``text`` into windows of token ids that each fit ``max_length``.

    The text is tokenized once with the module-level ``tokenizer``. Every
    returned window is wrapped as ``[CLS] + tokens + [SEP]``, so at most
    ``max_length - 2`` content tokens go into each window.

    Args:
        text: The string to tokenize.
        max_length: Maximum number of ids per window, including the two
            special tokens. Must be greater than 2.
        stride: Number of tokens the window start advances on each step
            (the step between window starts, not the overlap size). Must be
            positive.

    Returns:
        A list of windows, each a list of token ids. Text that fits in one
        window yields a single-element list.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens or
            ``stride`` is not positive.
    """
    if max_length <= 2:
        raise ValueError("max_length must be greater than 2 to fit [CLS] and [SEP]")
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    # Tokenize once without special tokens; they are added per window below.
    initial_tokens = tokenizer.encode(text, add_special_tokens=False)
    total_tokens = len(initial_tokens)
    # Two slots of every window are reserved for [CLS] and [SEP].
    window_size = max_length - 2

    def _with_special_tokens(chunk):
        # BERT-style single-sequence layout: [CLS] tokens [SEP].
        return [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]

    # Compare against the content budget, not max_length: otherwise a text
    # of max_length - 1 or max_length tokens would overflow once wrapped.
    if total_tokens <= window_size:
        return [_with_special_tokens(initial_tokens)]

    window_segments = []
    for start in range(0, total_tokens, stride):
        window_segments.append(
            _with_special_tokens(initial_tokens[start:start + window_size])
        )
        # Once a window reaches the end of the text, every later window
        # would only be a shorter suffix of it, so stop here.
        if start + window_size >= total_tokens:
            break

    return window_segments

In [26]:
# Helper that turns a sequence of token ids back into a single string
def tokens_to_string(tokens):
    """Convert token ids to their token strings and join them into text.

    Uses the module-level ``tokenizer`` for both the id-to-token lookup and
    the token-to-string merge.
    """
    token_strings = tokenizer.convert_ids_to_tokens(tokens)
    return tokenizer.convert_tokens_to_string(token_strings)

In [27]:
# Load the dataset from a CSV file in the current working directory
df = pd.read_csv('validate_company_updated.csv')

In [31]:
# Text cleaning: apply clean_text to every value of the OCR_Text column
df['cleaned_OCR_text'] = df['OCR_Text'].apply(clean_text)
# Sliding-window tokenization: each row becomes a list of token-id windows
df['tokenized_segments'] = df['cleaned_OCR_text'].apply(sliding_window_tokenize)

In [32]:
# Decode every tokenized window back to text and join the windows of each
# row with single spaces
df['processed_text'] = df['tokenized_segments'].apply(
    lambda windows: ' '.join(map(tokens_to_string, windows))
)

In [33]:
# Preview the decoded text column (long values are truncated in the display)
df['processed_text']

0       [CLS] UWFO MALANG SUGARNUTS INDONESIA PROGRAM ...
1       [CLS] WFO JAKARTA PSI PSI JAKARTA PROGRAM MAGA...
2       [CLS] WFH FULLTIME Asia Land YES MUNGKIN INI B...
3       [CLS] HYBRID SEMARANG hercodigital OPEN INTERN...
4       [CLS] UWFH ONLINE KLTC Anude Jiraorshioifaliea...
                              ...                        
2889    [CLS] RECRUITMENT BeRKAS YANG DibawaH Perusaha...
2890    [CLS] PT SOLID WK WE ARE HIRING APPLY NOW Posi...
2891    [CLS] We af hiring Designo BAGIAN POTONG SAYUR...
2892    [CLS] InHouse Program MCF MAF Kami Sedang Mere...
2893    [CLS] Alfamart Alfamidi BERGERAKBERSAMADISABIL...
Name: processed_text, Length: 2894, dtype: object

In [34]:
# Show the first rows, including the columns added by the cells above
df.head()

Unnamed: 0,Image URL,Timestamp,Filename,OCR_Text,Perusahaan tidak jelas atau tidak ada informasi valid (40%),"Deskripsi posisi, persyaratan, dan lokasi kerja yang tidak jelas (25%)",Deskripsi pekerjaan tidak spesifik (Jobdesc) (20%),Tidak ada proses wawancara dan seleksi yang jelas(5%),Ada permintaan biaya pendaftaran(5%),Gaji tidak realistis (5%),Total,Label,Extracted_Companies,semantic_validate,cleaned_OCR_text,tokenized_segments,processed_text
0,https://scontent-xsp2-1.cdninstagram.com/v/t51...,2022-12-28T07:12:37.000Z,2022-12-28_post_lokermagang0_image.jpg,UWFO MALANG SUGARNUTS INDONESIA PROGRAM MAGANG...,0,25,20,0,0,0,45,Not Fraud,Sugarnutsco,,UWFO MALANG SUGARNUTS INDONESIA PROGRAM MAGANG...,"[[101, 158, 83695, 11403, 27277, 44376, 34065,...",[CLS] UWFO MALANG SUGARNUTS INDONESIA PROGRAM ...
1,https://scontent-xsp1-3.cdninstagram.com/v/t51...,2022-12-26T08:03:04.000Z,2022-12-26_post_lokermagang1_combined.jpg,WFO JAKARTA PSI PSI JAKARTA PROGRAM MAGANG PSI...,0,0,20,0,0,0,20,Not Fraud,PSI,,WFO JAKARTA PSI PSI JAKARTA PROGRAM MAGANG PSI...,"[[101, 160, 11565, 11403, 53399, 67813, 46935,...",[CLS] WFO JAKARTA PSI PSI JAKARTA PROGRAM MAGA...
2,https://instagram.fsub8-2.fna.fbcdn.net/v/t51....,2022-12-26T05:05:33.000Z,2022-12-26_post_lokermagang2_combined.jpg,"WFH FULLTIME Asia Land YES, MUNGKIN INI BUAT K...",0,0,0,0,0,0,0,Not Fraud,Asia Land,PT Jakarta Land,WFH FULLTIME Asia Land YES MUNGKIN INI BUAT KA...,"[[101, 160, 11565, 12396, 143, 100673, 11369, ...",[CLS] WFH FULLTIME Asia Land YES MUNGKIN INI B...
3,https://instagram.fsub8-2.fna.fbcdn.net/v/t51....,2022-12-24T06:54:18.000Z,2022-12-24_post_lokermagang3_image.jpg,HYBRID SEMARANG hercodigital OPEN INTERNSHIBer...,0,0,20,0,0,0,20,Not Fraud,Herco Digital Indonesia,PT Herco Digital Indonesia,HYBRID SEMARANG hercodigital OPEN INTERNSHIBer...,"[[101, 145, 14703, 11274, 46876, 11490, 23056,...",[CLS] HYBRID SEMARANG hercodigital OPEN INTERN...
4,https://instagram.fsub8-1.fna.fbcdn.net/v/t51....,2022-12-22T10:00:46.000Z,2022-12-22_post_lokermagang4_image.jpg,UWFH ONLINE KLTC Anude Jiraorshioifalieaia(iu ...,0,0,0,0,0,0,0,Not Fraud,KLTC,,UWFH ONLINE KLTC Anude Jiraorshioifalieaiaiu 6...,"[[101, 158, 83695, 12396, 49339, 11369, 83198,...",[CLS] UWFH ONLINE KLTC Anude Jiraorshioifaliea...
