Dataset yang digunakan untuk fine-tuning (transfer learning) model IndoBERT terdiri dari 1056 teks dengan proporsi data train:validation 9:1. Pelabelan data dilakukan secara manual dengan bantuan generative AI.

# Modeling

In [89]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoProcessor, TrainingArguments, Trainer, pipeline, DataCollatorForTokenClassification

## Dataset Preparation

In [3]:
# Load the annotated NER data from a tab-separated file and print a summary of
# its columns (the summary below lists two columns: tokens and labels).
df = pd.read_csv('data/NER_DATASET.tsv', sep='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37979 entries, 0 to 37978
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tokens  36923 non-null  object
 1   labels  36923 non-null  object
dtypes: object(2)
memory usage: 593.6+ KB


In [4]:
# Replace missing values with empty strings so that rows where both columns are
# empty can be recognised by the sentence-grouping loop, then show how often
# each label occurs.
df.fillna("", inplace=True)
df['labels'].value_counts()

labels
O             30376
B-ACTION       1758
B-MODUS        1718
               1056
B-PRODUK        705
B-PERSON        469
B-PLATFORM      349
B-NOMINAL       299
I-PRODUK        219
B-LAYANAN       218
I-MODUS         218
I-PERSON        113
B-KONTAK        104
B-REK           101
I-LAYANAN        78
I-NOMINAL        70
I-KONTAK         36
I-REK            34
I-PLATFORM       30
I-ACTION         28
Name: count, dtype: int64

In [5]:
# Rebuild sentences from the flat token table: a row whose token and label are
# both empty closes the sentence collected so far.
all_sentences = []
all_labels = []

tokens = []
ner_tags = []
for token, label in zip(df['tokens'], df['labels']):
    is_separator = token == "" and label == ""
    if not is_separator:
        tokens.append(token)
        ner_tags.append(label)
        continue
    if tokens:
        all_sentences.append(tokens)
        all_labels.append(ner_tags)
        tokens, ner_tags = [], []

# The table may not end with a separator row, so keep whatever is still pending.
if tokens:
    all_sentences.append(tokens)
    all_labels.append(ner_tags)
    tokens, ner_tags = [], []

print(all_sentences[:2])
print(all_labels[:2])

[['sebetulnya', 'bukannya', 'nggak', 'mau', 'usaha', 'tapi', 'banyak', 'penipuan', 'mindset', 'udah', 'bener', 'tapi', 'kalo', 'misalnya', 'tools', 'nya', 'ternyata', 'menuju', 'scam', 'gimana', '?', 'ketipu', 'terus', 'donk'], ['sebelumnya', 'gw', 'juga', 'pernah', 'hampir', 'ketipu', 'via', 'interview', 'palsu', 'di', 'linkedin', '.', 'pernah', 'gw', 'tulis', 'di', 'blog', 'ini', ':', 'dan', 'sekarang', '.', 'kejadian', 'mirip', 'keulang', 'lagi', 'bedanya', 'kali', 'ini', 'gw', 'udah', 'diblok', 'duluan', '.', 'chat', 'penawarannya', 'ialng', 'jir', 'keburu', 'diblok', 'duluan', '2/9']]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MODUS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MODUS', 'O', 'O', 'B-ACTION', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-ACTION', 'O', 'B-MODUS', 'I-MODUS', 'O', 'B-PLATFORM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ACTION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ACTION', 'O', 'O']]


Membuat struktur data sequence:

``` python
all_sentences = [[sentence], [sentence], ... ]
all_labels = [[ner_tags], [ner_tags], ... ]
```

Setiap sequence (kalimat) disimpan sebagai list token, dan tag NER-nya disimpan sebagai list dengan panjang yang sama.

In [11]:
# Build the sorted label vocabulary and the two lookup tables between label
# strings and integer ids.
unique_labels = set()
for tags in all_labels:
    unique_labels.update(tags)

LABEL = sorted(unique_labels)
ID2LABEL = dict(enumerate(LABEL))
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

for table in (LABEL, LABEL2ID, ID2LABEL):
    print(table)

['B-ACTION', 'B-KONTAK', 'B-LAYANAN', 'B-MODUS', 'B-NOMINAL', 'B-PERSON', 'B-PLATFORM', 'B-PRODUK', 'B-REK', 'I-ACTION', 'I-KONTAK', 'I-LAYANAN', 'I-MODUS', 'I-NOMINAL', 'I-PERSON', 'I-PLATFORM', 'I-PRODUK', 'I-REK', 'O']
{'B-ACTION': 0, 'B-KONTAK': 1, 'B-LAYANAN': 2, 'B-MODUS': 3, 'B-NOMINAL': 4, 'B-PERSON': 5, 'B-PLATFORM': 6, 'B-PRODUK': 7, 'B-REK': 8, 'I-ACTION': 9, 'I-KONTAK': 10, 'I-LAYANAN': 11, 'I-MODUS': 12, 'I-NOMINAL': 13, 'I-PERSON': 14, 'I-PLATFORM': 15, 'I-PRODUK': 16, 'I-REK': 17, 'O': 18}
{0: 'B-ACTION', 1: 'B-KONTAK', 2: 'B-LAYANAN', 3: 'B-MODUS', 4: 'B-NOMINAL', 5: 'B-PERSON', 6: 'B-PLATFORM', 7: 'B-PRODUK', 8: 'B-REK', 9: 'I-ACTION', 10: 'I-KONTAK', 11: 'I-LAYANAN', 12: 'I-MODUS', 13: 'I-NOMINAL', 14: 'I-PERSON', 15: 'I-PLATFORM', 16: 'I-PRODUK', 17: 'I-REK', 18: 'O'}


In [12]:
# Encode every tag as its integer id and pair each sentence with its encoded tags.
all_labelids = []
dataset = []
for sentence, tags in zip(all_sentences, all_labels):
    tag_ids = [LABEL2ID[tag] for tag in tags]
    all_labelids.append(tag_ids)
    dataset.append({"sentence": sentence, "ner_tags": tag_ids})

In [13]:
# Hold out 10% of the sentences for validation; the fixed random_state makes
# the split reproducible. The two prints report the size of each split.
train_data, val_data = train_test_split(dataset, test_size=0.1, random_state=42)
print(f"Total data untuk train : {len(train_data)}")
print(f"Total data untuk validation : {len(val_data)}")

Total data untuk train : 950
Total data untuk validation : 106


In [14]:
# Wrap both splits as Hugging Face datasets under the keys "train" and "validation".
ner_dataset = DatasetDict()
ner_dataset["train"] = Dataset.from_list(train_data)
ner_dataset["validation"] = Dataset.from_list(val_data)

## Model Fine Tuning

### Load Model & Tokenizer

In [15]:
# Pretrained IndoBERT checkpoint used as the starting point for fine-tuning.
model_checkpoint = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): `processor` is not referenced anywhere else in the visible
# notebook - confirm whether it is needed.
processor = AutoProcessor.from_pretrained(model_checkpoint)

# Token-classification model with one output class per entry of LABEL; the
# id/label lookup tables are passed along to the model configuration.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(LABEL),
    id2label=ID2LABEL,
    label2id=LABEL2ID
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenizing Dataset

In [16]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tag ids to sub-tokens.

    The tokenizer is asked to return overflowing tokens, so one long sentence
    can produce several overlapping chunks and the result can hold more rows
    than ``examples``.

    Args:
        examples: mapping with "sentence" (lists of words) and "ner_tags"
            (lists of tag ids, one per word).

    Returns:
        The tokenizer output with "offset_mapping" removed and a "labels"
        entry added. In every chunk the first sub-token of a word carries that
        word's tag id; every other position carries -100.
    """
    encoded = tokenizer(
        examples["sentence"],
        is_split_into_words=True,
        truncation=True,
        max_length=250,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    aligned = []
    # Each chunk remembers which input sentence it was cut from.
    for chunk_idx, sample_idx in enumerate(encoded["overflow_to_sample_mapping"]):
        word_ids = encoded.word_ids(batch_index=chunk_idx)
        word_tags = examples["ner_tags"][sample_idx]

        # Pair every position with the word id of the position before it.
        preceding = [None] + word_ids[:-1]
        chunk_labels = []
        for current, before in zip(word_ids, preceding):
            starts_word = current is not None and current != before
            if starts_word and current < len(word_tags):
                chunk_labels.append(word_tags[current])
            else:
                chunk_labels.append(-100)

        aligned.append(chunk_labels)

    encoded.pop("offset_mapping", None)
    encoded["labels"] = aligned

    return encoded

# Tokenize both splits in one call each and wrap the results as datasets again.
tokenized_dataset = DatasetDict({
    'train': Dataset.from_dict(tokenize_and_align_labels(ner_dataset['train'].to_dict())),
    'validation': Dataset.from_dict(tokenize_and_align_labels(ner_dataset['validation'].to_dict()))
})

In [17]:
# Training configuration: 5 epochs on CPU, evaluation and checkpointing after
# every epoch, and the best checkpoint reloaded when training ends.
# NOTE(review): the results table shows "No log" for the training loss;
# presumably the default logging interval is larger than the 310 total steps -
# consider logging_strategy="epoch".
args = TrainingArguments(
    output_dir="./model/ner-indobert-p2",
    learning_rate=2e-5,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    use_cpu=True
)

# Collator used to assemble batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


# compute_metrics is not passed here; it is attached to the trainer in a later
# cell, right before evaluate() is called.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.245284
2,No log,0.171927
3,No log,0.159005
4,No log,0.163453
5,No log,0.162199


TrainOutput(global_step=310, training_loss=0.28681788290700605, metrics={'train_runtime': 2514.0527, 'train_samples_per_second': 1.965, 'train_steps_per_second': 0.123, 'total_flos': 405707188378104.0, 'train_loss': 0.28681788290700605, 'epoch': 5.0})

In [18]:
# Persist the fine-tuned model to the directory that the "Model usage" section loads from.
trainer.save_model("./model/ner-indobert-finetuned")

### Model Evaluation

In [19]:
def compute_metrics(p):
    """Score token-classification predictions with the seqeval metrics.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-position
            class scores (the argmax is taken over axis 2) and ``labels`` the
            reference tag ids, where -100 marks positions to leave out.

    Returns:
        Dict with "precision", "recall", "accuracy", "f1" and "report", the
        last one being the output of ``classification_report``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Positions labelled -100 are dropped from both sequences.
        true_labels.append([ID2LABEL[gold] for gold in gold_row if gold != -100])
        true_predictions.append(
            [ID2LABEL[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        )

    scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "accuracy": accuracy_score,
        "f1": f1_score,
        "report": classification_report,
    }
    return {
        name: scorer(true_labels, true_predictions)
        for name, scorer in scorers.items()
    }

In [20]:
# Attach the metric function after training and evaluate on the validation split.
trainer.compute_metrics = compute_metrics
eval_model = trainer.evaluate()

In [21]:
# Print every evaluation metric name followed by its value.
for metric_name, metric_value in eval_model.items():
    print(metric_name)
    print(metric_value)
    print('\n')

eval_loss
0.1590048372745514


eval_precision
0.7896825396825397


eval_recall
0.805668016194332


eval_accuracy
0.9540736080528468


eval_f1
0.7975951903807615


eval_report
              precision    recall  f1-score   support

      ACTION       0.95      0.93      0.94       151
      KONTAK       0.50      0.80      0.62        10
     LAYANAN       0.71      0.57      0.63        21
       MODUS       0.86      0.86      0.86       165
     NOMINAL       0.81      0.85      0.83        26
      PERSON       0.63      0.53      0.58        32
    PLATFORM       0.65      0.81      0.72        27
      PRODUK       0.49      0.57      0.53        54
         REK       0.57      0.50      0.53         8

   micro avg       0.79      0.81      0.80       494
   macro avg       0.69      0.71      0.69       494
weighted avg       0.80      0.81      0.80       494



eval_runtime
4.7887


eval_samples_per_second
22.135


eval_steps_per_second
1.462


epoch
5.0




# Model usage

In [22]:
# Reload the tokenizer and the fine-tuned model from the saved directory.
model_path = "./model/ner-indobert-finetuned"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

In [23]:
# Load the cleaned texts to run extraction on (columns per the summary below:
# waktu and clean_text).
df_teks = pd.read_csv('data/clean_datatext.csv')
df_teks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005 entries, 0 to 3004
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   waktu       3005 non-null   object
 1   clean_text  3005 non-null   object
dtypes: object(2)
memory usage: 47.1+ KB


In [24]:
# Expose the row index as an explicit text_ID column and store the character
# length of every text.
df_teks = df_teks.reset_index().rename(columns={'index': 'text_ID'})
df_teks['len_text'] = df_teks['clean_text'].str.len()
df_teks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3005 entries, 0 to 3004
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text_ID     3005 non-null   int64 
 1   waktu       3005 non-null   object
 2   clean_text  3005 non-null   object
 3   len_text    3005 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 94.0+ KB


## Building pipeline

### Exploration

In [26]:
# Pick the text with the largest len_text as a worst-case example.
long_idx = df_teks['len_text'].argmax()
# Column position 2 is clean_text according to the df_teks.info() output above.
longest_text = df_teks.iloc[long_idx, 2]
print(longest_text)

(sebelumnya mohon maaf kalau ada kata-kata saya yg kasar, mengumpat dan gak enak dibaca) temen-temen yg merasa dirugikan materi, boleh dm saya untuk mencatat nominal kerugiannya. sudah terdata 42 orang korban. saya nyesel enggak speak up dan cari tau tentanguser-11350117449701790883dari dulu. ada urusan piutang sejak bulan agustus sebesar 925rb. tapi karena saya ga enakan dan memaklumi segala musibah dia, pokoknyauser-11350117449701790883ini jago banget gaslighting, seolah-olah bikin saya merasa bersalah dan ga sabaran karena terus-terusan nagih uangnya. barusan saya udah investigasi. anjay investigasi bikin saya lemes bgt ga nyangkauser-11350117449701790883bener-benerseorang penipu.semua keluarga yg ada di ceritanya dia alhamdulillahsemuanya masih hidup.ada beberapa bukti yg saya foto dan saya ngobrol sama keluarga tetangganya. untuk temen-temen yg udah tertipu sama dia, jangan harap uangnya balik lagi. ikhlasin aja dan doain rame-rame semoga kagiles treuk, gak deng, doain aja. terser

In [27]:
# Tokenize the longest text with overflow enabled (max_length 250, stride 128),
# returned as padded PyTorch tensors, then list the fields that come back.
tokenized_inputs = tokenizer(
    longest_text,
    truncation=True,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    max_length=250,
    stride=128,
    return_tensors='pt',
    padding=True
)

dict(tokenized_inputs).keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [28]:
# Run the model on all chunks of the text at once.
# NOTE(review): the call is not wrapped in torch.no_grad(); presumably gradient
# tracking is not needed for inference here - confirm and consider disabling it.
pred_model = model(
    input_ids = tokenized_inputs['input_ids'],
    token_type_ids = tokenized_inputs['token_type_ids'],
    attention_mask = tokenized_inputs['attention_mask'],
)

In [34]:
# Shape of the tokenized input (the output below shows 27 rows of 250 positions).
tokenized_inputs['input_ids'].size()

torch.Size([27, 250])

In [31]:
# Shape of the logits (the output below shows a last dimension of 19, matching the 19 labels).
pred_model['logits'].size()

torch.Size([27, 250, 19])

In [52]:
# Pick the highest-scoring label id for every position.
pred_logits = pred_model['logits']
pred_ID = pred_logits.argmax(dim=2)

In [59]:
# Inspect the raw predictions of one chunk: collect every sub-token whose
# predicted tag is not 'O', grouped by entity type (the tag without its
# two-character B-/I- prefix). Sub-tokens are NOT merged into words here.
batch_inp = 0
sentence_exm = tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'][batch_inp])
idlabel_exm = pred_ID[batch_inp]
word_id = tokenized_inputs.word_ids(batch_inp)
word_mapping = tokenized_inputs['offset_mapping'][batch_inp]


dct_temp = {
        'ACTION':[],
        'KONTAK':[],
        'LAYANAN':[],
        'MODUS':[],
        'NOMINAL':[],
        'PERSON':[],
        'PLATFORM':[],
        'PRODUK':[],
        'REK':[]
    }

for w, idlbl in zip(sentence_exm, idlabel_exm):
    lbl = ID2LABEL[int(idlbl)]
    # Compare the label name instead of the hard-coded id 18, so the filter
    # stays correct if the label vocabulary (and with it the id of 'O') changes.
    if lbl != 'O':
        dct_temp[lbl[2:]].append(w)

dct_temp

{'ACTION': ['mengu',
  'dirugikan',
  'dm',
  'mencatat',
  'terd',
  'spe',
  'cari',
  'nag',
  '##ih',
  'investigasi',
  'balik',
  'tr',
  'lapor'],
 'KONTAK': ['113',
  '##117',
  '##44',
  '##97',
  '##01',
  '##79',
  '##08',
  '113',
  '##50',
  '##117',
  '##44',
  '##97',
  '##01',
  '##79',
  '##08',
  '-',
  '113',
  '##50',
  '##117',
  '##44',
  '##97',
  '##01',
  '##79',
  '##08',
  '##02'],
 'LAYANAN': [],
 'MODUS': ['piutang', 'gas', '##light', '##ing', 'penipu', 'tertipu'],
 'NOMINAL': ['nominal',
  'kerugian',
  '92',
  '##5',
  '##rb',
  'uangnya',
  'uangnya',
  '9000'],
 'PERSON': ['##user', '##83', 'pokoknya', '##user', '##83'],
 'PLATFORM': [],
 'PRODUK': [],
 'REK': ['##50', '##83', 'rekening', 'mandiri']}

In [63]:
def merge_token(list_token: list) -> tuple:
    """Join the WordPiece sub-tokens of one word and pick the word's label.

    Args:
        list_token: ``(token, label)`` pairs of a single word, in reading order.

    Returns:
        ``(word, label)``: the tokens concatenated with their leading ``##``
        continuation marker removed, and the first label that is not 'O'
        ('O' when every label is 'O' or the list is empty).
    """
    pieces = []
    label = 'O'
    for item in list_token:
        token, token_label = item[0], item[1]
        # Only a leading '##' marks a continuation piece; '##' appearing later
        # in a token is part of the text and is kept.
        pieces.append(token[2:] if token.startswith('##') else token)
        if label == 'O' and token_label != 'O':
            label = token_label

    return (''.join(pieces), label)

In [None]:
# Prototype of the word-level extraction for a single chunk: sub-tokens that
# share a word id are merged back into one word before being stored.
dct_temp = {
        'ACTION':[],
        'KONTAK':[],
        'LAYANAN':[],
        'MODUS':[],
        'NOMINAL':[],
        'PERSON':[],
        'PLATFORM':[],
        'PRODUK':[],
        'REK':[]
    }

lastid_batch = -1  # tokens whose word id is below this value are skipped
past_wid = None  # word id of the word currently being assembled
list_to_merge = []  # (token, label) pairs of that word
is_added = False  # True while the word's first token sits in dct_temp on its own
past_added_lbl = None  # entity type under which that first token was stored
for w, idlbl, wid, wmap in zip(sentence_exm, idlabel_exm, word_id, word_mapping):
    # Tokens without a word id are ignored.
    if wid == None:
        continue
    elif wid < lastid_batch:
        continue
    else:
        lastid_batch = wid

    lbl = ID2LABEL[int(idlbl)]
    if wid == past_wid:
        # Continuation of the current word: queue it and withdraw the first
        # token that was stored on its own, because the merged word replaces it.
        list_to_merge.append((w, lbl))
        if is_added:
            dct_temp[past_added_lbl].pop()
            is_added = False
        continue
    else:
        # A new word starts: store the previous word if it had several tokens.
        if len(list_to_merge) > 1:
            wpast, lblpast = merge_token(list_to_merge)
            if lblpast != 'O':
                dct_temp[lblpast[2:]].append(wpast)
        past_wid = wid
        list_to_merge = [(w, lbl)]


    # Store the first token of the new word right away when it is an entity.
    if lbl != 'O':
        is_added = True
        dct_temp[lbl[2:]].append(w)
        past_added_lbl = lbl[2:]
    else:
        is_added = False

# NOTE(review): a multi-token word still queued when the loop ends is never
# merged or stored.
dct_temp

{'ACTION': ['mengumpat',
  'dirugikan',
  'dm',
  'mencatat',
  'terdata',
  'speak',
  'cari',
  'nagih',
  'investigasi',
  'balik',
  'treuk',
  'laporin'],
 'KONTAK': ['11350117449701790883dari',
  '11350117449701790883ini',
  '-',
  '11350117449701790883bener'],
 'LAYANAN': [],
 'MODUS': ['piutang', 'gaslighting', 'penipu', 'tertipu'],
 'NOMINAL': ['nominal', 'kerugiannya', '925rb', 'uangnya', 'uangnya'],
 'PERSON': ['tentanguser', 'pokoknyauser'],
 'PLATFORM': [],
 'PRODUK': [],
 'REK': ['rekening', 'mandiri']}

In [72]:
# Fresh, empty result container: one independent list per entity type.
dct_temp = {
    entity: []
    for entity in (
        'ACTION', 'KONTAK', 'LAYANAN', 'MODUS', 'NOMINAL',
        'PERSON', 'PLATFORM', 'PRODUK', 'REK',
    )
}

In [81]:
def extractionbatch_result(dct_container, words_list, idlabel, word_id, word_mapping, lastid_batch):
    """Collect the entity words predicted for one tokenized chunk.

    Sub-tokens that share a word id are merged back into one word with
    ``merge_token``. When the merged label is not 'O', the word is appended to
    ``dct_container`` under its entity type (the label without its
    two-character ``B-``/``I-`` prefix).

    Args:
        dct_container: dict mapping entity type to a list of words; updated in place.
        words_list: token strings of the chunk.
        idlabel: predicted label ids, aligned with ``words_list``.
        word_id: word index per token; ``None`` entries are skipped.
        word_mapping: not used; kept so that existing callers keep working.
        lastid_batch: tokens whose word id is below this value are skipped.
            Pass -1 for the first chunk and the previous return value afterwards.

    Returns:
        The id right after the last word handled here, or ``lastid_batch``
        unchanged when no word was handled. Feeding it into the next call keeps
        overlapping chunks from storing the same word twice.
    """
    def _store(parts):
        # Emit one completed word (single-token words take the same path).
        if not parts:
            return
        word, label = merge_token(parts)
        if label != 'O':
            dct_container[label[2:]].append(word)

    first_new_wid = lastid_batch
    current_wid = None
    current_parts = []
    for w, idlbl, wid in zip(words_list, idlabel, word_id):
        if wid is None or wid < first_new_wid:
            continue
        if wid != current_wid:
            _store(current_parts)
            current_wid = wid
            current_parts = []
        current_parts.append((w, ID2LABEL[int(idlbl)]))

    # The word still being assembled when the chunk ends must be stored too;
    # previously a multi-token word in that position was silently dropped.
    _store(current_parts)

    if current_wid is None:
        return lastid_batch
    # NOTE(review): a word cut in two by the chunk boundary is stored from its
    # truncated tokens; the complete copy in the next chunk is then skipped.
    return current_wid + 1

In [82]:
def extract_prediction(pred_model, tokenized_inputs):
    """Gather the predicted entity words of one text from all of its chunks.

    Args:
        pred_model: model output exposing ``'logits'``; the argmax over
            dimension 2 gives one label id per token position.
        tokenized_inputs: tokenizer output for the same text, providing
            ``'input_ids'``, ``'offset_mapping'`` and ``word_ids()``.

    Returns:
        Dict mapping each entity type to the list of extracted words.
    """
    dct_temp = {
            'ACTION':[],
            'KONTAK':[],
            'LAYANAN':[],
            'MODUS':[],
            'NOMINAL':[],
            'PERSON':[],
            'PLATFORM':[],
            'PRODUK':[],
            'REK':[]
        }

    pred_logits = pred_model['logits']
    pred_ID = pred_logits.argmax(dim=2)

    lastid_batch = -1
    # Walk through every chunk. The index used to be reset to 0 inside the
    # loop, so only the first chunk of a long text was ever read.
    for batch_inp in range(pred_ID.size()[0]):
        wordslist = tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'][batch_inp])
        idlabel = pred_ID[batch_inp]
        word_id = tokenized_inputs.word_ids(batch_inp)
        word_mapping = tokenized_inputs['offset_mapping'][batch_inp]

        # The returned id tells the next chunk where to resume.
        lastid_batch = extractionbatch_result(
            dct_container=dct_temp,
            words_list=wordslist,
            idlabel=idlabel,
            word_id=word_id,
            word_mapping=word_mapping,
            lastid_batch=lastid_batch
        )

    return dct_temp

## Full NER extraction

In [91]:
import torch

# One output row per text: its id plus, for every entity type, the extracted
# words joined with ' | '.
dct_extraction = {
    'text_ID':[],
    'ACTION':[],
    'KONTAK':[],
    'LAYANAN':[],
    'MODUS':[],
    'NOMINAL':[],
    'PERSON':[],
    'PLATFORM':[],
    'PRODUK':[],
    'REK':[]
}

# Address the columns by name rather than by position, so the loop keeps
# working if the column order of df_teks changes.
texts = zip(df_teks['text_ID'], df_teks['clean_text'])
for text_id, text in tqdm(texts, total=len(df_teks)):
    dct_extraction['text_ID'].append(text_id)
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        max_length=250,
        stride=128,
        return_tensors='pt',
        padding=True
    )
    # Inference only: with gradient tracking disabled no autograd graph is
    # built for the forward pass, which saves memory and time.
    with torch.no_grad():
        pred_model = model(
            input_ids=tokenized_inputs['input_ids'],
            token_type_ids=tokenized_inputs['token_type_ids'],
            attention_mask=tokenized_inputs['attention_mask'],
        )
    result_extraction = extract_prediction(pred_model, tokenized_inputs)

    for key, val in result_extraction.items():
        dct_extraction[key].append(' | '.join(val))

df_extraction = pd.DataFrame.from_dict(dct_extraction)
df_extraction

100%|██████████| 3005/3005 [06:32<00:00,  7.66it/s] 


Unnamed: 0,text_ID,ACTION,KONTAK,LAYANAN,MODUS,NOMINAL,PERSON,PLATFORM,PRODUK,REK
0,0,,nomer,,,,,,,
1,1,,,jual | beli,scam | skema,,,,mobil,
2,2,,,,,,,,,
3,3,,,,,,,,,
4,4,,,,love | scamming | internasional,,,,,
...,...,...,...,...,...,...,...,...,...,...
3000,3000,ketipu,,,scam,,cindy | cindy | cindy,,,
3001,3001,beli | wts | ketipu | beli,,,scam,,,,cat | cat | 2 | tiketnya | cat | 2,
3002,3002,ketipu,,,scam,,,x,,
3003,3003,beli | ketipu,,,penipuan | scam,jutaan,,,baju,


In [88]:
# Write the extraction table as a tab-separated file without the DataFrame index.
df_extraction.to_csv('data/extraction_result.tsv', sep='\t', index= False)