# character identification

In [1]:
import torch
print(torch.cuda.is_available())                     # True
print(torch.version.__version__)                     # Torch version (e.g., 2.6.0)
print(torch.version.cuda)                            # CUDA version
print(torch.cuda.get_device_name(0))                 # Your GPU name


True
2.6.0+cu126
12.6
NVIDIA GeForce RTX 4070 Laptop GPU


In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
import torch

# Load annotated dataset
df = pd.read_csv("even_semi_annotated.csv")

# Only keep relevant columns
df = df[['story_id', 'judul', 'sentence_id', 'word', 'TYPE2']]


  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# --- Count total number of sentences ---

n_sentences = df[['story_id', 'sentence_id']].drop_duplicates().shape[0]

print(f"Total number of sentences: {n_sentences}")


Total number of sentences: 1157


In [4]:
# --- Mean number of sentences per story ---

sentences_per_story = df[['story_id', 'sentence_id']].drop_duplicates().groupby('story_id').size()
mean_sentences_per_story = sentences_per_story.mean()

print(f"Mean number of sentences per story: {mean_sentences_per_story:.2f}")


Mean number of sentences per story: 115.70


In [5]:
# Convert labels to unique integers
unique_labels = df['TYPE2'].unique().tolist()
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {v: k for k, v in label2id.items()}

# Create list of sentences (grouped by sentence_id)
grouped = df.groupby(['story_id', 'sentence_id'])
sentences = []
labels = []
story_ids = []

for (story_id, _), group in grouped:
    word_list = group['word'].tolist()
    label_list = group['TYPE2'].map(label2id).tolist()
    sentences.append(word_list)
    labels.append(label_list)
    story_ids.append(story_id)


In [6]:
# Create HuggingFace dataset
dataset = Dataset.from_dict({
    'tokens': sentences,
    'ner_tags': labels,
    'story_id': story_ids
})

# Train/test split
dataset = dataset.train_test_split(test_size=0.2)

# Load tokenizer
model_checkpoint = "cahya/bert-base-indonesian-1.5G"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        is_split_into_words=True
    )

    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        new_labels = []
        current_label = None

        for word_idx in word_ids:
            if word_idx is None:
                new_labels.append(-100)
            else:
                label = labels[word_idx]
                label_name = id2label[label]

                # Start of a new word
                if word_idx != current_label:
                    current_label = word_idx
                    new_labels.append(label)
                # Continuation of the same word
                else:
                    # Convert B-PER to I-PER for subwords
                    if label_name == "B-PER":
                        new_labels.append(label2id["I-PER"])
                    else:
                        new_labels.append(label)

        all_labels.append(new_labels)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

In [8]:
# Apply preprocessing
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Load model
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id)


Map:   0%|          | 0/925 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 925/925 [00:00<00:00, 15421.28 examples/s]
Map: 100%|██████████| 232/232 [00:00<00:00, 16977.73 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at cahya/bert-base-indonesian-1.5G and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    pred_labels = []

    for true, pred in zip(labels, predictions):
        true_seq = []
        pred_seq = []
        for t, p in zip(true, pred):
            if t != -100:
                true_seq.append(id2label[t])
                pred_seq.append(id2label[p])
        true_labels.append(true_seq)
        pred_labels.append(pred_seq)

    return {
        "accuracy": accuracy_score(true_labels, pred_labels),
        "precision": precision_score(true_labels, pred_labels),
        "recall": recall_score(true_labels, pred_labels),
        "f1": f1_score(true_labels, pred_labels),
    }


In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.062691,0.977824,0.847896,0.882155,0.864686
2,No log,0.055506,0.980126,0.87541,0.89899,0.887043
3,No log,0.073246,0.978033,0.883162,0.86532,0.87415
4,No log,0.076871,0.98159,0.896667,0.905724,0.901173
5,No log,0.092698,0.981381,0.89899,0.89899,0.89899
6,No log,0.093831,0.982427,0.895082,0.919192,0.906977
7,No log,0.094119,0.981799,0.88961,0.922559,0.905785
8,No log,0.101568,0.982008,0.891447,0.912458,0.90183
9,0.045000,0.104151,0.98159,0.879085,0.905724,0.892206
10,0.045000,0.104119,0.982218,0.879479,0.909091,0.89404


TrainOutput(global_step=580, training_loss=0.03917294784352697, metrics={'train_runtime': 207.3518, 'train_samples_per_second': 44.61, 'train_steps_per_second': 2.797, 'total_flos': 525733286343660.0, 'train_loss': 0.03917294784352697, 'epoch': 10.0})

In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.078114,0.974005,0.851441
2,No log,0.072988,0.977953,0.8844
3,No log,0.078335,0.978776,0.886937
4,No log,0.078946,0.982396,0.916951
5,No log,0.0844,0.981573,0.902439
6,No log,0.085543,0.983054,0.915825
7,No log,0.092043,0.981573,0.905077
8,No log,0.093117,0.982231,0.907263
9,0.044300,0.093897,0.98256,0.909497
10,0.044300,0.094984,0.982396,0.909699




TrainOutput(global_step=580, training_loss=0.038465827600709324, metrics={'train_runtime': 209.5451, 'train_samples_per_second': 44.143, 'train_steps_per_second': 2.768, 'total_flos': 489342824583408.0, 'train_loss': 0.038465827600709324, 'epoch': 10.0})

In [10]:
# Save final model
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")

('./ner_model\\tokenizer_config.json',
 './ner_model\\special_tokens_map.json',
 './ner_model\\vocab.txt',
 './ner_model\\added_tokens.json',
 './ner_model\\tokenizer.json')

In [None]:
from transformers import TokenClassificationPipeline
from collections import defaultdict

# Setup pipeline
ner_pipe = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",  # Merge subwords like B-PER + I-PER
    device=0 if torch.cuda.is_available() else -1
)

# Clean token (remove subword markers like "##")
def clean_word(word):
    return word.replace("##", "").strip()

# Final character storage
storywise_characters = defaultdict(set)

# Process each test sentence
for story_id, tokens in zip(dataset["test"]["story_id"], dataset["test"]["tokens"]):
    sentence = " ".join(tokens)
    preds = ner_pipe(sentence)

    entity_buffer = []
    prev_entity_type = None

    for pred in preds:
        label = pred["entity_group"]
        word = clean_word(pred["word"])

        if label.startswith("PER"):
            # Flush previous entity buffer
            if entity_buffer:
                name = tokenizer.convert_tokens_to_string(entity_buffer).strip().title()
                if name:
                    storywise_characters[story_id].add(name)
            entity_buffer = [word]
        else:
            if entity_buffer:
                name = tokenizer.convert_tokens_to_string(entity_buffer).strip().title()
                if name:
                    storywise_characters[story_id].add(name)
                entity_buffer = []

        prev_entity_type = label

    # Final flush
    if entity_buffer:
        name = tokenizer.convert_tokens_to_string(entity_buffer).strip().title()
        if name:
            storywise_characters[story_id].add(name)

# ✅ Print final result
for story, chars in storywise_characters.items():
    print(f"Story ID: {story}\nCharacters: {sorted(chars)}\n")


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Story ID: 10
Characters: ['- Dayang', 'Anak Raja', 'Ayahandamu', 'Budak', 'Bunda', 'Ibundanya', 'Isterinya', 'Kakandaku', 'Kepala Rumah Tangga', 'Kunal', 'Menteri', 'Naga Sorai', 'Nagai Sori', 'Orang Menteri', 'Pembesar Kerajaannya', 'Pengawal Rombongan', 'Permaisuri', 'Permaisuri Haroharo', 'Permaisurinya', 'Putera Haroharo', 'Puteri', 'Puteri Raja Margolang', 'Puterinya', 'Raja', 'Raja Haroharo', 'Raja Margolang', 'Rakyat', 'Rakyat Raja Margolang', 'Sipakpak Kunal', 'Tamunya', 'Tukang Dayung']

Story ID: 87
Characters: ['Laki - Laki', 'Laki - Laki Belulusan Ulakh', 'Penduduk Di', 'Sahabat Lain', 'Teman', 'Ular', 'Usun']

Story ID: 26
Characters: ['Abang', 'Adik Ipar Laki - Lakinya', 'Adik Iparnya', 'Dik', 'Istri', 'Istrinya', 'Nusa', 'Warga']

Story ID: 25
Characters: ['Anaknya', 'Ayah', 'Beberi', 'Burung', 'Dewi Anjani', 'Doyan Nada', 'Ibunya', 'Limandaru', 'Manusia', 'Orang Tuanya', 'Pertapa', 'Prajuritnya', 'Raksasa']

Story ID: 16
Characters: ['Anak Semata Wayangnya', 'Ayah', 'Ib

In [12]:
import pandas as pd

# Assuming this mapping exists from earlier
story_id_to_title = df.groupby('story_id')['judul'].first().to_dict()

# Flatten the results into one row per character
char_data_flat = {
    "story_id": [],
    "judul": [],
    "character": []
}

for story_id, characters in storywise_characters.items():
    for character in sorted(characters):
        char_data_flat["story_id"].append(story_id)
        char_data_flat["judul"].append(story_id_to_title.get(story_id, "Unknown"))
        char_data_flat["character"].append(character)

# Convert to DataFrame
df_flat = pd.DataFrame(char_data_flat)

# Show it
df_flat


Unnamed: 0,story_id,judul,character
0,10,Sipakpak Kunal Dan Nagai Sori,- Dayang
1,10,Sipakpak Kunal Dan Nagai Sori,Anak Raja
2,10,Sipakpak Kunal Dan Nagai Sori,Ayahandamu
3,10,Sipakpak Kunal Dan Nagai Sori,Budak
4,10,Sipakpak Kunal Dan Nagai Sori,Bunda
...,...,...,...
162,23,Rawang Tengkuluk,Ibunya
163,23,Rawang Tengkuluk,Mak Siyah
164,23,Rawang Tengkuluk,Siti
165,23,Rawang Tengkuluk,Suaminya


## comparison

In [13]:
from collections import defaultdict

ground_truth_characters = defaultdict(list)

for story_id, group in df[df['TYPE2'].isin(['B-PER', 'I-PER'])].groupby(['story_id', 'sentence_id']):
    current_entity = []
    prev_type = None

    for _, row in group.iterrows():
        label = row['TYPE2']
        word = row['word']

        if label == 'B-PER':
            if current_entity:
                full_name = " ".join(current_entity).strip().title()
                ground_truth_characters[story_id[0]].append(full_name)
            current_entity = [word]
        elif label == 'I-PER':
            if prev_type in ['B-PER', 'I-PER']:
                current_entity.append(word)
            else:
                if current_entity:
                    full_name = " ".join(current_entity).strip().title()
                    ground_truth_characters[story_id[0]].append(full_name)
                current_entity = []

        prev_type = label

    # Flush last entity
    if current_entity:
        full_name = " ".join(current_entity).strip().title()
        ground_truth_characters[story_id[0]].append(full_name)


In [14]:
for story_id in sorted(set(list(ground_truth_characters.keys()) + list(storywise_characters.keys()))):
    gt_set = set(ground_truth_characters.get(story_id, []))
    pred_set = set(storywise_characters.get(story_id, []))
    
    print(f"\n📚 Story ID: {story_id}")
    print(f"🟢 Ground Truth Characters: {sorted(gt_set)}")
    print(f"🔵 Predicted Characters:     {sorted(pred_set)}")
    
    matched = gt_set & pred_set
    missed = gt_set - pred_set
    extra = pred_set - gt_set

    print(f"✅ Matched: {sorted(matched)}")
    print(f"❌ Missed:  {sorted(missed)}")
    print(f"🌀 Extra:   {sorted(extra)}")



📚 Story ID: 7
🟢 Ground Truth Characters: ['Abang', 'Abangku', 'Abangnya', 'Adik', 'Adiknya', 'Ahmad', 'Anak Dara', 'Anak Laki Laki', 'Anak Muda', 'Anak Perempuan', 'Ayah', 'Ayahanda', 'Ayahandanya', 'Ayahmu', 'Ayahnya', 'Baginda Raja', 'Bang', 'Bu', 'Burung', 'Burung Kesayangannya', 'Burung Merbuk', 'Burung Merbuknya', 'Burung Rajawali', 'Datuk Bendahara', 'Gajah Gajah', 'Gajah Putih', 'Ibu', 'Ibu Ahmad', 'Ibumu', 'Ibunya', 'Isterinya', 'Istri', 'Mak', 'Mak Inang', 'Merbuk', 'Muhammad', 'Orang Muda', 'Orang Tua', 'Pak Ahmad', 'Pawang Merbuk', 'Pembantu', 'Pembantunya', 'Pemuda', 'Pengawal', 'Pengawal Istana', 'Pengawal Raja', 'Perdana Menteri', 'Permaisuri', 'Puteranya', 'Puteri', 'Puteri Bungsu', 'Puteri Sulung', 'Putri Bungsu', 'Raja', 'Rajawali', 'Rakyat', 'Rakyatnya', 'Suami', 'Tuan Puteri', 'Tuan Puteri Bungsu', 'Wak', 'Wak Pawang', 'Wak Pawang Merbuk', 'Yang Maha Kuasa']
🔵 Predicted Characters:     ['Abang', 'Abang Beradik', 'Abangnya', 'Adik', 'Adiknya', 'Ahmad', 'Anak Muda', '

In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score

def precision_recall_f1(y_true, y_pred):
    true_set = set(y_true)
    pred_set = set(y_pred)
    intersection = true_set & pred_set

    precision = len(intersection) / len(pred_set) if pred_set else 0.0
    recall = len(intersection) / len(true_set) if true_set else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

    return precision, recall, f1


In [16]:
eval_data = {
    "story_id": [],
    "judul": [],
    "ground_truth": [],
    "predicted": [],
    "matched": [],
    "precision": [],
    "recall": [],
    "f1_score": []
}

for story_id in sorted(set(ground_truth_characters) | set(storywise_characters)):
    gt = ground_truth_characters.get(story_id, [])
    pred = list(storywise_characters.get(story_id, set()))
    matched = list(set(gt) & set(pred))

    p, r, f1 = precision_recall_f1(gt, pred)

    eval_data["story_id"].append(story_id)
    eval_data["judul"].append(story_id_to_title.get(story_id, "Unknown"))
    eval_data["ground_truth"].append(len(gt))
    eval_data["predicted"].append(len(pred))
    eval_data["matched"].append(len(matched))
    eval_data["precision"].append(round(p, 3))
    eval_data["recall"].append(round(r, 3))
    eval_data["f1_score"].append(round(f1, 3))

df_eval = pd.DataFrame(eval_data)
print(df_eval)


   story_id                          judul  ground_truth  predicted  matched  \
0         7             Tuah Burung Merbak           538         48       40   
1        10  Sipakpak Kunal Dan Nagai Sori           217         31       24   
2        16           Asal Usul Laut Tador            93          8        7   
3        23               Rawang Tengkuluk           170          8        8   
4        25               Kisah Doyan Nada           164         13       11   
5        26           Asal Mula Pulau Nusa           139          8        7   
6        35                  Banta Seudang           191         19       18   
7        87                  Tanjung Siman            38          7        1   
8       109                        Ine Ude           155          9        8   
9       111                 Mentiko Betuah           173         16       15   

   precision  recall  f1_score  
0      0.833   0.625     0.714  
1      0.774   0.343     0.475  
2      0.875   0.368

# pseudo-labelling

## pseudo labelling round 1

In [24]:
import pandas as pd
from transformers import TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification
from collections import defaultdict

# Load and group the unlabeled data
df_unlabeled = pd.read_csv("../preprocessing/cerita_rakyat_tokenized_clean.csv")

# Group into sentences
grouped = df_unlabeled.groupby(['story_id', 'sentence_id'])
sentence_map = []
for (story_id, sentence_id), group in grouped:
    words = group['word'].astype(str).tolist()
    sentence_map.append((story_id, sentence_id, words))

# Load fine-tuned model
model = AutoModelForTokenClassification.from_pretrained("./ner_model")
tokenizer = AutoTokenizer.from_pretrained("./ner_model")

pseudo_pipe = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

# Store sentence-level characters
sentence_character_data = []

# Store confidence scores by story
story_confidence_data = defaultdict(list)

for story_id, sentence_id, tokens in sentence_map:
    sentence_text = " ".join(tokens)
    preds = pseudo_pipe(sentence_text)

    char_names = set()

    for pred in preds:
        if pred["entity_group"] == "PER" and pred["score"] >= 0.96:
            char_name = pred["word"].replace("##", "").strip().title()
            if char_name and len(char_name) > 1 and not char_name.isspace():
                char_names.add(char_name)
                story_confidence_data[story_id].append(pred["score"])

    if char_names:
        sentence_character_data.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "characters": list(char_names)
        })

# Save sentence-level character mentions
df_chars = pd.DataFrame(sentence_character_data)
df_chars.to_csv("pseudo_characters_by_sentence.csv", index=False)

# Build story-level confidence summary
story_conf_stats = []
for story_id, confs in story_confidence_data.items():
    story_conf_stats.append({
        "story_id": story_id,
        "avg_confidence": round(sum(confs) / len(confs), 4),
        "min_confidence": round(min(confs), 4),
        "max_confidence": round(max(confs), 4),
        "mention_count": len(confs)
    })

df_conf = pd.DataFrame(story_conf_stats).sort_values("avg_confidence", ascending=False)
df_conf.to_csv("confidence_per_story.csv", index=False)

print(f"✅ Saved {len(df_chars)} rows of sentence-level character mentions.")
print(f"📊 Saved confidence stats for {len(df_conf)} stories.")


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


✅ Saved 7706 rows of sentence-level character mentions.
📊 Saved confidence stats for 222 stories.


In [25]:
import pandas as pd

# Load the sentence-level character data
df = pd.read_csv("pseudo_characters_by_sentence.csv")

# Count unique story IDs
num_stories = df["story_id"].nunique()

print(f"📖 Number of unique stories with character predictions: {num_stories}")


📖 Number of unique stories with character predictions: 222


In [1]:
import pandas as pd
import torch
from transformers import TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification
from collections import defaultdict

# === Parameters ===
confidence_threshold = 0.95  # Set your threshold (95%)

# === Step 1: Load Unlabeled Data ===
df_unlabeled = pd.read_csv("../preprocessing/cerita_rakyat_tokenized_clean.csv")

# Group into sentences
grouped = df_unlabeled.groupby(['story_id', 'sentence_id'])
sentence_map = []
for (story_id, sentence_id), group in grouped:
    words = group['word'].astype(str).tolist()
    sentence_map.append((story_id, sentence_id, words))

# === Step 2: Load Model ===
model = AutoModelForTokenClassification.from_pretrained("./ner_model")
tokenizer = AutoTokenizer.from_pretrained("./ner_model")

pseudo_pipe = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

# === Step 3: Pseudo-Labeling ===
high_conf_pseudo_labels = []  # Will be used to augment training data
leftover_sentences = []       # Sentences that need reprocessing later

for story_id, sentence_id, tokens in sentence_map:
    sentence_text = " ".join(tokens)
    preds = pseudo_pipe(sentence_text)

    char_names = []
    char_confidences = []
    low_conf = False

    for pred in preds:
        if pred["entity_group"] == "PER":
            if pred["score"] >= confidence_threshold:
                char_name = pred["word"].replace("##", "").strip().title()
                if char_name and len(char_name) > 1 and not char_name.isspace():
                    char_names.append(char_name)
                    char_confidences.append(round(pred["score"], 4))
            else:
                low_conf = True

    if char_names:
        high_conf_pseudo_labels.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "characters": list(set(char_names)),
            "confidences": char_confidences
        })

    # If any low confidence detected, add sentence to leftovers
    if low_conf:
        leftover_sentences.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "tokens": tokens
        })

# === Step 4: Save Results ===

# Save high confidence pseudo labels
df_high_conf = pd.DataFrame(high_conf_pseudo_labels)
df_high_conf.to_csv("pseudo_characters_high_confidence.csv", index=False)

# Save leftover sentences for next round
df_leftover = pd.DataFrame(leftover_sentences)
df_leftover.to_csv("leftover_sentences_for_next_round.csv", index=False)

print(f"✅ High-confidence pseudo-labeled sentences: {len(df_high_conf)}")
print(f"🔄 Sentences saved for next pseudo-labeling round: {len(df_leftover)}")


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ High-confidence pseudo-labeled sentences: 7905
🔄 Sentences saved for next pseudo-labeling round: 3378


## psedo labelling round 2

In [4]:
import pandas as pd

# Load original sentence-tokenized data
df_tokens = pd.read_csv("../preprocessing/cerita_rakyat_tokenized_clean.csv")

# Load sentence-level pseudo labels
df_pseudo = pd.read_csv("pseudo_characters_high_confidence.csv")
df_pseudo['characters'] = df_pseudo['characters'].apply(eval)  # Convert from string to list

# Merge pseudo labels with the token-level data
joined_ids = set(zip(df_pseudo['story_id'], df_pseudo['sentence_id']))
df_matched = df_tokens[df_tokens.apply(lambda row: (row['story_id'], row['sentence_id']) in joined_ids, axis=1)].copy()

# Merge character labels into each sentence row
df_merged = df_matched.merge(df_pseudo[['story_id', 'sentence_id', 'characters']], on=['story_id', 'sentence_id'], how='left')

# Function to assign B-PER / I-PER / O
def label_token(word, char_list):
    word = str(word).lower()  # <- force word to be string first
    for char in char_list:
        char_tokens = char.lower().split()
        if word == char_tokens[0]:
            return 'B-PER' if len(char_tokens) == 1 else 'B-PER'
        elif word in char_tokens:
            return 'I-PER'
    return 'O'

# Apply labeling
df_merged['TYPE2'] = df_merged.apply(lambda row: label_token(row['word'], row['characters']), axis=1)

# Drop extras
df_pseudo_labeled = df_merged[['story_id', 'sentence_id', 'word', 'TYPE2']]

# Save as pseudo training data
df_pseudo_labeled.to_csv("pseudo_word_level_labeled_v1.csv", index=False)


In [5]:
# Load original manually labeled dataset
df_manual = pd.read_csv("even_semi_annotated.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]

# Load pseudo-labeled data
df_pseudo = pd.read_csv("pseudo_word_level_labeled_v1.csv")

# Combine both
df_train = pd.concat([df_manual, df_pseudo], ignore_index=True)

# Save final training set
df_train.to_csv("training_data_for_model_v2.csv", index=False)


In [10]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# === Step 1: Load combined dataset ===
df = pd.read_csv("training_data_for_model_v2.csv")
df = df[['story_id', 'sentence_id', 'word', 'TYPE2']]

# Force all words to be string and no NaN
df['word'] = df['word'].fillna('').astype(str)

# Build label mappings
labels = sorted(df['TYPE2'].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Assign token-level fields
df['ner_tags'] = df['TYPE2'].map(label2id)
df = df[['story_id', 'sentence_id', 'word', 'ner_tags']]

# === Step 2: Group by sentences ===
grouped = df.groupby(['story_id', 'sentence_id'])
sentences = []
tags = []

for _, group in grouped:
    sentences.append(group['word'].tolist())
    tags.append(group['ner_tags'].tolist())

dataset = Dataset.from_dict({
    'tokens': sentences,
    'ner_tags': tags
})


# === Step 3: Train/test split ===
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# === Step 4: Tokenizer and Model ===
model_checkpoint = "cahya/bert-base-indonesian-1.5G"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# === Step 5: Tokenize and Align Labels ===
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    labels = []

    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# === Step 6: Metric computation ===
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    true_labels = [[id2label[label] for label in sent if label != -100] for sent in labels]
    true_preds = [[id2label[pred] for (pred, label) in zip(sent_pred, sent_label) if label != -100] for sent_pred, sent_label in zip(preds, labels)]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }

# === Step 7: Training Arguments ===
training_args = TrainingArguments(
    output_dir="./ner_model_v2",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs_v2",
    logging_steps=100,
    report_to="tensorboard"
)

# === Step 8: Trainer Setup ===
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# === Step 9: Train ===
trainer.train()

# Save final model
trainer.save_model("./ner_model_v2")
tokenizer.save_pretrained("./ner_model_v2")

print("✅ Training complete! Model saved at ./ner_model_v2")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at cahya/bert-base-indonesian-1.5G and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map:   0%|          | 0/6817 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 6817/6817 [00:00<00:00, 13452.17 examples/s]
Map: 100%|██████████| 1705/1705 [00:00<00:00, 22336.82 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0716,0.074433,0.867597,0.927267,0.89644
2,0.0434,0.068128,0.914024,0.912355,0.913189
3,0.0272,0.078789,0.889567,0.931528,0.910064
4,0.0135,0.093686,0.907757,0.922398,0.915019
5,0.0125,0.09504,0.909861,0.915399,0.912621
6,0.0076,0.104345,0.908434,0.917833,0.913109
7,0.0049,0.121911,0.907835,0.920268,0.914009
8,0.0041,0.120726,0.918268,0.916312,0.917289
9,0.0026,0.128447,0.915285,0.92392,0.919582
10,0.001,0.134376,0.910998,0.925137,0.918013




✅ Training complete! Model saved at ./ner_model_v2


In [11]:
import pandas as pd
import torch
from transformers import TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification

# === Step 1: Load model v2 ===
model = AutoModelForTokenClassification.from_pretrained("./ner_model_v2")
tokenizer = AutoTokenizer.from_pretrained("./ner_model_v2")

pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

# === Step 2: Load leftovers ===
df_leftover = pd.read_csv("leftover_sentences_for_next_round.csv")
df_leftover['tokens'] = df_leftover['tokens'].apply(eval)  # Convert from string to list

# === Step 3: Pseudo-labeling round 2 ===
high_conf_pseudo_labels = []
still_low_conf = []

confidence_threshold = 0.95

for _, row in df_leftover.iterrows():
    story_id = row['story_id']
    sentence_id = row['sentence_id']
    tokens = row['tokens']
    sentence = " ".join(tokens)

    preds = pipeline(sentence)

    char_names = []
    confidences = []
    low_conf = False

    for pred in preds:
        if pred["entity_group"] == "PER":
            score = pred["score"]
            word = pred["word"].replace("##", "").strip().title()

            if score >= confidence_threshold:
                if word and len(word) > 1 and not word.isspace():
                    char_names.append(word)
                    confidences.append(round(score, 4))
            else:
                low_conf = True

    if char_names:
        high_conf_pseudo_labels.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "characters": list(set(char_names)),
            "confidences": confidences
        })

    if low_conf:
        still_low_conf.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "tokens": tokens
        })

# === Step 4: Save outputs ===
pd.DataFrame(high_conf_pseudo_labels).to_csv("pseudo_characters_round2.csv", index=False)
pd.DataFrame(still_low_conf).to_csv("leftover_sentences_for_next_round2.csv", index=False)

print(f"✅ Round 2 pseudo-labeled sentences: {len(high_conf_pseudo_labels)}")
print(f"🔄 Still left for round 3: {len(still_low_conf)}")


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ Round 2 pseudo-labeled sentences: 2842
🔄 Still left for round 3: 367


In [12]:
import pandas as pd

# Load both rounds
df_r1 = pd.read_csv("pseudo_characters_high_confidence.csv")
df_r2 = pd.read_csv("pseudo_characters_round2.csv")

# Convert character/confidences to lists if needed
df_r1['characters'] = df_r1['characters'].apply(eval)
df_r2['characters'] = df_r2['characters'].apply(eval)
df_r1['confidences'] = df_r1['confidences'].apply(eval)
df_r2['confidences'] = df_r2['confidences'].apply(eval)

# Concatenate both
df_combined = pd.concat([df_r1, df_r2], ignore_index=True)

# Drop any duplicates (just in case same sentence appeared in both rounds)
df_combined = df_combined.drop_duplicates(subset=['story_id', 'sentence_id'])

# Save merged result
df_combined.to_csv("pseudo_characters_combined_round1_2.csv", index=False)

# Count total
print(f"✅ Total pseudo-labeled sentences after round 2: {len(df_combined)}")


✅ Total pseudo-labeled sentences after round 2: 9013


## pseudo labelling round 3

In [13]:
import pandas as pd

# Load sentence-tokenized full dataset
df_tokens = pd.read_csv("../preprocessing/cerita_rakyat_tokenized_clean.csv")

# Load pseudo-labeled sentence-level characters
df_chars = pd.read_csv("pseudo_characters_combined_round1_2.csv")
df_chars['characters'] = df_chars['characters'].apply(eval)

# Filter token rows that match labeled sentences
matched_ids = set(zip(df_chars['story_id'], df_chars['sentence_id']))
df_matched = df_tokens[df_tokens.apply(lambda row: (row['story_id'], row['sentence_id']) in matched_ids, axis=1)].copy()

# Merge character names to each token row
df_merge = df_matched.merge(df_chars[['story_id', 'sentence_id', 'characters']], on=['story_id', 'sentence_id'], how='left')

# Clean up possible float words
df_merge['word'] = df_merge['word'].fillna('').astype(str)

# Labeling function: B-PER / I-PER / O
def label_token(word, char_list):
    word = word.lower()
    for char in char_list:
        tokens = char.lower().split()
        if word == tokens[0]:
            return 'B-PER' if len(tokens) == 1 else 'B-PER'
        elif word in tokens:
            return 'I-PER'
    return 'O'

df_merge['TYPE2'] = df_merge.apply(lambda row: label_token(row['word'], row['characters']), axis=1)

# Save word-level pseudo-labeled data
df_pseudo = df_merge[['story_id', 'sentence_id', 'word', 'TYPE2']]
df_pseudo.to_csv("pseudo_word_level_labeled_combined_round1_2.csv", index=False)


In [14]:
# Load original manually labeled data
df_manual = pd.read_csv("even_semi_annotated.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]

# Load new pseudo-labeled tokens
df_pseudo = pd.read_csv("pseudo_word_level_labeled_combined_round1_2.csv")

# Combine for training model v3
df_train = pd.concat([df_manual, df_pseudo], ignore_index=True)
df_train.to_csv("training_data_for_model_v3.csv", index=False)

print("✅ Training dataset for ner_model_v3 is ready!")


✅ Training dataset for ner_model_v3 is ready!


In [16]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# === Step 1: Load & preprocess ===
df = pd.read_csv("training_data_for_model_v3.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]
df['word'] = df['word'].fillna('').astype(str)

# Label mapping
labels = sorted(df['TYPE2'].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Map TYPE2 to numeric
df['ner_tags'] = df['TYPE2'].map(label2id)

# Group into sentences
grouped = df.groupby(['story_id', 'sentence_id'])
sentences = []
tags = []

for _, group in grouped:
    sentences.append(group['word'].tolist())
    tags.append(group['ner_tags'].tolist())

# HuggingFace Dataset
dataset = Dataset.from_dict({
    'tokens': sentences,
    'ner_tags': tags
})

# 80/20 split
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# === Step 2: Tokenizer and Model ===
checkpoint = "cahya/bert-base-indonesian-1.5G"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# === Step 3: Tokenize and align labels ===
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# === Step 4: Metrics ===
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    true_labels = [[id2label[l] for l in sent if l != -100] for sent in labels]
    true_preds = [[id2label[p] for p, l in zip(sent_pred, sent_label) if l != -100]
                  for sent_pred, sent_label in zip(preds, labels)]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }

# === Step 5: TrainingArguments ===
training_args = TrainingArguments(
    output_dir="./ner_model_v3",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs_v3",
    logging_steps=100,
    report_to="tensorboard"
)

# === Step 6: Trainer ===
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# === Step 7: Train ===
trainer.train()

# Save final model
trainer.save_model("./ner_model_v3")
tokenizer.save_pretrained("./ner_model_v3")

print("✅ Training complete! Model saved to ./ner_model_v3 🎉")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at cahya/bert-base-indonesian-1.5G and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map:   0%|          | 0/7674 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 7674/7674 [00:00<00:00, 10637.09 examples/s]
Map: 100%|██████████| 1919/1919 [00:00<00:00, 10685.27 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0829,0.0672,0.880402,0.884709,0.88255
2,0.0529,0.069445,0.855508,0.914373,0.883962
3,0.0301,0.074701,0.873874,0.919572,0.896141
4,0.0203,0.090075,0.882248,0.907339,0.894618
5,0.0099,0.119955,0.870968,0.924771,0.897063
6,0.008,0.112612,0.882109,0.910703,0.896178
7,0.0058,0.121673,0.880562,0.919878,0.899791
8,0.0027,0.134505,0.882146,0.920183,0.900763
9,0.0017,0.140003,0.886586,0.915596,0.900858
10,0.0011,0.146722,0.877361,0.923242,0.899717


✅ Training complete! Model saved to ./ner_model_v3 🎉


In [17]:
import pandas as pd
import torch
from transformers import TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification

# === Step 1: Load ner_model_v3 ===
model = AutoModelForTokenClassification.from_pretrained("./ner_model_v3")
tokenizer = AutoTokenizer.from_pretrained("./ner_model_v3")

pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",  # merge subwords nicely
    device=0 if torch.cuda.is_available() else -1
)

# === Step 2: Load leftover sentences ===
df_leftover = pd.read_csv("leftover_sentences_for_next_round2.csv")
df_leftover['tokens'] = df_leftover['tokens'].apply(eval)  # convert string to list

# === Step 3: Pseudo-labeling leftover round 3 ===
confidence_threshold = 0.95

high_conf_pseudo_labels = []
still_low_conf = []

for _, row in df_leftover.iterrows():
    story_id = row['story_id']
    sentence_id = row['sentence_id']
    tokens = row['tokens']
    sentence = " ".join(tokens)

    preds = pipeline(sentence)

    char_names = []
    confidences = []
    low_conf = False

    for pred in preds:
        if pred["entity_group"] == "PER":
            score = pred["score"]
            word = pred["word"].replace("##", "").strip().title()

            if score >= confidence_threshold:
                if word and len(word) > 1 and not word.isspace():
                    char_names.append(word)
                    confidences.append(round(score, 4))
            else:
                low_conf = True

    if char_names:
        high_conf_pseudo_labels.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "characters": list(set(char_names)),
            "confidences": confidences
        })

    if low_conf:
        still_low_conf.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "tokens": tokens
        })

# === Step 4: Save results ===
pd.DataFrame(high_conf_pseudo_labels).to_csv("pseudo_characters_round3.csv", index=False)
pd.DataFrame(still_low_conf).to_csv("leftover_sentences_for_next_round3.csv", index=False)

print(f"✅ Round 3 pseudo-labeled sentences: {len(high_conf_pseudo_labels)}")
print(f"🔄 Still left for round 4: {len(still_low_conf)}")


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


✅ Round 3 pseudo-labeled sentences: 257
🔄 Still left for round 4: 58


In [18]:
import pandas as pd

# Load all rounds
df_r1 = pd.read_csv("pseudo_characters_high_confidence.csv")
df_r2 = pd.read_csv("pseudo_characters_round2.csv")
df_r3 = pd.read_csv("pseudo_characters_round3.csv")

# Ensure lists are evaluated properly
for df in [df_r1, df_r2, df_r3]:
    df['characters'] = df['characters'].apply(eval)
    df['confidences'] = df['confidences'].apply(eval)

# Merge
df_all = pd.concat([df_r1, df_r2, df_r3], ignore_index=True)

# Drop duplicate sentences just in case
df_all = df_all.drop_duplicates(subset=['story_id', 'sentence_id'])

# Save combined result
df_all.to_csv("pseudo_characters_combined_round1_2_3.csv", index=False)

print(f"✅ Total pseudo-labeled sentences after round 3: {len(df_all)}")


✅ Total pseudo-labeled sentences after round 3: 9113


## pseudo labelling round 4

In [19]:
import pandas as pd

# Load tokenized dataset and pseudo characters
df_tokens = pd.read_csv("../preprocessing/cerita_rakyat_tokenized_clean.csv")
df_chars = pd.read_csv("pseudo_characters_combined_round1_2_3.csv")

df_chars['characters'] = df_chars['characters'].apply(eval)

# Match sentences
matched_ids = set(zip(df_chars['story_id'], df_chars['sentence_id']))
df_matched = df_tokens[df_tokens.apply(lambda row: (row['story_id'], row['sentence_id']) in matched_ids, axis=1)].copy()

# Merge character info into matched tokens
df_merge = df_matched.merge(df_chars[['story_id', 'sentence_id', 'characters']], on=['story_id', 'sentence_id'], how='left')

df_merge['word'] = df_merge['word'].fillna('').astype(str)

# Label each token with B-PER/I-PER/O
def label_token(word, char_list):
    word = word.lower()
    for char in char_list:
        tokens = char.lower().split()
        if word == tokens[0]:
            return 'B-PER' if len(tokens) == 1 else 'B-PER'
        elif word in tokens:
            return 'I-PER'
    return 'O'

df_merge['TYPE2'] = df_merge.apply(lambda row: label_token(row['word'], row['characters']), axis=1)

# Save token-level pseudo labels
df_pseudo_tokens = df_merge[['story_id', 'sentence_id', 'word', 'TYPE2']]
df_pseudo_tokens.to_csv("pseudo_word_level_labeled_round1_2_3.csv", index=False)

print("✅ Token-level pseudo labels saved for ner_model_v4")


✅ Token-level pseudo labels saved for ner_model_v4


In [20]:
# Load manual labels
df_manual = pd.read_csv("even_semi_annotated.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]

# Load pseudo labels
df_pseudo = pd.read_csv("pseudo_word_level_labeled_round1_2_3.csv")

# Merge them
df_final = pd.concat([df_manual, df_pseudo], ignore_index=True)
df_final.to_csv("training_data_for_model_v4.csv", index=False)

print("✅ Training data ready for ner_model_v4 🎉")


✅ Training data ready for ner_model_v4 🎉


## pseudo labelling round 4

In [24]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# === Step 1: Load training dataset ===
df = pd.read_csv("training_data_for_model_v4.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]
df['word'] = df['word'].fillna('').astype(str)

# Build label mappings
labels = sorted(df['TYPE2'].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Map TYPE2 to numeric
df['ner_tags'] = df['TYPE2'].map(label2id)

# Group into sentences
grouped = df.groupby(['story_id', 'sentence_id'])
sentences = []
tags = []

for _, group in grouped:
    sentences.append(group['word'].tolist())
    tags.append(group['ner_tags'].tolist())

# HuggingFace Dataset
dataset = Dataset.from_dict({
    'tokens': sentences,
    'ner_tags': tags
})

# Train/test split (80/20)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# === Step 2: Tokenizer and Model ===
checkpoint = "cahya/bert-base-indonesian-1.5G"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# === Step 3: Tokenize and align labels ===
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# === Step 4: Metric Computation ===
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    true_labels = [[id2label[label] for label in sent if label != -100] for sent in labels]
    true_preds = [[id2label[pred] for (pred, label) in zip(sent_pred, sent_label) if label != -100]
                  for sent_pred, sent_label in zip(preds, labels)]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }

# === Step 5: TrainingArguments ===
training_args = TrainingArguments(
    output_dir="./ner_model_v4",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs_v4",
    logging_steps=100,
    report_to="tensorboard"
)

# 💬 Add Early Stopping
from transformers import EarlyStoppingCallback
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))

# === Step 6: Trainer ===
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# === Step 7: Start Training ===
trainer.train()

# Save final model
trainer.save_model("./ner_model_v4")
tokenizer.save_pretrained("./ner_model_v4")

print("✅ Training complete! Model saved at ./ner_model_v4 🎉")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at cahya/bert-base-indonesian-1.5G and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map:   0%|          | 0/7752 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 7752/7752 [00:00<00:00, 10566.21 examples/s]
Map: 100%|██████████| 1939/1939 [00:00<00:00, 10821.64 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0826,0.073729,0.859521,0.903038,0.880742
2,0.0538,0.075625,0.871487,0.91347,0.891985
3,0.0344,0.087268,0.871362,0.918687,0.894399
4,0.0178,0.100282,0.877521,0.907947,0.892475
5,0.0138,0.107603,0.884719,0.904265,0.894385
6,0.007,0.133624,0.866955,0.921755,0.893516
7,0.005,0.135506,0.86769,0.919607,0.892894
8,0.0042,0.137684,0.875734,0.914698,0.894792
9,0.0021,0.147369,0.865263,0.920221,0.891896
10,0.0011,0.146127,0.870537,0.920221,0.89469


✅ Training complete! Model saved at ./ner_model_v4 🎉


In [27]:
import pandas as pd
import torch
from transformers import TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification

# === Step 1: Load ner_model_v4 ===
model = AutoModelForTokenClassification.from_pretrained("./ner_model_v4")
tokenizer = AutoTokenizer.from_pretrained("./ner_model_v4")

pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

# === Step 2: Load leftover sentences ===
df_leftover = pd.read_csv("leftover_sentences_for_next_round3.csv")
df_leftover['tokens'] = df_leftover['tokens'].apply(eval)  # Convert from string to list

# === Step 3: Pseudo-labeling leftover round 4 ===
confidence_threshold = 0.95

high_conf_pseudo_labels = []
still_low_conf = []

for _, row in df_leftover.iterrows():
    story_id = row['story_id']
    sentence_id = row['sentence_id']
    tokens = row['tokens']
    sentence = " ".join(tokens)

    preds = pipeline(sentence)

    char_names = []
    confidences = []
    low_conf = False

    for pred in preds:
        if pred["entity_group"] == "PER":
            score = pred["score"]
            word = pred["word"].replace("##", "").strip().title()

            if score >= confidence_threshold:
                if word and len(word) > 1 and not word.isspace():
                    char_names.append(word)
                    confidences.append(round(score, 4))
            else:
                low_conf = True

    if char_names:
        high_conf_pseudo_labels.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "characters": list(set(char_names)),
            "confidences": confidences
        })

    if low_conf:
        still_low_conf.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "tokens": tokens
        })

# === Step 4: Save results ===
pd.DataFrame(high_conf_pseudo_labels).to_csv("pseudo_characters_round4.csv", index=False)
pd.DataFrame(still_low_conf).to_csv("leftover_sentences_for_next_round4.csv", index=False)

print(f"✅ Round 4 pseudo-labeled sentences: {len(high_conf_pseudo_labels)}")
print(f"🔄 Still left for round 5: {len(still_low_conf)}")


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


✅ Round 4 pseudo-labeled sentences: 36
🔄 Still left for round 5: 21


In [28]:
import pandas as pd

# Load all pseudo-labeled rounds
df_r1 = pd.read_csv("pseudo_characters_high_confidence.csv")
df_r2 = pd.read_csv("pseudo_characters_round2.csv")
df_r3 = pd.read_csv("pseudo_characters_round3.csv")
df_r4 = pd.read_csv("pseudo_characters_round4.csv")

# Ensure 'characters' and 'confidences' columns are real lists
for df in [df_r1, df_r2, df_r3, df_r4]:
    df['characters'] = df['characters'].apply(eval)
    df['confidences'] = df['confidences'].apply(eval)

# Merge everything
df_combined = pd.concat([df_r1, df_r2, df_r3, df_r4], ignore_index=True)

# Drop duplicates (in case same sentence was predicted in multiple rounds)
df_combined = df_combined.drop_duplicates(subset=['story_id', 'sentence_id'])

# Save merged output
df_combined.to_csv("pseudo_characters_combined_round1_2_3_4.csv", index=False)

# Show summary
print(f"✅ Total final pseudo-labeled sentences after round 4: {len(df_combined)}")


✅ Total final pseudo-labeled sentences after round 4: 9128


## model 4 rerun

In [30]:
import pandas as pd
import torch
from transformers import TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification

# === Step 1: Load ner_model_v4 ===
model = AutoModelForTokenClassification.from_pretrained("./ner_model_v4")
tokenizer = AutoTokenizer.from_pretrained("./ner_model_v4")

pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

# === Step 2: Load leftovers ===
df_leftover = pd.read_csv("leftover_sentences_for_next_round3.csv")
df_leftover['tokens'] = df_leftover['tokens'].apply(eval)  # Convert from string to list

# === Step 3: Pseudo-label properly ===
confidence_threshold = 0.95

high_conf_pseudo_labels = []
still_low_conf = []

for _, row in df_leftover.iterrows():
    story_id = row['story_id']
    sentence_id = row['sentence_id']
    tokens = row['tokens']
    sentence = " ".join(tokens)

    preds = pipeline(sentence)

    char_names = []
    confidences = []

    for pred in preds:
        if pred["entity_group"] == "PER":
            score = pred["score"]
            word = pred["word"].replace("##", "").strip().title()

            if score >= confidence_threshold:
                if word and len(word) > 1 and not word.isspace():
                    char_names.append(word)
                    confidences.append(round(score, 4))

    # === New fixed logic:
    if char_names:
        high_conf_pseudo_labels.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "characters": list(set(char_names)),
            "confidences": confidences
        })
    else:
        # Always keep unconfident or empty sentences for next round
        still_low_conf.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "tokens": tokens
        })

# === Step 4: Save outputs ===
pd.DataFrame(high_conf_pseudo_labels).to_csv("pseudo_characters_round4_fixed.csv", index=False)
pd.DataFrame(still_low_conf).to_csv("leftover_sentences_for_next_round4_fixed.csv", index=False)

print(f"✅ Round 4 (fixed) pseudo-labeled sentences: {len(high_conf_pseudo_labels)}")
print(f"🔄 Still leftover for round 5: {len(still_low_conf)}")


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


✅ Round 4 (fixed) pseudo-labeled sentences: 36
🔄 Still leftover for round 5: 22


In [31]:
import pandas as pd

# Load all pseudo-labeled rounds
df_r1 = pd.read_csv("pseudo_characters_high_confidence.csv")
df_r2 = pd.read_csv("pseudo_characters_round2.csv")
df_r3 = pd.read_csv("pseudo_characters_round3.csv")
df_r4_fixed = pd.read_csv("pseudo_characters_round4_fixed.csv")  # <<< use the FIXED one!

# Ensure 'characters' and 'confidences' are real lists
for df in [df_r1, df_r2, df_r3, df_r4_fixed]:
    df['characters'] = df['characters'].apply(eval)
    df['confidences'] = df['confidences'].apply(eval)

# Merge everything
df_all = pd.concat([df_r1, df_r2, df_r3, df_r4_fixed], ignore_index=True)

# Drop duplicates (same story_id + sentence_id)
df_all = df_all.drop_duplicates(subset=['story_id', 'sentence_id'])

# Save final combined file
df_all.to_csv("pseudo_characters_combined_round1_2_3_4_fixed.csv", index=False)

# Show total
print(f"✅ Final pseudo-labeled sentences (after fix): {len(df_all)}")


✅ Final pseudo-labeled sentences (after fix): 9128


In [50]:
import pandas as pd

# Load the CSV
df = pd.read_csv("pseudo_characters_combined_round1_2_3_4_fixed.csv")

# Count unique (story_id, sentence_id) pairs
unique_sentences = df[['story_id', 'sentence_id']].drop_duplicates()
total_sentences = unique_sentences.shape[0]

print(f"✅ Total unique sentences: {total_sentences}")


✅ Total unique sentences: 9128


In [9]:
import pandas as pd

# Load the CSV
df = pd.read_csv("pseudo_characters_combined_round1_2_3_4.csv")

# Count unique (story_id, sentence_id) pairs
unique_sentences = df[['story_id', 'sentence_id']].drop_duplicates()
total_sentences = unique_sentences.shape[0]

print(f"✅ Total unique sentences: {total_sentences}")


✅ Total unique sentences: 9128


## pseudo labelling round 5

In [32]:
import pandas as pd

# === Step 1: Load tokenized clean text and combined pseudo labels ===
df_tokens = pd.read_csv("../preprocessing/cerita_rakyat_tokenized_clean.csv")
df_chars = pd.read_csv("pseudo_characters_combined_round1_2_3_4_fixed.csv")

# Ensure characters are lists
df_chars['characters'] = df_chars['characters'].apply(eval)

# === Step 2: Match sentences ===
matched_ids = set(zip(df_chars['story_id'], df_chars['sentence_id']))
df_matched = df_tokens[df_tokens.apply(lambda row: (row['story_id'], row['sentence_id']) in matched_ids, axis=1)].copy()

# Merge character info into tokens
df_merge = df_matched.merge(df_chars[['story_id', 'sentence_id', 'characters']], on=['story_id', 'sentence_id'], how='left')

# Clean up NaNs
df_merge['word'] = df_merge['word'].fillna('').astype(str)

# === Step 3: Assign B-PER, I-PER, or O ===
def label_token(word, char_list):
    word = word.lower()
    for char in char_list:
        tokens = char.lower().split()
        if word == tokens[0]:
            return 'B-PER' if len(tokens) == 1 else 'B-PER'
        elif word in tokens:
            return 'I-PER'
    return 'O'

df_merge['TYPE2'] = df_merge.apply(lambda row: label_token(row['word'], row['characters']), axis=1)

# === Step 4: Save the word-level pseudo labeled dataset ===
df_pseudo_tokens = df_merge[['story_id', 'sentence_id', 'word', 'TYPE2']]
df_pseudo_tokens.to_csv("pseudo_word_level_labeled_round1_2_3_4_fixed.csv", index=False)

print(f"✅ Token-level pseudo-labeled data saved as pseudo_word_level_labeled_round1_2_3_4_fixed.csv")


✅ Token-level pseudo-labeled data saved as pseudo_word_level_labeled_round1_2_3_4_fixed.csv


In [33]:
import pandas as pd

# Load manual annotated data
df_manual = pd.read_csv("even_semi_annotated.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]

# Load full pseudo-labeled token data
df_pseudo = pd.read_csv("pseudo_word_level_labeled_round1_2_3_4_fixed.csv")

# Merge together
df_final = pd.concat([df_manual, df_pseudo], ignore_index=True)

# Save final merged training dataset
df_final.to_csv("training_data_for_model_v5.csv", index=False)

print(f"✅ Final training dataset for ner_model_v5 is ready! Total rows: {len(df_final)}")


✅ Final training dataset for ner_model_v5 is ready! Total rows: 154532


In [35]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# === Step 1: Load the dataset ===
df = pd.read_csv("training_data_for_model_v5.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]
df['word'] = df['word'].fillna('').astype(str)

# Build label mappings
labels = sorted(df['TYPE2'].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Map TYPE2 to numeric
df['ner_tags'] = df['TYPE2'].map(label2id)

# Group into sentences
grouped = df.groupby(['story_id', 'sentence_id'])
sentences = []
tags = []

for _, group in grouped:
    sentences.append(group['word'].tolist())
    tags.append(group['ner_tags'].tolist())

dataset = Dataset.from_dict({
    'tokens': sentences,
    'ner_tags': tags
})

# 80/20 split
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# === Step 2: Load model and tokenizer ===
checkpoint = "cahya/bert-base-indonesian-1.5G"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# === Step 3: Tokenize and align labels ===
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# === Step 4: Metrics ===
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    true_labels = [[id2label[label] for label in sent if label != -100] for sent in labels]
    true_preds = [[id2label[pred] for (pred, label) in zip(sent_pred, sent_label) if label != -100]
                  for sent_pred, sent_label in zip(preds, labels)]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }

# === Step 5: TrainingArguments ===
training_args = TrainingArguments(
    output_dir="./ner_model_v5",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs_v5",
    logging_steps=100,
    report_to="tensorboard"
)

# === Step 6: Trainer ===
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Early stopping (optional but recommended!)
from transformers import EarlyStoppingCallback
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))

# === Step 7: Start Training ===
trainer.train()

# Save model
trainer.save_model("./ner_model_v5")
tokenizer.save_pretrained("./ner_model_v5")

print("✅ Training complete! Model saved at ./ner_model_v5 🎉")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at cahya/bert-base-indonesian-1.5G and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map:   0%|          | 0/7764 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 7764/7764 [00:00<00:00, 10520.87 examples/s]
Map: 100%|██████████| 1942/1942 [00:00<00:00, 10698.77 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0861,0.065725,0.896552,0.891473,0.894005
2,0.0506,0.074815,0.876579,0.889386,0.882936
3,0.0281,0.073273,0.890737,0.926058,0.908054
4,0.0212,0.078921,0.892744,0.920692,0.906502
5,0.0122,0.093736,0.902696,0.918306,0.910435
6,0.01,0.093422,0.908821,0.912343,0.910579
7,0.0048,0.107302,0.907149,0.911747,0.909442
8,0.0037,0.106827,0.900748,0.933512,0.916837
9,0.0021,0.116154,0.897711,0.923673,0.910507
10,0.0024,0.117777,0.895887,0.928742,0.912019




✅ Training complete! Model saved at ./ner_model_v5 🎉


In [36]:
import pandas as pd
import torch
from transformers import TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification

# === Step 1: Load ner_model_v5 ===
model = AutoModelForTokenClassification.from_pretrained("./ner_model_v5")
tokenizer = AutoTokenizer.from_pretrained("./ner_model_v5")

pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

# === Step 2: Load leftover sentences ===
df_leftover = pd.read_csv("leftover_sentences_for_next_round4_fixed.csv")
df_leftover['tokens'] = df_leftover['tokens'].apply(eval)  # Convert from string to list

# === Step 3: Pseudo-labeling round 5 ===
confidence_threshold = 0.95

high_conf_pseudo_labels = []
still_low_conf = []

for _, row in df_leftover.iterrows():
    story_id = row['story_id']
    sentence_id = row['sentence_id']
    tokens = row['tokens']
    sentence = " ".join(tokens)

    preds = pipeline(sentence)

    char_names = []
    confidences = []

    for pred in preds:
        if pred["entity_group"] == "PER":
            score = pred["score"]
            word = pred["word"].replace("##", "").strip().title()

            if score >= confidence_threshold:
                if word and len(word) > 1 and not word.isspace():
                    char_names.append(word)
                    confidences.append(round(score, 4))

    if char_names:
        high_conf_pseudo_labels.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "characters": list(set(char_names)),
            "confidences": confidences
        })
    else:
        # Always keep leftovers even if nothing detected
        still_low_conf.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "tokens": tokens
        })

# === Step 4: Save results ===
pd.DataFrame(high_conf_pseudo_labels).to_csv("pseudo_characters_round5.csv", index=False)
pd.DataFrame(still_low_conf).to_csv("leftover_sentences_for_next_round5.csv", index=False)

print(f"✅ Round 5 pseudo-labeled sentences: {len(high_conf_pseudo_labels)}")
print(f"🔄 Still leftover for round 6: {len(still_low_conf)}")


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


✅ Round 5 pseudo-labeled sentences: 6
🔄 Still leftover for round 6: 16


In [37]:
import pandas as pd

# Load your pseudo-word-level file
df = pd.read_csv("pseudo_word_level_labeled_round1_2_3_4_fixed.csv")

# === Step 1: Define targets globally ===
phrases_to_fix = {
    ("Beru", "Tandang", "Karo"): ["B-PER", "I-PER", "I-PER"],
    ("Beru", "Tandang", "Meriah"): ["B-PER", "I-PER", "I-PER"]
}

# === Step 2: Go through all story_id = 13 ===
for sentence_id in df[df['story_id'] == 13]['sentence_id'].unique():
    mask = (df['story_id'] == 13) & (df['sentence_id'] == sentence_id)
    words = df[mask]['word'].tolist()

    for phrase, labels in phrases_to_fix.items():
        phrase_length = len(phrase)

        for idx in range(len(words) - phrase_length + 1):
            if tuple(words[idx:idx+phrase_length]) == phrase:
                for j in range(phrase_length):
                    df_idx = df[mask].index[idx+j]
                    df.at[df_idx, 'TYPE2'] = labels[j]
                print(f"✅ Fixed phrase {phrase} in sentence {sentence_id}")

# === Step 3: Save new corrected file ===
df.to_csv("pseudo_word_level_labeled_round1_2_3_4_fixed_corrected.csv", index=False)
print("✅ Full correction saved to pseudo_word_level_labeled_round1_2_3_4_fixed_corrected.csv")


✅ Fixed phrase ('Beru', 'Tandang', 'Karo') in sentence 14
✅ Fixed phrase ('Beru', 'Tandang', 'Karo') in sentence 31
✅ Fixed phrase ('Beru', 'Tandang', 'Meriah') in sentence 31
✅ Fixed phrase ('Beru', 'Tandang', 'Karo') in sentence 58
✅ Fixed phrase ('Beru', 'Tandang', 'Karo') in sentence 89
✅ Full correction saved to pseudo_word_level_labeled_round1_2_3_4_fixed_corrected.csv


## pseudo labelling v5 part2

In [40]:
import pandas as pd

# Load manual annotated data
df_manual = pd.read_csv("even_semi_annotated.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]

# Load corrected pseudo-labeled data
df_pseudo = pd.read_csv("pseudo_word_level_labeled_round1_2_3_4_fixed_corrected.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]

# Merge them together
df_final = pd.concat([df_manual, df_pseudo], ignore_index=True)

# Save final training file
df_final.to_csv("training_data_for_model_v5.csv", index=False)

print(f"✅ Final training dataset created: {len(df_final)} rows.")


✅ Final training dataset created: 154532 rows.


In [41]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# === Step 1: Load your final dataset ===
df = pd.read_csv("training_data_for_model_v5.csv")[['story_id', 'sentence_id', 'word', 'TYPE2']]
df['word'] = df['word'].fillna('').astype(str)

# Build label mappings
labels = sorted(df['TYPE2'].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Map TYPE2 to numeric
df['ner_tags'] = df['TYPE2'].map(label2id)

# Group into sentences
grouped = df.groupby(['story_id', 'sentence_id'])
sentences = []
tags = []

for _, group in grouped:
    sentences.append(group['word'].tolist())
    tags.append(group['ner_tags'].tolist())

dataset = Dataset.from_dict({
    'tokens': sentences,
    'ner_tags': tags
})

# Train/test split (80/20)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# === Step 2: Load model and tokenizer ===
checkpoint = "cahya/bert-base-indonesian-1.5G"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# === Step 3: Tokenize and align labels ===
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# === Step 4: Metrics ===
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    true_labels = [[id2label[label] for label in sent if label != -100] for sent in labels]
    true_preds = [[id2label[pred] for (pred, label) in zip(sent_pred, sent_label) if label != -100]
                  for sent_pred, sent_label in zip(preds, labels)]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }

# === Step 5: TrainingArguments ===
training_args = TrainingArguments(
    output_dir="./ner_model_v5",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs_v5",
    logging_steps=100,
    report_to="tensorboard"
)

# === Step 6: Trainer setup ===
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Early stopping (recommended)
from transformers import EarlyStoppingCallback
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))

# === Step 7: Start training ===
trainer.train()

# Save model
trainer.save_model("./ner_model_v5")
tokenizer.save_pretrained("./ner_model_v5")

print("✅ Training complete! Model saved at ./ner_model_v5 🎉")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at cahya/bert-base-indonesian-1.5G and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map:   0%|          | 0/7764 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 7764/7764 [00:00<00:00, 10120.09 examples/s]
Map: 100%|██████████| 1942/1942 [00:00<00:00, 10484.77 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0865,0.070019,0.892463,0.867998,0.88006
2,0.0499,0.072435,0.888688,0.877831,0.883226
3,0.027,0.073406,0.896329,0.909416,0.902825
4,0.0192,0.081731,0.897003,0.918653,0.907699
5,0.0134,0.091615,0.889813,0.924017,0.906593
6,0.0082,0.098325,0.888186,0.920739,0.90417


✅ Training complete! Model saved at ./ner_model_v5 🎉


In [42]:
import pandas as pd
import torch
from transformers import TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification

# === Step 1: Load ner_model_v5 ===
model = AutoModelForTokenClassification.from_pretrained("./ner_model_v5")
tokenizer = AutoTokenizer.from_pretrained("./ner_model_v5")

pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

# === Step 2: Load leftovers ===
df_leftover = pd.read_csv("leftover_sentences_for_next_round4_fixed.csv")
df_leftover['tokens'] = df_leftover['tokens'].apply(eval)  # convert from string to list

# === Step 3: Pseudo-label all, with confidence logging ===
all_predictions = []

for _, row in df_leftover.iterrows():
    story_id = row['story_id']
    sentence_id = row['sentence_id']
    tokens = row['tokens']
    sentence = " ".join(tokens)

    preds = pipeline(sentence)

    for pred in preds:
        if pred["entity_group"] == "PER":
            all_predictions.append({
                "story_id": story_id,
                "sentence_id": sentence_id,
                "word": pred["word"].replace("##", "").strip(),
                "confidence": round(pred["score"], 4)
            })

# === Step 4: Save and show ===
df_preds = pd.DataFrame(all_predictions)
df_preds.to_csv("pseudo_predictions_from_leftovers_v5.csv", index=False)

print("✅ Finished pseudo-labeling. Here’s a sample:")
print(df_preds.head(20))


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


✅ Finished pseudo-labeling. Here’s a sample:
    story_id  sentence_id                 word  confidence
0         10           66            mataniari      0.9681
1         13           72                  ber      0.9983
2         13           72       u tandang karo      0.9925
3         13           72                  ber      0.9982
4         13           72     u tandang meriah      0.9944
5         18           90              ketujuh      0.7520
6         31            0  sri baduga maharaja      0.7927
7         31            0      prabu siliwangi      0.8769
8         32           75             ayam adu      0.8743
9         32           75      raja riak bakau      0.8818
10        32           75                kakek      0.7870
11        32           75                 ayam      0.7738
12        32           75      raja riak bakau      0.6435
13        76           13                kakek      0.9610
14        76           13                nenek      0.8871
15        9

In [44]:
import pandas as pd

# Load your raw prediction result
df = pd.read_csv("pseudo_predictions_from_leftovers_v5.csv")

# === Step 1: Define merging rules ===
# For combining "ber" + "u tandang karo" and "ber" + "u tandang meriah"
merge_targets = {
    ("ber", "u tandang karo"): "Beru Tandang Karo",
    ("ber", "u tandang meriah"): "Beru Tandang Meriah"
}

fixed_rows = []
skip_next = False

# === Step 2: Scan and merge ===
for i in range(len(df) - 1):
    if skip_next:
        skip_next = False
        continue

    row1 = df.iloc[i]
    row2 = df.iloc[i+1]

    key = (row1['word'].lower(), row2['word'].lower())

    if key in merge_targets:
        merged_name = merge_targets[key]
        avg_conf = round((row1['confidence'] + row2['confidence']) / 2, 4)

        fixed_rows.append({
            "story_id": row1['story_id'],
            "sentence_id": row1['sentence_id'],
            "word": merged_name,
            "confidence": avg_conf
        })

        skip_next = True  # Skip next row because it's already merged
    else:
        fixed_rows.append({
            "story_id": row1['story_id'],
            "sentence_id": row1['sentence_id'],
            "word": row1['word'],
            "confidence": row1['confidence']
        })

# Handle last row if not merged
if not skip_next and len(df) > 0:
    last_row = df.iloc[-1]
    fixed_rows.append({
        "story_id": last_row['story_id'],
        "sentence_id": last_row['sentence_id'],
        "word": last_row['word'],
        "confidence": last_row['confidence']
    })

# === Step 3: Save clean fixed version ===
df_fixed = pd.DataFrame(fixed_rows)
df_fixed.to_csv("pseudo_predictions_from_leftovers_v5_fixed.csv", index=False)

print("✅ Merging complete! Corrected file saved as pseudo_predictions_from_leftovers_v5_fixed.csv")
print(df_fixed.head(20))


✅ Merging complete! Corrected file saved as pseudo_predictions_from_leftovers_v5_fixed.csv
    story_id  sentence_id                 word  confidence
0         10           66            mataniari      0.9681
1         13           72    Beru Tandang Karo      0.9954
2         13           72  Beru Tandang Meriah      0.9963
3         18           90              ketujuh      0.7520
4         31            0  sri baduga maharaja      0.7927
5         31            0      prabu siliwangi      0.8769
6         32           75             ayam adu      0.8743
7         32           75      raja riak bakau      0.8818
8         32           75                kakek      0.7870
9         32           75                 ayam      0.7738
10        32           75      raja riak bakau      0.6435
11        76           13                kakek      0.9610
12        76           13                nenek      0.8871
13        94           82                   ha      0.7412
14        94           8

In [45]:
import pandas as pd
from collections import defaultdict

# Load corrected token-level pseudo labels
df_tokens = pd.read_csv("pseudo_word_level_labeled_round1_2_3_4_fixed_corrected.csv")

# Group by sentence
grouped = df_tokens.groupby(['story_id', 'sentence_id'])

sentence_level = []

for (story_id, sentence_id), group in grouped:
    tokens = group['word'].tolist()
    labels = group['TYPE2'].tolist()

    character_names = []
    current_char = []

    for token, label in zip(tokens, labels):
        if label == "B-PER":
            if current_char:
                character_names.append(" ".join(current_char))
                current_char = []
            current_char = [token]
        elif label == "I-PER":
            current_char.append(token)
        else:
            if current_char:
                character_names.append(" ".join(current_char))
                current_char = []

    if current_char:
        character_names.append(" ".join(current_char))

    # Save if any character found
    if character_names:
        sentence_level.append({
            "story_id": story_id,
            "sentence_id": sentence_id,
            "characters": character_names,
            "confidences": [-1.0 for _ in character_names]  # -1 because original token labels have no confidence
        })

# Save new clean sentence-level pseudo characters
df_sentences = pd.DataFrame(sentence_level)
df_sentences.to_csv("pseudo_characters_combined_round1_2_3_4_fixed_rebuild.csv", index=False)

print("✅ Rebuilt pseudo_characters_combined_round1_2_3_4_fixed_rebuild.csv from corrected token labels")


✅ Rebuilt pseudo_characters_combined_round1_2_3_4_fixed_rebuild.csv from corrected token labels


In [47]:
import pandas as pd

# Load the corrected pseudo prediction file
df_fixed = pd.read_csv("pseudo_predictions_from_leftovers_v5_fixed.csv")

# Filter only high-confidence predictions (>95%)
df_high_conf_fixed = df_fixed[df_fixed['confidence'] >= 0.95]

# Save high-confidence results
df_high_conf_fixed.to_csv("pseudo_predictions_high_confidence_round5_fixed.csv", index=False)

print(f"✅ Exported {len(df_high_conf_fixed)} high-confidence predictions to pseudo_predictions_high_confidence_round5_fixed.csv")


✅ Exported 9 high-confidence predictions to pseudo_predictions_high_confidence_round5_fixed.csv


In [48]:
# Load corrected rebuilt pseudo characters
df_old = pd.read_csv("pseudo_characters_combined_round1_2_3_4_fixed_rebuild.csv")

# Load new round 5 high confidence predictions
df_new = pd.read_csv("pseudo_predictions_high_confidence_round5_fixed.csv")

# Prepare new format to match
df_new['characters'] = df_new['word'].apply(lambda x: [x])
df_new['confidences'] = df_new['confidence'].apply(lambda x: [round(x, 4)])

df_new = df_new[['story_id', 'sentence_id', 'characters', 'confidences']]

# Merge old and new
df_all = pd.concat([df_old, df_new], ignore_index=True)

# Drop duplicates
df_all = df_all.drop_duplicates(subset=['story_id', 'sentence_id'])

# Save final merged pseudo characters
df_all.to_csv("pseudo_characters_combined_round1_2_3_4_5_final.csv", index=False)

print(f"✅ Final pseudo_characters_combined_round1_2_3_4_5_final.csv is clean and ready!")


✅ Final pseudo_characters_combined_round1_2_3_4_5_final.csv is clean and ready!


In [49]:
import pandas as pd

# Load the CSV
df = pd.read_csv("pseudo_characters_combined_round1_2_3_4_5_final.csv")

# Count unique (story_id, sentence_id) pairs
unique_sentences = df[['story_id', 'sentence_id']].drop_duplicates()
total_sentences = unique_sentences.shape[0]

print(f"✅ Total unique sentences: {total_sentences}")


✅ Total unique sentences: 8817


In [1]:
!pip freeze > requirements.txt


In [14]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("pseudo_characters_combined_round1_2_3_4_5_final.csv")

# Count unique (story_id, sentence_id) pairs
total_sentences = df.groupby(['story_id', 'sentence_id']).ngroups

print(f"✅ Total unique sentences: {total_sentences}")

✅ Total unique sentences: 8817


In [17]:
import pandas as pd

df = pd.read_csv("pseudo_characters_combined_round1_2_3_4.csv")
total_entries = len(df)
print(f"🧾 Total entries (rows): {total_entries}")


🧾 Total entries (rows): 9128


In [18]:
import pandas as pd

df = pd.read_csv("pseudo_characters_combined_round1_2_3_4_fixed.csv")
total_entries = len(df)
print(f"🧾 Total entries (rows): {total_entries}")


🧾 Total entries (rows): 9128


In [20]:
import pandas as pd

df = pd.read_csv("pseudo_characters_combined_round1_2_3_4_fixed_rebuild.csv")
total_entries = len(df)
print(f"🧾 Total entries (rows): {total_entries}")


🧾 Total entries (rows): 8813


In [21]:
df_all = pd.read_csv("pseudo_characters_combined_round1_2_3_4_fixed.csv")
df_rebuild = pd.read_csv("pseudo_characters_combined_round1_2_3_4_fixed_rebuild.csv")

# Find missing (story_id, sentence_id) from rebuild
merged = df_all.merge(df_rebuild, on=["story_id", "sentence_id"], how="left", indicator=True)
missing = merged[merged['_merge'] == 'left_only']
print(f"Missing rows: {len(missing)}")


Missing rows: 315


In [22]:
import pandas as pd

# Load both versions
df_original = pd.read_csv("pseudo_characters_combined_round1_2_3_4_fixed.csv")
df_rebuild = pd.read_csv("pseudo_characters_combined_round1_2_3_4_fixed_rebuild.csv")

# Merge to find what's missing in rebuild
merged = df_original.merge(
    df_rebuild,
    on=["story_id", "sentence_id"],
    how="left",
    indicator=True
)

# Filter to keep only rows that are missing in the rebuild
missing = merged[merged['_merge'] == 'left_only']

# Show the number and the actual rows
print(f"❌ Total missing rows in rebuild: {len(missing)}")
print(missing[['story_id', 'sentence_id', 'characters_x', 'confidences_x']].head(10))  # show first 10

# Optionally save to file
missing[['story_id', 'sentence_id', 'characters_x', 'confidences_x']].to_csv("missing_from_rebuild.csv", index=False)


❌ Total missing rows in rebuild: 315
      story_id  sentence_id             characters_x confidences_x
179          6           32               ['Ibunya']      [0.9975]
300          9           48  ['Bidadari - Bidadari']      [0.9839]
409         10          208    ['Menteri - Menteri']        [0.98]
662         15           77              ['Hatoban']      [0.9706]
1331        22           43      ['Burung - Burung']       [0.975]
1333        22           51               ['Sitiii']      [0.9906]
1336        22           59               ['Sitiii']      [0.9913]
1686        26           97          ['Ikan - Ikan']      [0.9936]
1687        26           98          ['Ikan - Ikan']      [0.9891]
3310        75           38        ['Lipan - Lipan']      [0.9512]


In [13]:
duplicates = df.duplicated(subset=['story_id', 'sentence_id']).sum()
print(f"🔍 Duplicate (story_id, sentence_id) pairs: {duplicates}")


🔍 Duplicate (story_id, sentence_id) pairs: 0
