In [None]:
# Shell out to nvidia-smi to list the NVIDIA GPUs on this machine and their current load.
!nvidia-smi

In [None]:
import os
# Expose only the GPU with index 1 to this process.
# NOTE(review): this cell runs before torch is imported, presumably so CUDA never
# sees the other devices; confirm that index 1 exists on the target machine,
# otherwise no GPU will be visible and the notebook falls back to the CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [None]:
import datasets
import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AdamW,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    get_linear_schedule_with_warmup,
)




'/mnt/data/Flowchart_Alur_Penelitian.png'

In [None]:
# Pretrained checkpoint name and the tokenizer loaded from it
model_name = "bert-base-multilingual-cased"  # Or any other suitable model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Label inventory: the Universal Dependencies POS tags, in a fixed order so that
# each tag's list position doubles as its integer class id.
ud_tags = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
]
# Forward (tag -> id) and reverse (id -> tag) lookups over the same ordering.
tag2id = {tag: idx for idx, tag in enumerate(ud_tags)}
id2tag = dict(enumerate(ud_tags))

In [None]:
def conllu_to_dataframe(file_path):
    """Parse a CoNLL-U file into a DataFrame with one row per sentence.

    Each of the ten CoNLL-U columns becomes a DataFrame column whose cells are
    lists holding that field for every token of the sentence, in order. For
    example, ``df.loc[0, "FORM"]`` is the list of word forms of the first
    sentence and ``df.loc[0, "UPOS"]`` the matching list of POS tags.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL-U file.

    Returns:
        pandas.DataFrame with columns ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD,
        DEPREL, DEPS and MISC. An input without any token line yields an empty
        frame that still has these columns.
    """
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    n_fields = len(columns)

    sentences = []
    sentence = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines and comment lines both close the sentence in progress.
            if not line or line.startswith("#"):
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            parts = line.split("\t")
            if len(parts) < n_fields:
                continue
            # Skip multiword-token ranges ("1-2") and empty nodes ("8.1"): they
            # are not syntactic words and carry "_" instead of a real UPOS tag.
            if "-" in parts[0] or "." in parts[0]:
                continue
            # Trim surplus fields so every token row lines up with `columns`.
            sentence.append(parts[:n_fields])
    # The file may end without a trailing blank line; keep the last sentence.
    if sentence:
        sentences.append(sentence)

    # Transpose each sentence from "list of token rows" to "one list per column".
    records = [
        {name: [token[pos] for token in tokens] for pos, name in enumerate(columns)}
        for tokens in sentences
    ]
    df = pd.DataFrame(records, columns=columns)
    return df

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and build sub-token level label ids.

    Args:
        examples: Batch mapping with "FORM" (one list of words per sentence)
            and "UPOS" (one list of tags per sentence, parallel to "FORM").

    Returns:
        The tokenizer output for the batch with an added "labels" entry. In
        each sentence, the first sub-token of every word carries the word's
        ``tag2id`` value; every other position (no source word, or a later
        sub-token of the same word) carries -100.
    """
    encoded = tokenizer(examples["FORM"], truncation=True, is_split_into_words=True)

    def _align(sentence_tags, word_ids):
        # Walk the sub-tokens once, remembering which word the previous one came from.
        aligned = []
        last_word = None
        for current_word in word_ids:
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(tag2id[sentence_tags[current_word]] if starts_new_word else -100)
            last_word = current_word
        return aligned

    encoded["labels"] = [
        _align(sentence_tags, encoded.word_ids(batch_index=row))
        for row, sentence_tags in enumerate(examples["UPOS"])
    ]
    return encoded

In [None]:
# Load data (replace with your file paths)
train_file, test_file = "train01.conllu", "01test.conllu"  # example files

# Parse both CoNLL-U splits, train first, then test.
train_df, test_df = (conllu_to_dataframe(split_path) for split_path in (train_file, test_file))

# Wrap each parsed frame in a datasets.Dataset for the preprocessing step.
train_dataset, test_dataset = (
    datasets.Dataset.from_pandas(split_df) for split_df in (train_df, test_df)
)



In [None]:
# Preprocess the datasets: tokenize every split and drop the original columns so
# that only the tokenizer outputs and the aligned labels remain.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_and_align_labels, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, test_dataset)
)

# Training Parameters (Adjust as needed!)
num_epochs = 3
batch_size = 16  # Reduced for demonstration purposes
learning_rate = 5e-5  # Standard value for fine-tuning
beta_2 = 0.98  # second beta coefficient handed to the optimizer
label_smoothing = 0.6  # smoothing factor handed to the cross-entropy loss
dropout_rate = 0.3  # not used directly, handled by the pretrained model (never referenced in the visible cells)

In [None]:
# DataLoaders
# Tokenized sentences have different lengths, so the default collate function
# cannot stack them into tensors. DataCollatorForTokenClassification pads every
# batch to its longest sequence and pads `labels` with -100, the same value the
# loss and the accuracy computation already treat as "ignore".
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
train_dataloader = DataLoader(
    tokenized_train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_test_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Initialize the model
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(ud_tags))

# Optimizer and Scheduler
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the values the transformers implementation
# used by default, so the optimizer configuration stays the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    betas=(0.9, beta_2),
    eps=1e-6,
    weight_decay=0.0,
)
# Linear decay of the learning rate over all optimizer steps, without warmup.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



In [None]:
# Loss Function with Label Smoothing
# Positions labelled -100 are excluded from the loss.
loss_fct = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=label_smoothing)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Keep the labels out of the forward pass: when the model receives them
        # it computes its own cross-entropy loss, which is never used here
        # because the label-smoothed loss_fct below is the training objective.
        labels = batch.pop("labels")
        logits = model(**batch).logits
        # Flatten to (tokens, classes) vs (tokens,) as CrossEntropyLoss expects.
        loss = loss_fct(logits.view(-1, model.num_labels), labels.view(-1))
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

In [None]:
# Evaluation with Accuracy
model.eval()
total_correct = 0
total_predictions = 0

with torch.no_grad():
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Labels are only needed for scoring, so the model is not asked to
        # compute a loss from them.
        labels = batch.pop("labels")
        predictions = model(**batch).logits.argmax(dim=-1)

        # Count with whole-tensor operations instead of a Python loop over
        # every token; positions labelled -100 are excluded from the score.
        scored = labels != -100
        total_predictions += scored.sum().item()
        total_correct += ((predictions == labels) & scored).sum().item()

accuracy = total_correct / total_predictions if total_predictions > 0 else 0
print(f"Evaluation Accuracy: {accuracy}")

print("Training and Evaluation Complete!")