In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

def load_conll_to_dataframe(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:  # If the line is not empty
                tokens = line.split()  # Split by whitespace
                if len(tokens) >= 2:  # Ensure there are at least 2 columns (token and label)
                    token, label = tokens[0], tokens[1]  # First token is the word, second is the label
                    data.append((token, label))  # Append as a tuple

    # Create a DataFrame with appropriate columns
    df = pd.DataFrame(data, columns=['Token', 'Label'])
    return df

In [5]:
# Usage
conll_file_path = '/content/drive/MyDrive/@mertteka_labeled_data.conll'
df = load_conll_to_dataframe(conll_file_path)

df.head()

Unnamed: 0,Token,Label
0,ውሃ,O
1,የማያስገባ,O
2,የፍራሽ,O
3,ልብስ,O
4,1.20ሜ,O


In [6]:
# Split the dataset into training and test portions
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=200, random_state=21)

In [7]:
import pandas as pd
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Map labels to IDs
label_list = df['Label'].unique().tolist()
label_map = {label: idx for idx, label in enumerate(label_list)}
num_labels = len(label_map)

# Encode tokens and labels
def encode_data(df):
    tokens = []
    labels = []
    for _, group in df.groupby((df['Label'] != df['Label'].shift()).cumsum()):
        tokenized_input = tokenizer(list(group['Token']),
                                    is_split_into_words=True,
                                    padding='max_length',
                                    truncation=True,
                                    # max_length=128,
                                    return_tensors='pt')
        tokens.append(tokenized_input)
        label_ids = [label_map[label] for label in group['Label']]
        label_ids = label_ids + [label_map['O']] * (tokenized_input['input_ids'].shape[1] - len(label_ids))  # Padding
        labels.append(torch.tensor(label_ids))
    return tokens, labels

tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
train_df, val_df = train_test_split(df_train, test_size=0.2)
train_tokens, train_labels = encode_data(train_df)
val_tokens, val_labels = encode_data(val_df)

# Create a dataset class
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.tokens[idx]['input_ids'].squeeze(),
            'attention_mask': self.tokens[idx]['attention_mask'].squeeze(),
            'labels': self.labels[idx]
        }

train_dataset = NERDataset(train_tokens, train_labels)
val_dataset = NERDataset(val_tokens, val_labels)

# Model and Training Setup
model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # Adjust as needed
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 batches
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    logging_dir='./logs',
    eval_strategy="epoch",  # Evaluate after each epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,No log,0.002617
2,0.366800,0.000893
3,0.003400,0.000477


TrainOutput(global_step=1305, training_loss=0.14210692114300197, metrics={'train_runtime': 1690.406, 'train_samples_per_second': 12.35, 'train_steps_per_second': 0.772, 'total_flos': 5455289667004416.0, 'train_loss': 0.14210692114300197, 'epoch': 3.0})

In [8]:
test_tokens, test_labels = encode_data(df_test)
test_dataset = NERDataset(test_tokens, test_labels)

# Run predictions on test set
predictions, label_ids, metrics = trainer.predict(test_dataset)

In [9]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Convert logits to predicted label IDs
predicted_labels = np.argmax(predictions, axis=2)

# Flatten the predictions and true labels for evaluation
true_labels_flat = []
predicted_labels_flat = []

for i in range(len(label_ids)):
    true_labels_flat.extend(label_ids[i])
    predicted_labels_flat.extend(predicted_labels[i])

# Remove padding tokens (if your label_map includes a 'PAD' token or 'O', exclude it)
true_labels_no_pad = [label for label in true_labels_flat if label != label_map['O']]
predicted_labels_no_pad = [predicted_labels_flat[i] for i, label in enumerate(true_labels_flat) if label != label_map['O']]

# Calculate accuracy
accuracy = accuracy_score(true_labels_flat, predicted_labels_flat)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Calculate precision, recall, and F1 score (using 'weighted' to handle class imbalance)
precision = precision_score(true_labels_flat, predicted_labels_flat, average='weighted')
recall = recall_score(true_labels_flat, predicted_labels_flat, average='weighted')
f1 = f1_score(true_labels_flat, predicted_labels_flat, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Test Accuracy: 99.99%
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
