<a href="https://colab.research.google.com/github/BelayAbAb/Centralized-Telegram-E-Commerce-Platform-for-EthioMart_I/blob/Task-1-2-3-%26-4/Fine-tuning%20NER%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install Required Libraries
!pip install datasets transformers scikit-learn

# Step 2: Import Libraries
import os
from datasets import Dataset
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Step 3: Model Selection
# Checkpoint identifier; reused further down when the token-classification
# model itself is loaded with from_pretrained.
model_name = "xlm-roberta-base"
# Fast tokenizer for the same checkpoint. The label-alignment step below calls
# word_ids() on its output, and the tokenizer is saved next to the fine-tuned
# model at the end of the notebook.
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)

# Step 4: Dataset Loading
def load_conll_dataset(file_path):
    """Read a tab-separated CoNLL file into a list of sentences.

    Each non-blank line is expected to hold exactly two tab-separated fields,
    ``token<TAB>label``; lines with any other number of fields are skipped.
    Blank (whitespace-only) lines separate sentences.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A list of ``(tokens, labels)`` tuples, one per sentence, where
        ``tokens`` and ``labels`` are equal-length lists of strings.
    """
    dataset = []
    tokens, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            if not line.strip():
                # Blank line: close the sentence collected so far, if any.
                if tokens:
                    dataset.append((tokens, labels))
                    tokens, labels = [], []
                continue
            parts = line.split('\t')
            if len(parts) == 2:
                tokens.append(parts[0])
                labels.append(parts[1].strip())
    # A file that does not end with a blank line still has a pending sentence;
    # flush it so the last sentence is not silently dropped.
    if tokens:
        dataset.append((tokens, labels))
    return dataset

# Load the dataset by uploading the .conll file through the Colab file picker
from google.colab import files
uploaded = files.upload()  # dict keyed by the name each file was saved under

# Colab stores a re-uploaded file under a new name (e.g. "name (1).conll"), so
# use the name reported by the upload rather than always reading the
# hard-coded one, which could be a stale copy. Fall back to the expected name
# when nothing was uploaded (the file may already be on disk).
default_conll_name = 'labeled_data_all_channels.conll'
if not uploaded or default_conll_name in uploaded:
    conll_path = default_conll_name
else:
    conll_path = next(iter(uploaded))

data = load_conll_dataset(conll_path)
if not data:
    # Fail here with a clear message instead of deep inside train_test_split.
    raise ValueError(
        f"No sentences were parsed from {conll_path!r}; "
        "expected tab-separated 'token<TAB>label' lines."
    )

# Hold out 10% of the sentences for validation; fixed seed for repeatability.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_list([{'tokens': tokens, 'labels': labels} for tokens, labels in train_data])
val_dataset = Dataset.from_list([{'tokens': tokens, 'labels': labels} for tokens, labels in val_data])

# Step 5: Tokenization and Label Alignment

# One label vocabulary shared by every batch and by both splits. It is sorted
# so the label -> id assignment is identical on every run, and built from the
# training sentences so that every id stays below the number of distinct
# training labels.
label_list = sorted({label for _, sentence_labels in train_data for label in sentence_labels})
label_to_id = {label: idx for idx, label in enumerate(label_list)}


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Args:
        examples: A batch with ``'tokens'`` (list of word lists) and
            ``'labels'`` (list of label-string lists of matching lengths).

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an added
        ``'labels'`` entry: one list of label ids per sentence, the same
        length as its ``input_ids``. The first sub-token of each word carries
        that word's label id; special tokens, padding and the remaining
        sub-tokens of a word are set to -100. A label that does not occur in
        the training vocabulary is also set to -100.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128,
    )

    labels = []
    for i, word_labels in enumerate(examples['labels']):
        # word_ids has one entry per token position: the index of the source
        # word, or None for special/padding tokens.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                # Special/padding token, or a continuation sub-token.
                label_ids.append(-100)
            else:
                # First sub-token of a new word: look up the word's label.
                label_ids.append(label_to_id.get(word_labels[word_id], -100))
            previous_word_id = word_id
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Tokenize the datasets (batched so the tokenizer processes many sentences per call)
train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
val_tokenized = val_dataset.map(tokenize_and_align_labels, batched=True)

# Step 6: Training Configuration
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # run validation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Step 7: Model Training
# Size the classification head by the number of distinct labels seen in training.
num_labels = len({label for _, labels in train_data for label in labels})
model = XLMRobertaForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
)

trainer.train()

# Step 8: Performance Evaluation
# Keep and print the metrics: a bare expression in the middle of a cell is
# not displayed, so the result would otherwise be lost.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Step 9: Model Saving
# Save model and tokenizer side by side so the directory can be reloaded as a unit.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]



Saving labeled_data_all_channels.conll to labeled_data_all_channels.conll


Map:   0%|          | 0/26085 [00:00<?, ? examples/s]

Map:   0%|          | 0/2899 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
