In [None]:
!pip install datasets



In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m515.8 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=213ae7fdc96d14a6b07f793d78e1cdad7f7f7b90ac18435df6a949957fa89f43
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# First, ensure you have the necessary libraries installed or upgraded
!pip install --upgrade transformers
#!pip install peft

# Then, the original imports should work with the updated libraries
import torch
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
import matplotlib.pyplot as plt
from seqeval.metrics import classification_report



In [None]:
# Mount Google Drive at /content/drive so the dataset file under MyDrive
# (loaded further down) is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Hugging Face Hub id of the pretrained checkpoint; the same id is reused
# later when the token-classification model is loaded.
MODEL_NAME = "ai4bharat/indic-bert"
# Tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

In [None]:
# Load dataset from text file (token \t NER tag format)
def load_data(file_path):
    """Read a CoNLL-style file into parallel lists of tokens and tags.

    Every non-blank line is expected to look like ``token<TAB>tag``; blank
    lines separate sentences. Non-blank lines that do not split into exactly
    two tab-separated fields are skipped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is the matching list of tag lists, one entry
        per sentence. Both are empty when the file holds no valid lines.
    """
    sentences, labels = [], []
    sentence, label = [], []

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if stripped:
                parts = stripped.split('\t')
                if len(parts) == 2:
                    sentence.append(parts[0])
                    label.append(parts[1])
            elif sentence:
                # Blank line closes the sentence collected so far.
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []

    # A file that does not end with a blank line would otherwise lose its
    # final sentence, so flush whatever is still pending.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Load and prepare the dataset
#train_sentences, train_labels = load_data("/content/drive/MyDrive/SDP_U11/dataset/train.txt")
#dev_sentences, dev_labels = load_data("/content/drive/MyDrive/SDP_U11/dataset/validation.txt")
#test_sentences, test_labels = load_data("/content/drive/MyDrive/SDP_U11/dataset/test.txt")



In [None]:
# Read every sentence from the single combined dataset file
DATASET_PATH = "/content/drive/MyDrive/SDP_U11/dataset/full1.txt"  # Replace with your file path
SPLIT_SEED = 42  # fixed seed so both splits are reproducible
all_sentences, all_labels = load_data(DATASET_PATH)

# Stage 1: keep 80% for training and hold out 20% (validation + test)
(train_sentences, temp_sentences,
 train_labels, temp_labels) = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Stage 2: cut the held-out part in half -> validation and test
# (10% of the total for each)
(dev_sentences, test_sentences,
 dev_labels, test_labels) = train_test_split(
    temp_sentences,
    temp_labels,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [None]:
# Report how many sentences ended up in each of the three splits.
print(f"Number of sentences in training set: {len(train_sentences)}")
print(f"Number of sentences in validation set: {len(dev_sentences)}")
print(f"Number of sentences in test set: {len(test_sentences)}")

Number of sentences in training set: 5458
Number of sentences in validation set: 682
Number of sentences in test set: 683


In [None]:
# Create label map
# The tag set is sorted so that every kernel session assigns the same integer
# id to each tag. Iterating a plain set of strings can yield a different
# order per process, which would make checkpoints from an earlier session
# decode to the wrong tags. The set comprehension also flattens the nested
# lists in linear time (sum(list_of_lists, []) is quadratic).
unique_labels = sorted({
    tag
    for split_labels in (train_labels, dev_labels, test_labels)
    for sentence_tags in split_labels
    for tag in sentence_tags
})
# tag -> integer class id, in the order of unique_labels
label_map = {label: i for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

In [None]:
# Display the number of distinct tags found across all splits.
num_labels

8

In [None]:
# Display the tag list; a tag's position in this list is its class id.
unique_labels

['P', 'S', 'A', 'D', 'B', 'M', 'O', 'T']

In [None]:
# Convert data to Hugging Face Dataset format
def tokenize_and_align_labels(sentences, labels, max_length=None, padding=True):
    """Tokenize pre-split sentences and expand word tags to sub-word tokens.

    Every sub-word token receives the class id of the word it came from.
    Positions that belong to no word (special tokens, padding) receive
    ``-100``.

    Args:
        sentences: List of sentences, each a list of word strings.
        labels: List of tag lists parallel to ``sentences``; every tag must
            be a key of the module-level ``label_map``.
        max_length: Optional maximum number of tokens per sequence. When
            given, longer sequences are truncated to this length. When
            ``None`` (default) nothing is truncated, which is what the
            previous ``truncation=True`` call effectively did (the tokenizer
            only warned and fell back to no truncation).
        padding: Forwarded to the tokenizer. The default ``True`` pads every
            sequence to the longest one in ``sentences``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding one
        list of class ids per sentence, aligned with ``input_ids``.

    Raises:
        ValueError: If ``sentences`` and ``labels`` differ in length.
    """
    if len(sentences) != len(labels):
        raise ValueError(
            f"sentences and labels must have the same length, "
            f"got {len(sentences)} and {len(labels)}"
        )

    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        # Only request truncation when there is an actual limit to apply.
        truncation=max_length is not None,
        max_length=max_length,
        padding=padding,
    )

    tokenized_labels = []
    for i, label in enumerate(labels):
        # word_ids maps each token position to its source word index,
        # or None for positions that belong to no word.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        tokenized_labels.append([
            -100 if word_idx is None else label_map[label[word_idx]]
            for word_idx in word_ids
        ])

    tokenized_inputs["labels"] = tokenized_labels
    return tokenized_inputs

# Tokenize each split (train, validation, test - in that order) and wrap the
# result in a Hugging Face Dataset.
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_dict(tokenize_and_align_labels(split_sentences, split_labels))
    for split_sentences, split_labels in (
        (train_sentences, train_labels),
        (dev_sentences, dev_labels),
        (test_sentences, test_labels),
    )
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
 # Load the model
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    # Record the tag names in the model config so a saved checkpoint carries
    # its own id <-> tag mapping instead of anonymous LABEL_n names.
    id2label=dict(enumerate(unique_labels)),
    label2id=dict(label_map),
)

# Collator that pads inputs and labels batch by batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(p):
    """Turn raw Trainer predictions into seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` holds
            the gold class ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with the keys ``accuracy_score``, ``precision``, ``recall``
        and ``f1``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # NOTE(review): seqeval scores entity chunks and expects IOB-style tags;
    # the tag set in this notebook appears to be single letters - confirm
    # that precision/recall/f1 mean what is intended here.
    true_labels = []
    pred_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        gold_tags = []
        pred_tags = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # Ignored position (no gold tag): skip it on both sides.
                continue
            gold_tags.append(unique_labels[gold_id])
            pred_tags.append(unique_labels[pred_id])
        true_labels.append(gold_tags)
        pred_labels.append(pred_tags)

    metrics = {}
    metrics["accuracy_score"] = accuracy_score(true_labels, pred_labels)
    metrics["precision"] = precision_score(true_labels, pred_labels)
    metrics["recall"] = recall_score(true_labels, pred_labels)
    metrics["f1"] = f1_score(true_labels, pred_labels)
    return metrics

In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification
from transformers.trainer_callback import EarlyStoppingCallback # Import EarlyStoppingCallback

In [None]:
training_args = TrainingArguments(
    output_dir="./odia_ner_model",
    # Evaluate and checkpoint once per epoch; the two strategies have to
    # match for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    # compute_metrics returns "accuracy_score"; the Trainer prefixes
    # evaluation metrics with "eval_".
    metric_for_best_model="eval_accuracy_score",  # Monitor validation accuracy
    greater_is_better=True,  # Higher accuracy is better
    # Disable external experiment trackers (e.g. wandb) so that training
    # starts without an interactive API-key prompt and the notebook can be
    # run end to end unattended.
    report_to="none",
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # seqeval scores computed at every evaluation
    # Stop early once the monitored metric has failed to improve by more
    # than 0.01 for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)],
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the arguments and early-stopping callback configured on the Trainer.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtusarkantadalai22[0m ([33msushilmaurya-soa[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Dump the Trainer's recorded log entries; the plots below are built from this list.
print(trainer.state.log_history)

In [None]:
# Gather validation accuracy per evaluation pass from the Trainer's log history.
# No training-accuracy entries are collected here, so these two lists stay
# empty (fill them if a custom callback starts logging training accuracy).
train_epochs = []
train_accuracies = []

accuracy_logs = [
    entry
    for entry in trainer.state.log_history
    if 'epoch' in entry and 'eval_accuracy_score' in entry
]
eval_epochs = [entry['epoch'] for entry in accuracy_logs]
eval_accuracies = [entry['eval_accuracy_score'] for entry in accuracy_logs]

# Draw the accuracy curve
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(eval_epochs, eval_accuracies, label='Validation Accuracy')
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy Over Epochs")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Print the raw epoch/accuracy values behind the accuracy plot.
print("Evaluation Epochs:", eval_epochs)
print("Evaluation Accuracies:", eval_accuracies)

In [None]:
# Pull training-loss and validation-loss points out of the Trainer's log history.
history = trainer.state.log_history

train_logs = [entry for entry in history if 'epoch' in entry and 'loss' in entry]
train_epochs = [entry['epoch'] for entry in train_logs]
train_losses = [entry['loss'] for entry in train_logs]

eval_logs = [entry for entry in history if 'epoch' in entry and 'eval_loss' in entry]
eval_epochs = [entry['epoch'] for entry in eval_logs]
eval_losses = [entry['eval_loss'] for entry in eval_logs]

# Draw both loss curves on one axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_epochs, train_losses, label='Training Loss')
ax.plot(eval_epochs, eval_losses, label='Validation Loss')
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss Over Epochs")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Evaluate the model
def evaluate_model(trainer, dataset):
    """Print a seqeval classification report for ``dataset``.

    Args:
        trainer: Object whose ``predict(dataset)`` returns a
            ``(predictions, labels, metrics)`` triple.
        dataset: Dataset to run prediction on.
    """
    scores, gold_ids, _ = trainer.predict(dataset)
    pred_ids = np.argmax(scores, axis=2)

    true_labels = []
    pred_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        gold_tags = []
        pred_tags = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            # -100 marks positions without a gold tag; drop them on both sides.
            if gold_id != -100:
                gold_tags.append(unique_labels[gold_id])
                pred_tags.append(unique_labels[pred_id])
        true_labels.append(gold_tags)
        pred_labels.append(pred_tags)

    print(classification_report(true_labels, pred_labels))

# Report per-tag scores on the held-out test split.
evaluate_model(trainer, test_dataset)

In [None]:
# Visualizing entity recognition
def plot_entity_distribution(labels):
    """Show a bar chart of how often each tag occurs.

    Args:
        labels: List of tag lists, one inner list per sentence.
    """
    # Flatten in linear time; sum(labels, []) copies the growing list once
    # per sentence and becomes quadratic on large datasets.
    flat_tags = [tag for sentence_tags in labels for tag in sentence_tags]
    # Local names deliberately differ from the notebook-level unique_labels.
    tag_names, counts = np.unique(flat_tags, return_counts=True)

    plt.figure(figsize=(10, 5))
    plt.bar(tag_names, counts)
    plt.xlabel("Entities")
    plt.ylabel("Count")
    plt.title("Entity Distribution in Dataset")
    plt.xticks(rotation=90)
    plt.show()

# Tag frequencies in the training split.
plot_entity_distribution(train_labels)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Evaluate the model with Confusion Matrix
def evaluate_model_with_cm(trainer, dataset, unique_labels):
    """Print a classification report and plot a token-level confusion matrix.

    Args:
        trainer: Object whose ``predict(dataset)`` returns a
            ``(predictions, labels, metrics)`` triple.
        dataset: Dataset to run prediction on.
        unique_labels: Tag names indexed by class id; also fixes the
            row/column order of the confusion matrix.
    """
    scores, gold_ids, _ = trainer.predict(dataset)
    pred_ids = np.argmax(scores, axis=2)

    # Per-sentence tag lists feed the seqeval report; the flat lists feed
    # the confusion matrix.
    true_labels, pred_labels = [], []
    true_labels_flat, pred_labels_flat = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        gold_tags = []
        pred_tags = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            # -100 marks positions without a gold tag; drop them on both sides.
            if gold_id != -100:
                gold_tags.append(unique_labels[gold_id])
                pred_tags.append(unique_labels[pred_id])
        true_labels.append(gold_tags)
        pred_labels.append(pred_tags)
        true_labels_flat.extend(gold_tags)
        pred_labels_flat.extend(pred_tags)

    print(classification_report(true_labels, pred_labels))

    # Passing labels= keeps every tag in the matrix even if it never occurs.
    cm = confusion_matrix(true_labels_flat, pred_labels_flat, labels=unique_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=unique_labels)
    disp.plot(cmap=plt.cm.Blues, values_format='d', xticks_rotation='vertical')
    plt.title("Confusion Matrix")
    plt.show()

# Report scores and draw the confusion matrix for the held-out test split.
evaluate_model_with_cm(trainer, test_dataset, unique_labels)