In [None]:
# Mount Google Drive at /content/drive so that files stored on the Drive can
# be opened through ordinary filesystem paths below that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm import tqdm

# Load and preprocess data
def load_and_prepare_data(train_path, val_path):
    """Read the train/validation CSVs and integer-encode their tags.

    Both files must provide a ``Word`` column (the text to classify) and a
    ``Tag`` column (its label). Rows where either of those two values is
    missing are discarded.

    Args:
        train_path: Path of the training CSV.
        val_path: Path of the validation CSV.

    Returns:
        A 5-tuple ``(train_texts, train_labels_encoded, validation_texts,
        validation_labels_encoded, label_encoder)``. The texts are lists,
        the encoded labels are the arrays produced by the encoder, and
        ``label_encoder`` is the ``LabelEncoder`` fitted on the training
        tags (its ``classes_`` maps the integer ids back to tag names).

    Raises:
        KeyError: If a file lacks the ``Word`` or ``Tag`` column.
        ValueError: If the validation file contains a tag that never
            occurs in the training file (raised by
            ``LabelEncoder.transform``).
    """
    required_columns = ['Word', 'Tag']

    # Only a missing word or a missing tag makes a row unusable. A bare
    # dropna() would also throw away valid examples whenever some other,
    # unused column happens to be blank.
    train_df = pd.read_csv(train_path).dropna(subset=required_columns)
    validation_df = pd.read_csv(val_path).dropna(subset=required_columns)

    train_texts, train_labels = train_df['Word'].tolist(), train_df['Tag'].tolist()
    validation_texts, validation_labels = validation_df['Word'].tolist(), validation_df['Tag'].tolist()

    # Fit on the training tags only, so the id <-> tag mapping is defined by
    # the training set and reused unchanged for validation.
    label_encoder = LabelEncoder()
    train_labels_encoded = label_encoder.fit_transform(train_labels)
    validation_labels_encoded = label_encoder.transform(validation_labels)

    return train_texts, train_labels_encoded, validation_texts, validation_labels_encoded, label_encoder

# Custom dataset creation
def create_dataset(texts, labels, tokenizer, max_length):
    """Tokenize every text and pair it with its label.

    Args:
        texts: Iterable of inputs; each item is converted with ``str()``
            before tokenization.
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``
            and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors that carry a leading batch
            dimension of size 1.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        A list of ``(input_ids, attention_mask, label)`` tuples, one per
        text, where ``label`` is a 0-d ``torch.long`` tensor. If ``texts``
        and ``labels`` differ in length the extra items are ignored
        (``zip`` semantics).
    """
    dataset = []

    for text, label in zip(texts, labels):
        encoding = tokenizer(
            str(text),
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )
        dataset.append((
            # squeeze(0) removes only the batch dimension. A bare squeeze()
            # would also collapse the sequence dimension when max_length is
            # 1, yielding 0-d tensors that no longer batch to (batch, seq).
            encoding['input_ids'].squeeze(0),
            encoding['attention_mask'].squeeze(0),
            torch.tensor(label, dtype=torch.long),
        ))

    return dataset

# Training and evaluation function
def train_and_evaluate(train_texts, train_labels_encoded, validation_texts, validation_labels_encoded, label_encoder, batch_size=8, max_length=128, num_epochs=2):
    """Fine-tune ``google/muril-base-cased`` as a classifier and score it.

    A fresh ``BertForSequenceClassification`` model is loaded on every call,
    trained on the training split for ``num_epochs`` epochs, then run once
    over the validation split. Accuracy and weighted precision/recall/F1
    are printed.

    Args:
        train_texts: Training inputs.
        train_labels_encoded: Integer class ids aligned with ``train_texts``.
        validation_texts: Validation inputs.
        validation_labels_encoded: Integer class ids aligned with
            ``validation_texts``.
        label_encoder: Fitted encoder; ``len(label_encoder.classes_)`` sets
            the size of the classification head.
        batch_size: Batch size used for both data loaders.
        max_length: Token length every input is padded/truncated to.
        num_epochs: Number of passes over the training split.

    Returns:
        ``(y_true, y_pred, label_encoder)`` where ``y_true`` and ``y_pred``
        are lists holding the validation labels and the predicted class
        ids, in loader order, and ``label_encoder`` is the object passed in.
    """
    bert_model_name = 'google/muril-base-cased'
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=len(label_encoder.classes_))

    train_dataset = create_dataset(train_texts, train_labels_encoded, tokenizer, max_length)
    validation_dataset = create_dataset(validation_texts, validation_labels_encoded, tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
    # weight_decay are pinned to the values the transformers implementation
    # used by default so the optimisation settings stay the same.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    print("Start training")
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct_train = 0
        total_train = 0

        with tqdm(train_loader, unit="batch") as t:
            # Count batches ourselves: tqdm only syncs its `n` attribute when
            # it refreshes the display, so `t.n + 1` is not a reliable
            # denominator for the running mean loss.
            for batches_seen, batch in enumerate(t, start=1):
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                # Passing labels makes the model compute the classification
                # loss itself, so no separate criterion is needed.
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits

                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
                optimizer.step()

                total_loss += loss.item()
                predicted = logits.argmax(dim=1)
                total_train += labels.size(0)
                correct_train += (predicted == labels).sum().item()

                t.set_postfix({'loss': total_loss / batches_seen, 'accuracy': correct_train / total_train})

    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        with tqdm(validation_loader, unit="batch") as t:
            for batch in t:
                input_ids, attention_mask, labels = batch
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                predicted = outputs.logits.argmax(dim=1)

                # Labels are only collected for scoring, so they never need
                # to be moved to the model's device.
                y_true.extend(labels.cpu().numpy())
                y_pred.extend(predicted.cpu().numpy())

    # Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Return y_true, y_pred, and label encoder for the combined report
    return y_true, y_pred, label_encoder

# File paths for datasets.
# Every file lives in the same Drive folder, so the table is assembled from
# that folder plus one (language, train file, validation file) row each.
# NOTE: the folder name really does end with a space before the slash, and
# several file names carry no ".csv" extension.
_DATA_ROOT = "/content/drive/MyDrive/HYBRID MURIL/all language correct format dataset /"
_DATASET_FILES = (
    ("Tamil", "correct_tamil_dataset.csv", "correct_tamil_validation"),
    ("Malayalam", "Final_mal_train(80_)  (1).csv", "Final_mal_dev(20_) (1).csv"),
    ("Tulu", "correct_tulu_train_set", "correct_tulu_validation_set"),
    ("Kannada", "correct_kannada_train", "correct_kannada_validation"),
)

# Maps language name -> {"train": path, "validation": path}.
datasets = {
    name: {
        "train": _DATA_ROOT + train_file,
        "validation": _DATA_ROOT + validation_file,
    }
    for name, train_file, validation_file in _DATASET_FILES
}

# Train and evaluate one model per language, print its classification report
# and save the same report to "<language>_classification_report.txt" in the
# current working directory.
for language, paths in datasets.items():
    print(f"Evaluating for {language}")
    train_texts, train_labels_encoded, validation_texts, validation_labels_encoded, label_encoder = load_and_prepare_data(paths['train'], paths['validation'])
    y_true, y_pred, label_encoder = train_and_evaluate(train_texts, train_labels_encoded, validation_texts, validation_labels_encoded, label_encoder)

    # Report every class known to the encoder, including classes that do not
    # occur in the validation split, so ids and names always line up.
    target_names = label_encoder.classes_
    labels = list(range(len(target_names)))  # Ensure this matches the number of classes

    # Generate the classification report with labels and target_names
    classification_report_str = classification_report(y_true, y_pred, labels=labels, target_names=target_names)
    print(f"\nClassification Report for {language}:")
    print(classification_report_str)

    # Save the classification report to a text file. Class names come from
    # the data, so the encoding is fixed rather than left to the platform
    # default.
    with open(f"{language}_classification_report.txt", "w", encoding="utf-8") as f:
        f.write(f"Classification Report for {language}:\n")
        f.write(classification_report_str)


Evaluating for Tamil


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training


100%|██████████| 1692/1692 [06:42<00:00,  4.21batch/s, loss=0.929, accuracy=0.742]
100%|██████████| 1692/1692 [06:45<00:00,  4.17batch/s, loss=0.422, accuracy=0.908]
100%|██████████| 248/248 [00:14<00:00, 17.27batch/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9425
Precision: 0.9416
Recall: 0.9425
F1 Score: 0.9409

Classification Report for Tamil:
              precision    recall  f1-score   support

    Location       0.00      0.00      0.00         0
       Other       0.00      0.00      0.00         1
          en       0.98      0.94      0.96       496
        name       0.84      0.68      0.75       160
         sym       1.00      1.00      1.00       183
          tm       0.94      0.98      0.96      1000
        tmen       0.86      0.95      0.90       144

   micro avg       0.94      0.94      0.94      1984
   macro avg       0.66      0.65      0.65      1984
weighted avg       0.94      0.94      0.94      1984

Evaluating for Malayalam


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training


100%|██████████| 4728/4728 [18:56<00:00,  4.16batch/s, loss=0.695, accuracy=0.806]
100%|██████████| 4728/4728 [18:56<00:00,  4.16batch/s, loss=0.339, accuracy=0.918]
100%|██████████| 1179/1179 [01:08<00:00, 17.19batch/s]


Accuracy: 0.8904
Precision: 0.9007
Recall: 0.8904
F1 Score: 0.8930

Classification Report for Malayalam:
              precision    recall  f1-score   support

     ENGLISH       0.97      0.87      0.92      2230
   MALAYALAM       0.94      0.95      0.95      4371
       MIXED       0.75      0.56      0.64       375
        NAME       0.64      0.83      0.72       504
      NUMBER       0.97      1.00      0.98       203
       OTHER       0.49      0.61      0.55       641
       PLACE       0.83      0.48      0.61        63
         SYM       1.00      1.00      1.00      1042

    accuracy                           0.89      9429
   macro avg       0.82      0.79      0.80      9429
weighted avg       0.90      0.89      0.89      9429

Evaluating for Tulu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training


100%|██████████| 3690/3690 [14:46<00:00,  4.16batch/s, loss=0.846, accuracy=0.755]
100%|██████████| 3690/3690 [14:43<00:00,  4.18batch/s, loss=0.47, accuracy=0.872]
100%|██████████| 376/376 [00:21<00:00, 17.28batch/s]


Accuracy: 0.8802
Precision: 0.8836
Recall: 0.8802
F1 Score: 0.8668

Classification Report for Tulu:
              precision    recall  f1-score   support

     English       0.94      0.93      0.94       742
     Kannada       0.74      0.64      0.69       273
    Location       0.67      0.85      0.75        41
       Mixed       0.88      0.75      0.81        57
        Name       0.80      0.67      0.73       135
       Other       1.00      0.02      0.05        85
        Tulu       0.85      0.95      0.90      1251
         sym       1.00      1.00      1.00       422

    accuracy                           0.88      3006
   macro avg       0.86      0.73      0.73      3006
weighted avg       0.88      0.88      0.87      3006

Evaluating for Kannada


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training


100%|██████████| 3750/3750 [14:57<00:00,  4.18batch/s, loss=0.506, accuracy=0.854]
100%|██████████| 3750/3750 [14:57<00:00,  4.18batch/s, loss=0.172, accuracy=0.96]
100%|██████████| 311/311 [00:17<00:00, 17.32batch/s]


Accuracy: 0.9488
Precision: 0.9589
Recall: 0.9488
F1 Score: 0.9527

Classification Report for Kannada:
              precision    recall  f1-score   support

          en       0.98      0.99      0.99      1109
          kn       0.96      0.89      0.92       634
    location       1.00      0.69      0.82        13
       mixed       0.96      0.95      0.95       180
        name       0.91      0.97      0.94       158
       other       0.32      0.55      0.40        53
         sym       1.00      1.00      1.00       334

    accuracy                           0.95      2481
   macro avg       0.87      0.86      0.86      2481
weighted avg       0.96      0.95      0.95      2481

