# Import required Libraries

In [43]:
import pandas as pd
import re, os
import nltk
from nltk.corpus import stopwords

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Path Variables

In [44]:
ground_truth_csv = "../../data/Qualitative/NER.csv"

In [45]:
if os.path.exists(ground_truth_csv):
    df = pd.read_csv(ground_truth_csv)
else:
    df = parse_conll_for_lid()
    df.to_csv(ground_truth_csv, index=False)

In [46]:
df

Unnamed: 0,Text,NER,LID
0,shirt,PRODUCT,English
1,wesi,O,Hindi
2,hii,O,Hindi
3,thi,O,Hindi
4,jese,O,Hindi
...,...,...,...
982,full,O,English
983,bakwas,O,Telugu
984,time,O,English
985,waste,O,English


In [47]:
list(set(df.NER))

['B-GPE',
 'I-MISC',
 'B-TIME',
 'B-MISC',
 'MONEY',
 'B-DATE',
 'TIME',
 'B-LANGUAGE',
 'PRODUCT',
 'SENTIMENT',
 'PERSON',
 'I-SENTIMENT',
 'B-EVENT',
 'I-PRODUCT',
 'I-EVENT',
 'QUALITY',
 'B-LOC',
 'B-LAW',
 'B-PERSON',
 'I-TIME',
 'O',
 'B-FAC',
 'I-ATTRIBUTE',
 'B-JOB',
 'B-SENTIMENT',
 'B-MOVIE',
 'B-ORG',
 'ATTRIBUTE',
 'I-QUALITY',
 'B-ATTRIBUTE',
 'B-GROUP',
 'I-PERSON',
 'I-ORG',
 'I-FAC',
 'B-QUALITY',
 'B-WORK_OF_ART',
 'I-DATE',
 'B-ORDINAL',
 'ORG',
 'B-PRODUCT',
 'B-TRANSPORT']

# Ground Truth Preparation

In [48]:
if os.path.exists(ground_truth_csv):
    df = pd.read_csv(ground_truth_csv)
    df["Cleaned_NER"] = df["NER"].apply(lambda x: x[2:] if isinstance(x, str) and (x.startswith("B-") or x.startswith("I-")) else x)
    df.to_csv(ground_truth_csv, index=False)

In [49]:
list(set(df.Cleaned_NER))

['FAC',
 'JOB',
 'MONEY',
 'TIME',
 'GROUP',
 'PRODUCT',
 'SENTIMENT',
 'MOVIE',
 'PERSON',
 'LOC',
 'QUALITY',
 'O',
 'LAW',
 'ORDINAL',
 'LANGUAGE',
 'ATTRIBUTE',
 'EVENT',
 'DATE',
 'TRANSPORT',
 'WORK_OF_ART',
 'MISC',
 'ORG',
 'GPE']

# Named Entity Recognition

In [50]:
available_models = [
    ("xlmr", "xlm-roberta-base"),
    ("mdeberta", "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"),
    ("labse", "setu4993/LaBSE"),
    ("muril", "google/muril-base-cased")
]

hf_token = "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"

In [51]:
def get_label_descriptions(labels):
    """
    Generate descriptions for each NER label.
    """
    label_descriptions = {}
    
    for label in labels:
        base = label.replace("B-", "").replace("I-", "")
        description = ""

        if label == "O":
            description = "The word is not an entity and refers to a regular token."
        elif "PERSON" in base:
            description = "The word refers to a named person (e.g., John, Obama)."
        elif "ORG" in base:
            description = "The word refers to an organization (e.g., UN, Google)."
        elif "GPE" in base:
            description = "The word refers to a geopolitical entity (e.g., country, city)."
        elif "LOC" in base:
            description = "The word refers to a location (e.g., mountain, river)."
        elif "DATE" in base:
            description = "The word refers to a date or specific calendar reference."
        elif "TIME" in base:
            description = "The word refers to a specific time expression."
        elif "LAW" in base:
            description = "The word refers to a legal document or regulation."
        elif "PRODUCT" in base:
            description = "The word refers to a commercial product."
        elif "LANGUAGE" in base:
            description = "The word refers to a language name (e.g., English, Spanish)."
        elif "FAC" in base:
            description = "The word refers to a facility (e.g., building, airport)."
        elif "WORK_OF_ART" in base:
            description = "The word refers to a work of art (e.g., a book, song, movie)."
        elif "EVENT" in base:
            description = "The word refers to an event (e.g., Olympics, war)."
        elif "MONEY" in base:
            description = "The word refers to a monetary value or currency."
        elif "PERCENT" in base:
            description = "The word refers to a percentage expression."
        elif "ORDINAL" in base:
            description = "The word refers to an ordinal number (e.g., first, second)."
        elif "ATTRIBUTE" in base:
            description = "The word refers to a feature or property."
        elif "QUALITY" in base:
            description = "The word refers to a descriptive quality or trait."
        elif "SENTIMENT" in base:
            description = "The word conveys positive or negative sentiment."
        elif "GROUP" in base:
            description = "The word refers to a group of people or items."
        elif "MOVIE" in base:
            description = "The word refers to a movie or film title."
        elif "TRANSPORT" in base:
            description = "The word refers to a mode or type of transportation."
        elif "JOB" in base:
            description = "The word refers to a job title or occupation."
        else:
            description = f"The word refNER-1 - Sheet1ers to the entity category: {base}."

        label_descriptions[label] = description

    return label_descriptions


In [52]:
# Encode sentences
def encode_sentences(tokenizer, model, sentences,device):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
    # print(encoded_input)
    encoded_input = encoded_input.to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

# Zero-shot prediction
def zero_shot_predict_single(text, tokenizer, model, label_embeddings, labels, device):
    text_embedding = encode_sentences(tokenizer, model, [text], device)
    cosine_similarities = F.cosine_similarity(text_embedding.unsqueeze(1), label_embeddings.unsqueeze(0), dim=2)
    predicted_index = torch.argmax(cosine_similarities, dim=1).item()
    return labels[predicted_index]

In [53]:
#  Choose the model here (1-based index): 1 = XLM-R, 2 = mDeBERTa, 3 = LaBSE, 4 = MuRIL
for index in range(1, 5):
    df = pd.read_csv(ground_truth_csv)
    key, model_name = available_models[index - 1]

    labels_list =['DATE', 'LAW', 'TRANSPORT', 'PRODUCT', 'ORG', 'GPE', 'GROUP', 'QUALITY', 'MISC', 
              'LOC', 'ATTRIBUTE', 'LANGUAGE', 'EVENT', 'JOB', 'TIME', 'PERSON','SENTIMENT', 
              'MOVIE', 'ORDINAL', 'WORK_OF_ART', 'FAC', 'MONEY', 'O']


    descriptions = get_label_descriptions(labels_list)
    descriptions = list(descriptions.values())

    if key not in df.columns:
        device = "cuda:1" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
        model = AutoModel.from_pretrained(model_name, use_auth_token=hf_token).to(device)
        model.eval()

        label_embeddings = encode_sentences(tokenizer, model, descriptions, device)

        predictions = []
        for idx, item in enumerate(df["Text"].tolist()):
            try:
                pred = zero_shot_predict_single(item, tokenizer, model, label_embeddings, labels_list, device)
                predictions.append(pred)
            except Exception as e:
                print(f"Exception for model {model_name} at idx {idx}: {e}")
                predictions.append("O")

        df[key] = predictions
        df.to_csv(ground_truth_csv, index=False)
        print(key)



xlmr




mdeberta




labse




muril


In [54]:
df

Unnamed: 0,Text,NER,LID,Cleaned_NER,xlmr,mdeberta,labse,muril
0,shirt,PRODUCT,English,PRODUCT,O,GPE,MISC,PRODUCT
1,wesi,O,Hindi,O,O,MISC,O,EVENT
2,hii,O,Hindi,O,O,MISC,MISC,EVENT
3,thi,O,Hindi,O,O,MISC,MISC,TIME
4,jese,O,Hindi,O,O,DATE,MISC,PRODUCT
...,...,...,...,...,...,...,...,...
982,full,O,English,O,O,MISC,O,PRODUCT
983,bakwas,O,Telugu,O,O,DATE,MISC,O
984,time,O,English,O,O,MISC,TIME,TIME
985,waste,O,English,O,O,DATE,MISC,PRODUCT


# Evaluation

In [55]:
df = pd.read_csv(ground_truth_csv)

In [56]:
for index in range(1,5):
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    all_true = df["Cleaned_NER"].tolist()
    all_pred = df[key].tolist()
    report = classification_report(all_true, all_pred, digits=4)
    print(f"\nCLASSIFICATION REPORT: {key}")
    print(report)


CLASSIFICATION REPORT: xlmr
              precision    recall  f1-score   support

   ATTRIBUTE     0.0000    0.0000    0.0000        15
        DATE     0.0000    0.0000    0.0000         7
       EVENT     0.0000    0.0000    0.0000        12
         FAC     0.0000    0.0000    0.0000         2
         GPE     0.0000    0.0000    0.0000         1
       GROUP     0.0000    0.0000    0.0000         1
         JOB     0.0000    0.0000    0.0000         1
    LANGUAGE     0.0000    0.0000    0.0000         1
         LAW     0.0000    0.0000    0.0000         1
         LOC     0.0000    0.0000    0.0000         9
        MISC     0.0357    0.0667    0.0465        15
       MONEY     0.0000    0.0000    0.0000         3
       MOVIE     0.0000    0.0000    0.0000         2
           O     0.8129    0.9171    0.8619       796
     ORDINAL     0.0000    0.0000    0.0000         1
         ORG     0.0000    0.0000    0.0000        13
      PERSON     0.0000    0.0000    0.0000        5

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
