# Import required Libraries

In [1]:
import pandas as pd
import re, os
import nltk
from nltk.corpus import stopwords

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Path Variables

In [2]:
root_path = "../../data/Named-Entity-Recognition/"
language_pairs = {
    "ner_hineng": "Hindi-English",
    "ner_msaea": "Modern Standard Arabic - EgyptArabic",
    "ner_spaeng": "Spanish-English"
}

ground_truth_csv = os.path.join(root_path, "gt.csv")

# Ground Truth Preparation

In [3]:
for key in language_pairs:
    test_file = os.path.join(root_path, key, "dev.conll")
    with open(test_file, "r+") as read_file:
        text = read_file.read()
        lines = text.split("\n")

In [4]:
def preprocess_tweet(text: str) -> str:
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'^RT\s*:\s*', '', text)
    text = re.sub(r'&\w+;', ' ', text)
    text = re.sub(r'&#\d+;', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\u0600-\u06FF]', '', text)
    text = text.strip()
    return text

def parse_conll_for_lid():
    words = []
    labels = []
    for key in language_pairs:
        test_file = os.path.join(root_path, key, "dev.conll")

        with open(test_file, "r+") as read_file:
            text = read_file.read()
            lines = text.split("\n")
            
        lines = [line for line in lines if len(line.strip())>1]
        
        for idx, line in enumerate(lines):
            if "sent_enum" in line:
                pass
            else:
                items = line.split("\t")
                word = items[0].strip()
                # word = preprocess_tweet(word)
                label = items[-1]
                label = label.replace("B-", "")
                label = label.replace("I-", "")
                if label == "ORG":
                    label = "ORGANISATION"
                if label == "PER":
                    label = "PERSON"
                if label == "OTHER":
                    label = "O"
                words.append(word)
                labels.append(label)
    temp_df = pd.DataFrame.from_dict(
        {
            "words": words,
            "labels": labels,
        }
    )
    return temp_df            

In [5]:
if os.path.exists(ground_truth_csv):
    df = pd.read_csv(ground_truth_csv)
else:
    df = parse_conll_for_lid()
    df.to_csv(ground_truth_csv, index=False)

In [6]:
df

Unnamed: 0,words,labels
0,stupid,O
1,move,O
2,",",O
3,considering,O
4,their,O
...,...,...
150757,un,O
150758,trabajo,O
150759,de,O
150760,verdad,O


In [7]:
list(set(df.labels))

['ORGANISATION',
 'PERSON',
 'PROD',
 'O',
 'EVENT',
 'TITLE',
 'TIME',
 'LOC',
 'GROUP',
 'PLACE']

# Named Entity Recognition

In [8]:
available_models = [
    ("xlmr", "xlm-roberta-base"),
    ("mdeberta", "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"),
    ("labse", "setu4993/LaBSE"),
    ("muril", "google/muril-base-cased")
]

hf_token = "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"

In [9]:
# Encode sentences
def encode_sentences(tokenizer, model, sentences, device):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
    # print(encoded_input)
    encoded_input = encoded_input.to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

# Zero-shot prediction
def zero_shot_predict_single(text, tokenizer, model, label_embeddings, labels, device):
    text_embedding = encode_sentences(tokenizer, model, [text], device)
    cosine_similarities = F.cosine_similarity(text_embedding.unsqueeze(1), label_embeddings.unsqueeze(0), dim=2)
    predicted_index = torch.argmax(cosine_similarities, dim=1).item()
    return labels[predicted_index]

In [10]:
#  Choose the model here (1-based index): 1 = XLM-R, 2 = mDeBERTa, 3 = LaBSE, 4 = MuRIL
for index in range(1,5):
    df = pd.read_csv(ground_truth_csv)
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    labels_list = ['PERSON', 'PLACE', 'O', 'GROUP', 'ORGANISATION', 'LOC', 'PROD', 'TITLE', 'TIME', 'EVENT']
    descriptions = [
        'The word is an entity and refers a person',
        'The word is an entity and refers a place',
        'The word is not an entity and refers standard word',
        'The word is not an entity and refers a group',
        'The word is not an entity and refers a organisation',
        'The word is not an entity and refers a location',
        'The word is not an entity and refers a product',
        'The word is not an entity and refers a title',
        'The word is not an entity and refers a time',
        'The word is not an entity and refers an event'
    ]
    if key not in df.columns:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
        model = AutoModel.from_pretrained(model_name, use_auth_token=hf_token).to(device)
        model.eval()

        label_embeddings = encode_sentences(tokenizer, model, descriptions, device)

        predictions = []
        for idx, item in enumerate(df["words"].tolist()):
            try:
                pred = zero_shot_predict_single(item, tokenizer, model, label_embeddings, labels_list, device)
                predictions.append(pred)
            except Exception as e:
                print(f"Exception for model {model_name} at idx {idx}: {e}")
                predictions.append("O")

        df[key] = predictions
        df.to_csv(ground_truth_csv, index=False)
        print(key)



xlmr




mdeberta




labse




muril


# Evaluation

In [11]:
df = pd.read_csv(ground_truth_csv)

In [12]:
for index in range(1,5):
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    all_true = df["labels"].tolist()
    all_pred = df[key].tolist()
    report = classification_report(all_true, all_pred, digits=4)
    print(f"\nCLASSIFICATION REPORT: {key}")
    print(report)


CLASSIFICATION REPORT: xlmr
              precision    recall  f1-score   support

       EVENT     0.0007    0.0089    0.0014       224
       GROUP     0.0000    0.0000    0.0000       474
         LOC     0.0192    0.0008    0.0015      1241
           O     0.9091    0.0001    0.0001    144120
ORGANISATION     0.0024    0.1293    0.0046       588
      PERSON     0.0281    0.0181    0.0220      2702
       PLACE     0.0009    0.7519    0.0018       129
        PROD     0.0061    0.0467    0.0109       428
        TIME     0.0000    0.0000    0.0000       225
       TITLE     0.0026    0.0111    0.0042       631

    accuracy                         0.0017    150762
   macro avg     0.0969    0.0967    0.0047    150762
weighted avg     0.8697    0.0017    0.0006    150762



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



CLASSIFICATION REPORT: mdeberta
              precision    recall  f1-score   support

       EVENT     0.0007    0.0938    0.0014       224
       GROUP     0.0029    0.1899    0.0057       474
         LOC     0.0000    0.0000    0.0000      1241
           O     0.0000    0.0000    0.0000    144120
ORGANISATION     0.0000    0.0000    0.0000       588
      PERSON     0.0229    0.2002    0.0411      2702
       PLACE     0.0008    0.4264    0.0017       129
        PROD     0.0000    0.0000    0.0000       428
        TIME     0.0000    0.0000    0.0000       225
       TITLE     0.0000    0.0000    0.0000       631

    accuracy                         0.0047    150762
   macro avg     0.0027    0.0910    0.0050    150762
weighted avg     0.0004    0.0047    0.0008    150762


CLASSIFICATION REPORT: labse
              precision    recall  f1-score   support

       EVENT     0.0033    0.0670    0.0062       224
       GROUP     0.0143    0.2764    0.0272       474
         LOC   