# Import required Libraries

In [1]:
import pandas as pd
import re, os
import nltk
from nltk.corpus import stopwords

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Path Variables

In [2]:
root_path = "../../data/Language-Identification/"
language_pairs = {
    "lid_hineng": "Hindi-English",
    "lid_msaea": "Modern Standard Arabic - EgyptArabic",
    "lid_nepeng": "Nepali-English",
    "lid_spaeng": "Spanish-English"
}

ground_truth_csv = os.path.join(root_path, "gt.csv")

# Ground Truth Preparation

In [3]:
def preprocess_tweet(text: str) -> str:
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'^RT\s*:\s*', '', text)
    text = re.sub(r'&\w+;', ' ', text)
    text = re.sub(r'&#\d+;', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\u0600-\u06FF]', '', text)
    text = text.strip()
    return text

def parse_conll_for_lid():
    words = []
    labels = []
    for key in language_pairs:
        test_file = os.path.join(root_path, key, "dev.conll")
        lang1 = "English"
        if key == "lid_hineng":
            lang2 = "Hindi"
        elif key == "lid_msaea":
            lang1 = "Modern Standard Arabic"
            lang2 = "Egypt Arabic"
        elif key == "lid_nepeng":
            lang2 = "Nepali"
        elif key == "lid_spaeng":
            lang2 = "Nepali"
        else:
            raise "Unsupport language pairs"
    
        with open(test_file, "r+") as read_file:
            text = read_file.read()
            text = text.replace("lang1", lang1)
            text = text.replace("lang2", lang2)
            lines = text.split("\n")
            
        lines = [line for line in lines if len(line.strip())>1]
        
        for idx, line in enumerate(lines):
            if "sent_enum" in line:
                pass
            else:
                items = line.split()
                word = items[0].strip()
                # word = preprocess_tweet(word)
                label = " ".join(items[1:]).strip()
                # if len(word) >= 1:
                words.append(word)
                labels.append(label)
    temp_df = pd.DataFrame.from_dict(
        {
            "words": words,
            "labels": labels,
        }
    )
    return temp_df            

In [4]:
if os.path.exists(ground_truth_csv):
    df = pd.read_csv(ground_truth_csv)
else:
    df = parse_conll_for_lid()
    df.to_csv(ground_truth_csv, index=False)

In [5]:
df

Unnamed: 0,words,labels
0,@ZahirJ,other
1,@BinyavangaW,other
2,Loved,English
3,the,English
4,ending,English
...,...,...
97083,OLEEE,Nepali
97084,!!!,other
97085,ABOTABOTABOTABOOOOOO,Nepali
97086,!!!!!!,other


In [6]:
list(set(df.labels))

['ne',
 'English',
 'mixed',
 'fw',
 'ambiguous',
 'Egypt Arabic',
 'unk',
 'Modern Standard Arabic',
 'Nepali',
 'Hindi',
 'other']

# Language Identification

In [7]:
available_models = [
    ("xlmr", "xlm-roberta-base"),
    ("mdeberta", "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"),
    ("labse", "setu4993/LaBSE"),
    ("muril", "google/muril-base-cased")
]

hf_token = "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"

In [8]:
# Encode sentences
def encode_sentences(tokenizer, model, sentences, device):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
    # print(encoded_input)
    encoded_input = encoded_input.to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

# Zero-shot prediction
def zero_shot_predict_single(text, tokenizer, model, label_embeddings, labels, device):
    text_embedding = encode_sentences(tokenizer, model, [text], device)
    cosine_similarities = F.cosine_similarity(text_embedding.unsqueeze(1), label_embeddings.unsqueeze(0), dim=2)
    predicted_index = torch.argmax(cosine_similarities, dim=1).item()
    return labels[predicted_index]

In [9]:
#  Choose the model here (1-based index): 1 = XLM-R, 2 = mDeBERTa, 3 = LaBSE, 4 = MuRIL
for index in range(1,5):
    df = pd.read_csv(ground_truth_csv)
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    labels_list = ['Modern Standard Arabic', 'Hindi', 'Egypt Arabic', 'English', 'Nepali', 'ne',  'other', 'unk', 'ambiguous', 'mixed', 'fw']
    descriptions = [
        "This word is in Modern Standard Arabic",
        "This word is in Hindi",
        "This word is in Egypt Arabic",
        "This word is in English",
        "This word is in Nepali",
        "This word is a named entity such as a person, hashtag, or organization.",
        "This token is a special token like a mention or punctuation.",
        "Unknown – the language of the token could not be determined.",
        "it's unclear which language the token belongs to.",
        "the token contains parts from two languages",
        "this token is a word borrowed from a other language."
    ]
    if key not in df.columns:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
        model = AutoModel.from_pretrained(model_name, use_auth_token=hf_token).to(device)
        model.eval()

        label_embeddings = encode_sentences(tokenizer, model, descriptions, device)

        predictions = []
        for idx, item in enumerate(df["words"].tolist()):
            try:
                pred = zero_shot_predict_single(item, tokenizer, model, label_embeddings, labels_list, device)
                predictions.append(pred)
            except Exception as e:
                print(f"Exception for model {model_name} at idx {idx}: {e}")
                predictions.append("other")

        df[key] = predictions
        df.to_csv(ground_truth_csv, index=False)



Exception for model xlm-roberta-base at idx 76474: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).




Exception for model MoritzLaurer/mDeBERTa-v3-base-mnli-xnli at idx 76474: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).




Exception for model setu4993/LaBSE at idx 76474: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).




Exception for model google/muril-base-cased at idx 76474: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).


In [10]:
predictions

['English',
 'ne',
 'English',
 'mixed',
 'mixed',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'fw',
 'English',
 'English',
 'English',
 'mixed',
 'English',
 'English',
 'English',
 'mixed',
 'English',
 'Nepali',
 'English',
 'English',
 'English',
 'English',
 'mixed',
 'English',
 'English',
 'English',
 'fw',
 'English',
 'Hindi',
 'English',
 'English',
 'English',
 'mixed',
 'English',
 'Hindi',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'Nepali',
 'English',
 'Hindi',
 'English',
 'fw',
 'English',
 'English',
 'English',
 'mixed',
 'English',
 'English',
 'English',
 'mixed',
 'English',
 'English',
 'mixed',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 '

In [11]:
print("hello")

hello


# Evaluation

In [12]:
df = pd.read_csv(ground_truth_csv)

In [13]:
for index in range(1,5):
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    all_true = df["labels"].tolist()
    all_pred = df[key].tolist()
    report = classification_report(all_true, all_pred, digits=4)
    print(f"\nCLASSIFICATION REPORT: {key}")
    print(report)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



CLASSIFICATION REPORT: xlmr
                        precision    recall  f1-score   support

          Egypt Arabic     0.0401    0.4168    0.0731      4100
               English     0.4421    0.0195    0.0373     31358
                 Hindi     0.0785    0.0178    0.0291      3306
Modern Standard Arabic     0.0000    0.0000    0.0000     13317
                Nepali     0.2905    0.3257    0.3071     23372
             ambiguous     0.0006    0.0138    0.0011       217
                    fw     0.0000    0.0000    0.0000        31
                 mixed     0.0006    0.2667    0.0012        30
                    ne     0.0000    0.0000    0.0000      4892
                 other     0.0000    0.0000    0.0000     16431
                   unk     0.0000    0.0000    0.0000        34

              accuracy                         0.1030     97088
             macro avg     0.0775    0.0964    0.0408     97088
          weighted avg     0.2171    0.1030    0.0901     97088



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



CLASSIFICATION REPORT: mdeberta
                        precision    recall  f1-score   support

          Egypt Arabic     0.0579    0.0083    0.0145      4100
               English     0.2313    0.4925    0.3148     31358
                 Hindi     0.0000    0.0000    0.0000      3306
Modern Standard Arabic     0.0522    0.0010    0.0019     13317
                Nepali     0.0104    0.0000    0.0001     23372
             ambiguous     0.0007    0.0323    0.0014       217
                    fw     0.0000    0.0000    0.0000        31
                 mixed     0.0000    0.0000    0.0000        30
                    ne     0.0356    0.0315    0.0334      4892
                 other     0.0103    0.0029    0.0045     16431
                   unk     0.0000    0.0000    0.0000        34

              accuracy                         0.1617     97088
             macro avg     0.0362    0.0517    0.0337     97088
          weighted avg     0.0904    0.1617    0.1050     97088


CLA