# Import required Libraries

In [1]:
import pandas as pd
import re, os
import nltk
from nltk.corpus import stopwords

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Path Variables

In [2]:
root_path = "../../data/Offensive-Language-Identification/"
test_file = os.path.join(root_path, "test.csv")
ground_truth_csv = os.path.join(root_path, "gt.csv")

# Ground Truth Preparation

In [3]:
if not os.path.exists(ground_truth_csv):
    temp_df = pd.read_csv(test_file)
    df = pd.DataFrame()
    df["sentences"] = temp_df.text
    df["labels"] = temp_df.label
    df.to_csv(ground_truth_csv, index=False)
else:
    df = pd.read_csv(ground_truth_csv)

# Basic Pre-process

In [4]:
def preprocess_tweet(text: str) -> str:
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'RT : ', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'(\n)+', ' ', text)
    text = re.sub(r'&#\d+;', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'#(\w+)', '', text)
    text = re.sub(r'&amp', 'and', text)
    text = re.sub(r'&(\w+);', '', text)
    text = re.sub(r'[^\w\s\d]+', '', text)
    text = text.lower()
    
    return text.strip()

In [5]:
if "processed_sentence" not in df.columns:
    df['processed_sentence'] = df['sentences'].apply(preprocess_tweet)
    df.to_csv(ground_truth_csv, index=False)
else:
    df = pd.read_csv(ground_truth_csv)

In [6]:
df

Unnamed: 0,sentences,labels,processed_sentence,xlmr
0,विश्लेषण | डेविन Nunes explains ‘antifa’ এবং '...,NOT,वशलषण डवन nunes explains antifa এব সমসত य दग ...,OFF
1,@USER Diversity बनाता है आप मजबूत,NOT,diversity बनत ह आप मजबत,OFF
2,... आत्म-आक्रामक उदारवादियों का एक समूह in fan...,OFF,आतमआकरमक उदरवदय क एक समह in fancy dresses mock...,OFF
3,@USER @USER को इससे उबरने की जरूरत है। काफ़ी ह...,NOT,क इसस उबरन क जररत ह कफ हद तक 5 বছর ধর তন পখদর ...,OFF
4,@USER @USER GET OUT मेरा MENTIONS (i stand cor...,NOT,get out मर mentions i stand corrected,OFF
...,...,...,...,...
19995,@USER क्या आपको पता चला সিনেটর কলিন্স কোথায় ঘ...,NOT,कय आपक पत चल সনটর কলনস কথয ঘমচছন let us know s...,OFF
19996,@USER @USER @USER @USER @USER You are so full ...,OFF,you are so full of shit a हसक अपरध नच ह और दशक...,OFF
19997,@USER ইয়ো क्यों অবশ্যই आप করতে this,NOT,ইয कय অবশযই आप করত this,OFF
19998,@USER आप হয় জ্ঞানী,NOT,आप হয জঞন,OFF


# Sentiment-Analysis

In [7]:
available_models = [
    ("xlmr", "xlm-roberta-base"),
    ("mdeberta", "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"),
    ("labse", "setu4993/LaBSE"),
    ("muril", "google/muril-base-cased")
]

In [8]:
# Encode sentences
def encode_sentences(tokenizer, model, sentences):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
    encoded_input = encoded_input.to("cuda")
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

# Zero-shot prediction
def zero_shot_predict(texts, labels, label_description,  model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model = model.to("cuda")
    batch_size = 1000
    label_embeddings = encode_sentences(tokenizer, model, label_description)
    predicted_indices = []
    for batch in range(0, int(len(texts)/batch_size)):
        batch_texts = texts[(batch*batch_size):((batch+1)*batch_size)]
        text_embeddings = encode_sentences(tokenizer, model, batch_texts)
        cosine_similarities = F.cosine_similarity(text_embeddings.unsqueeze(1), label_embeddings.unsqueeze(0), dim=2)
        predicted_indices += torch.argmax(cosine_similarities, dim=1)
    return [labels[i] for i in predicted_indices]

In [9]:
#  Choose the model here (1-based index): 1 = XLM-R, 2 = mDeBERTa, 3 = LaBSE, 4 = MuRIL
for index in range(1,5):
    df = pd.read_csv(ground_truth_csv)
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    labels_list = ["NOT", "OFF"]
    descriptions = ["not an offensive text", "an offensive text"]
    if key not in df.columns:
        df[key] = zero_shot_predict(df["sentences"].tolist(), labels_list, descriptions, model_name=model_name)
        df.to_csv(ground_truth_csv, index=False)

In [10]:
df

Unnamed: 0,sentences,labels,processed_sentence,xlmr,mdeberta,labse,muril
0,विश्लेषण | डेविन Nunes explains ‘antifa’ এবং '...,NOT,वशलषण डवन nunes explains antifa এব সমসত य दग ...,OFF,NOT,OFF,NOT
1,@USER Diversity बनाता है आप मजबूत,NOT,diversity बनत ह आप मजबत,OFF,OFF,OFF,NOT
2,... आत्म-आक्रामक उदारवादियों का एक समूह in fan...,OFF,आतमआकरमक उदरवदय क एक समह in fancy dresses mock...,OFF,NOT,OFF,NOT
3,@USER @USER को इससे उबरने की जरूरत है। काफ़ी ह...,NOT,क इसस उबरन क जररत ह कफ हद तक 5 বছর ধর তন পখদর ...,OFF,OFF,OFF,NOT
4,@USER @USER GET OUT मेरा MENTIONS (i stand cor...,NOT,get out मर mentions i stand corrected,OFF,NOT,OFF,NOT
...,...,...,...,...,...,...,...
19995,@USER क्या आपको पता चला সিনেটর কলিন্স কোথায় ঘ...,NOT,कय आपक पत चल সনটর কলনস কথয ঘমচছন let us know s...,OFF,OFF,OFF,NOT
19996,@USER @USER @USER @USER @USER You are so full ...,OFF,you are so full of shit a हसक अपरध नच ह और दशक...,OFF,NOT,OFF,NOT
19997,@USER ইয়ো क्यों অবশ্যই आप করতে this,NOT,ইয कय অবশযই आप করত this,OFF,NOT,OFF,NOT
19998,@USER आप হয় জ্ঞানী,NOT,आप হয জঞন,OFF,OFF,OFF,NOT


# Evaluation

In [11]:
df = pd.read_csv(ground_truth_csv)

In [12]:
for index in range(1,5):
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    all_true = df["labels"].tolist()
    all_pred = df[key].tolist()
    report = classification_report(all_true, all_pred, digits=4)
    print(f"\nCLASSIFICATION REPORT: {key}")
    print(report)


CLASSIFICATION REPORT: xlmr
              precision    recall  f1-score   support

         NOT     0.6732    0.0647    0.1180     13340
         OFF     0.3334    0.9371    0.4918      6660

    accuracy                         0.3552     20000
   macro avg     0.5033    0.5009    0.3049     20000
weighted avg     0.5600    0.3552    0.2425     20000


CLASSIFICATION REPORT: mdeberta
              precision    recall  f1-score   support

         NOT     0.6707    0.6694    0.6700     13340
         OFF     0.3403    0.3416    0.3410      6660

    accuracy                         0.5603     20000
   macro avg     0.5055    0.5055    0.5055     20000
weighted avg     0.5607    0.5603    0.5605     20000


CLASSIFICATION REPORT: labse
              precision    recall  f1-score   support

         NOT     0.8231    0.0160    0.0315     13340
         OFF     0.3351    0.9931    0.5011      6660

    accuracy                         0.3414     20000
   macro avg     0.5791    0.5046   