# Import required Libraries

In [28]:
import pandas as pd
import re, os
import nltk
from nltk.corpus import stopwords

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Path Variables

In [29]:
ground_truth_csv = "../../data/Qualitative/Hate Speech.csv"

In [30]:
ground_truth_csv

'../../data/Qualitative/Hate Speech.csv'

In [31]:
if os.path.exists(ground_truth_csv):
    df = pd.read_csv(ground_truth_csv)

# Basic Pre-process

In [32]:
def preprocess_tweet(text: str) -> str:
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'RT : ', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'(\n)+', ' ', text)
    text = re.sub(r'&#\d+;', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'#(\w+)', '', text)
    text = re.sub(r'&amp', 'and', text)
    text = re.sub(r'&(\w+);', '', text)
    text = re.sub(r'[^\w\s\d]+', '', text)
    text = text.lower()
    
    return text.strip()

In [33]:
if "processed_sentence" not in df.columns:
    df['processed_sentence'] = df['Text'].apply(preprocess_tweet)
    df.to_csv(ground_truth_csv, index=False)
else:
    df = pd.read_csv(ground_truth_csv)

In [34]:
df

Unnamed: 0,Text,Code-mix-language,Label,processed_sentence
0,Ada paavingala kgf oda vachathukaga engaluku y...,English-Tamil,Hate Speech,ada paavingala kgf oda vachathukaga engaluku y...
1,dei kelthu kuthi ajith ivanya yaru nadika sona,English-Tamil,Hate Speech,dei kelthu kuthi ajith ivanya yaru nadika sona
2,Trailer mokka da ajith kelatu payale,English-Tamil,Hate Speech,trailer mokka da ajith kelatu payale
3,Rajini poi thokula thoku itha Veda yaru Una ke...,English-Tamil,Hate Speech,rajini poi thokula thoku itha veda yaru una ke...
4,கடைசில rajini ய வச்சு செஞ்சுடிங்களே டா,English-Tamil,Hate Speech,கடசல rajini ய வசச சஞசடஙகள ட
...,...,...,...,...
95,nuvvv atteee nakuuu chalaa pichiii prabhas I m...,English-Telugu,Non-Hate Speech,nuvvv atteee nakuuu chalaa pichiii prabhas i m...
96,TELUGU సినిమా చరిత్రర్లో లో MAA PRABAS నీ ఓ గొ...,English-Telugu,Non-Hate Speech,telugu సనమ చరతరరల ల maa prabas న ఓ గపప నటడగ నల...
97,ఇప్పటికీ కూడా ఈ ట్రైలర్ చూస్తుంటే రోమాలు నిక్క...,English-Telugu,Non-Hate Speech,ఇపపటక కడ ఈ టరలర చసతట రమల నకకడచకటయ
98,ఒక మినీ మూవీ ఈ సినిమా....music takes to that l...,English-Telugu,Non-Hate Speech,ఒక మన మవ ఈ సనమmusic takes to that level


# Offensive Language Identification

In [35]:
available_models = [
    ("xlmr", "xlm-roberta-base"),
    ("mdeberta", "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"),
    ("labse", "setu4993/LaBSE"),
    ("muril", "google/muril-base-cased")
]

In [36]:
# Encode sentences
def encode_sentences(tokenizer, model, sentences,device):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
    encoded_input = encoded_input.to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

# Zero-shot prediction
def zero_shot_predict(texts, labels, label_description,  model_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    batch_size = 1000
    label_embeddings = encode_sentences(tokenizer, model, label_description,device)
    predicted_indices = []
    for start in range(0, len(texts), batch_size):
        end = start + batch_size
        batch_texts = texts[start:end]
        text_embeddings = encode_sentences(tokenizer, model, batch_texts, device)
        cosine_similarities = F.cosine_similarity(text_embeddings.unsqueeze(1), label_embeddings.unsqueeze(0), dim=2)
        predicted_indices += torch.argmax(cosine_similarities, dim=1).tolist()
    return [labels[i] for i in predicted_indices]

In [37]:
#  Choose the model here (1-based index): 1 = XLM-R, 2 = mDeBERTa, 3 = LaBSE, 4 = MuRIL
for index in range(1,5):
    df = pd.read_csv(ground_truth_csv)
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    labels_list = ["Non-Hate Speech", "Hate Speech"]
    descriptions = ["not an offensive text", "an offensive text"]
    if key not in df.columns:
        df[key] = zero_shot_predict(df["Text"].tolist(), labels_list, descriptions, model_name=model_name)
        df.to_csv(ground_truth_csv, index=False)



In [38]:
df

Unnamed: 0,Text,Code-mix-language,Label,processed_sentence,xlmr,mdeberta,labse,muril
0,Ada paavingala kgf oda vachathukaga engaluku y...,English-Tamil,Hate Speech,ada paavingala kgf oda vachathukaga engaluku y...,Non-Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
1,dei kelthu kuthi ajith ivanya yaru nadika sona,English-Tamil,Hate Speech,dei kelthu kuthi ajith ivanya yaru nadika sona,Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
2,Trailer mokka da ajith kelatu payale,English-Tamil,Hate Speech,trailer mokka da ajith kelatu payale,Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
3,Rajini poi thokula thoku itha Veda yaru Una ke...,English-Tamil,Hate Speech,rajini poi thokula thoku itha veda yaru una ke...,Non-Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
4,கடைசில rajini ய வச்சு செஞ்சுடிங்களே டா,English-Tamil,Hate Speech,கடசல rajini ய வசச சஞசடஙகள ட,Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
...,...,...,...,...,...,...,...,...
95,nuvvv atteee nakuuu chalaa pichiii prabhas I m...,English-Telugu,Non-Hate Speech,nuvvv atteee nakuuu chalaa pichiii prabhas i m...,Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech
96,TELUGU సినిమా చరిత్రర్లో లో MAA PRABAS నీ ఓ గొ...,English-Telugu,Non-Hate Speech,telugu సనమ చరతరరల ల maa prabas న ఓ గపప నటడగ నల...,Hate Speech,Hate Speech,Hate Speech,Non-Hate Speech
97,ఇప్పటికీ కూడా ఈ ట్రైలర్ చూస్తుంటే రోమాలు నిక్క...,English-Telugu,Non-Hate Speech,ఇపపటక కడ ఈ టరలర చసతట రమల నకకడచకటయ,Hate Speech,Hate Speech,Hate Speech,Non-Hate Speech
98,ఒక మినీ మూవీ ఈ సినిమా....music takes to that l...,English-Telugu,Non-Hate Speech,ఒక మన మవ ఈ సనమmusic takes to that level,Hate Speech,Non-Hate Speech,Hate Speech,Non-Hate Speech


# Evaluation

In [39]:
for index in range(1,5):
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    all_true = df["Label"].tolist()
    all_pred = df[key].tolist()
    report = classification_report(all_true, all_pred, digits=4)
    print(f"\nCLASSIFICATION REPORT: {key}")
    print(report)


CLASSIFICATION REPORT: xlmr
                 precision    recall  f1-score   support

    Hate Speech     0.3103    0.3600    0.3333        50
Non-Hate Speech     0.2381    0.2000    0.2174        50

       accuracy                         0.2800       100
      macro avg     0.2742    0.2800    0.2754       100
   weighted avg     0.2742    0.2800    0.2754       100


CLASSIFICATION REPORT: mdeberta
                 precision    recall  f1-score   support

    Hate Speech     0.2500    0.0600    0.0968        50
Non-Hate Speech     0.4659    0.8200    0.5942        50

       accuracy                         0.4400       100
      macro avg     0.3580    0.4400    0.3455       100
   weighted avg     0.3580    0.4400    0.3455       100


CLASSIFICATION REPORT: labse
                 precision    recall  f1-score   support

    Hate Speech     0.4949    0.9800    0.6577        50
Non-Hate Speech     0.0000    0.0000    0.0000        50

       accuracy                         0.490

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
