# Import required Libraries

In [1]:
import pandas as pd
import re, os
import nltk
from nltk.corpus import stopwords

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Path Variables

In [2]:
ground_truth_csv = "../../data/Qualitative/Sentiment-Analysis.csv"

In [3]:
ground_truth_csv

'../../data/Qualitative/Sentiment-Analysis.csv'

In [4]:
if os.path.exists(ground_truth_csv):
    df = pd.read_csv(ground_truth_csv)

# Basic Pre-process

In [5]:
def preprocess_tweet(text: str) -> str:
    text = re.sub(r'(\n)+', ' ', text)
    text = re.sub(r'&#\d+;', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'&(\w+);', '', text)
    text = re.sub(r'[^\w\s\d]+', '', text)
    text = text.lower()
    
    return text.strip()

In [6]:
if "processed_sentence" not in df.columns:
    df['processed_sentence'] = df['Text'].apply(preprocess_tweet)
    df.to_csv(ground_truth_csv, index=False)
else:
    df = pd.read_csv(ground_truth_csv)

# Sentiment-Analysis

In [7]:
available_models = [
    ("xlmr", "xlm-roberta-base"),
    ("mdeberta", "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"),
    ("labse", "setu4993/LaBSE"),
    ("muril", "google/muril-base-cased")
]
hf_token = "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"

In [8]:
# Encode sentences
def encode_sentences(tokenizer, model, sentences):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

# Zero-shot prediction
def zero_shot_predict(texts, labels, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModel.from_pretrained(model_name, token=hf_token)

    text_embeddings = encode_sentences(tokenizer, model, texts)
    label_embeddings = encode_sentences(tokenizer, model, labels)

    cosine_similarities = F.cosine_similarity(text_embeddings.unsqueeze(1), label_embeddings.unsqueeze(0), dim=2)
    predicted_indices = torch.argmax(cosine_similarities, dim=1)
    return [labels[i] for i in predicted_indices]

In [9]:
#  Choose the model here (1-based index): 1 = XLM-R, 2 = mDeBERTa, 3 = LaBSE, 4 = MuRIL
for index in range(1,5):
    df = pd.read_csv(ground_truth_csv)
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    if key not in df.columns:
        df[key] = zero_shot_predict(df["Text"].tolist(), ["Positive", "Negative","Neutral"], model_name=model_name)
        df.to_csv(ground_truth_csv, index=False)

In [10]:
df

Unnamed: 0,Text,Code-mix-language,Label,processed_sentence,xlmr,mdeberta,labse,muril
0,Ee Sala Cup Namde,English- Kannada,Positive,ee sala cup namde,Negative,Neutral,Neutral,Positive
1,ಏನ್ ಗುರು ಇದು..️ Get ready for History.. ನೆನಪಿರ...,English- Kannada,Positive,ಏನ ಗರ ಇದ get ready for history ನನಪರಲ it will b...,Positive,Positive,Neutral,Positive
2,Film end ಟೈಟಲ್ ಕಾರ್ಡ್ music,English- Kannada,Neutral,film end ಟಟಲ ಕರಡ music,Neutral,Neutral,Negative,Positive
3,ಹೆಮ್ಮೆ ಪಡುವ ಸಂಗತಿ ntr ಮತ್ತು ramcharan voice,English- Kannada,Positive,ಹಮಮ ಪಡವ ಸಗತ ntr ಮತತ ramcharan voice,Positive,Neutral,Neutral,Positive
4,Jr ntr and ramcharan vico ತುಂಬಾಚೆನ್ನಾಗಿದೆ,English- Kannada,Positive,jr ntr and ramcharan vico ತಬಚನನಗದ,Positive,Negative,Neutral,Positive
...,...,...,...,...,...,...,...,...
95,TELUGU సినిమా చరిత్రర్లో లో MAA PRABAS నీ ఓ గొ...,English-Telugu,Positive,telugu సనమ చరతరరల ల maa prabas న ఓ గపప నటడగ నల...,Positive,Positive,Negative,Positive
96,2020 బాహుబలి ఫ్యాన్స్ like yesukondi,English-Telugu,Neutral,2020 బహబల ఫయనస like yesukondi,Positive,Neutral,Neutral,Positive
97,ఇప్పటికీ కూడా ఈ ట్రైలర్ చూస్తుంటే రోమాలు నిక్క...,English-Telugu,Positive,ఇపపటక కడ ఈ టరలర చసతట రమల నకకడచకటయ,Positive,Positive,Neutral,Positive
98,ఒక మినీ మూవీ ఈ సినిమా....music takes to that l...,English-Telugu,Positive,ఒక మన మవ ఈ సనమmusic takes to that level,Positive,Neutral,Neutral,Positive


# Evaluation

In [11]:
for index in range(1,5):
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    all_true = df["Label"].tolist()
    all_pred = df[key].tolist()
    report = classification_report(all_true, all_pred, digits=4)
    print(f"\nCLASSIFICATION REPORT: {key}")
    print(report)


CLASSIFICATION REPORT: xlmr
              precision    recall  f1-score   support

    Negative     0.3333    0.2750    0.3014        40
     Neutral     0.2222    0.1538    0.1818        13
    Positive     0.5000    0.6170    0.5524        47

    accuracy                         0.4200       100
   macro avg     0.3519    0.3486    0.3452       100
weighted avg     0.3972    0.4200    0.4038       100


CLASSIFICATION REPORT: mdeberta
              precision    recall  f1-score   support

    Negative     0.4091    0.2250    0.2903        40
     Neutral     0.0851    0.3077    0.1333        13
    Positive     0.5806    0.3830    0.4615        47

    accuracy                         0.3100       100
   macro avg     0.3583    0.3052    0.2951       100
weighted avg     0.4476    0.3100    0.3504       100


CLASSIFICATION REPORT: labse
              precision    recall  f1-score   support

    Negative     0.4706    0.2000    0.2807        40
     Neutral     0.1053    0.6154    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
