# Import required Libraries

In [1]:
import pandas as pd
import re, os
import nltk
from nltk.corpus import stopwords

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Path Variables

In [2]:
root_path = "../../data/Sentiment-Analysis/"
test_file = os.path.join(root_path, "dev.conll")
ground_truth_csv = os.path.join(root_path, "test.csv")

# Ground Truth Preparation

In [3]:
def parse_conll_for_sentiment(file_path):
    sentences, current_sentence_tokens, current_label = [], [], None
    
    with open(file_path, "r+", encoding="utf-8") as read_file:
        text = read_file.read()
        lines = text.split("\n")
    lines = [line for line in lines if len(line.strip())>1]
    sentence_words, sentences, labels = [], [], []
    for idx, line in enumerate(lines):
        if "sent_enum" in line:
            labels.append(line.split()[-1])
            if idx != 0:
                sentences.append(" ".join(sentence_words))
                sentence_words = []
        else:
            sentence_words.append(line.split()[0])

        if idx == len(lines)-1:
            sentences.append(" ".join(sentence_words))

    temp_df = pd.DataFrame.from_dict(
        {
            "sentences": sentences,
            "labels": labels,
        }
    )
    return temp_df

In [4]:
if not os.path.exists(ground_truth_csv):
    df = parse_conll_for_sentiment(test_file)
    df.to_csv(ground_truth_csv, index=False)
else:
    df = pd.read_csv(ground_truth_csv)

In [5]:
df

Unnamed: 0,sentences,labels,processed_sentence
0,@brissamayen @sanluispotoyees estopp I blashhh...,positive,estopp i blashhh lol jk but aww thanks haha x
1,Qué mejor que pasar Valentine 's thirdwheelean...,positive,qué mejor que pasar valentine s thirdwheeleand...
2,#FF y de super #fashionfabolous a mi colega @b...,positive,y de super a mi colega el duo stylist de y mas
3,“ @AZUCENACIERCO : Look de hoy gracias a @Angi...,positive,look de hoy gracias a me encanto flaca de ...
4,RT @andyescalona : #brindando #celebrando #tod...,positive,rt cc
...,...,...,...
1854,Que struggle esto del traje del prom,positive,que struggle esto del traje del prom
1855,@Waltmanzc hahahaha gracias Walteeer !! ! 😄,positive,hahahaha gracias walteeer
1856,#musicvideo #republica #notengodinero @Maffio ...,positive,outfit 1
1857,Comiendo en @carrabbas's!! Very good !,positive,comiendo en s very good


# Basic Pre-process

In [6]:
def preprocess_tweet(text: str) -> str:
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'RT : ', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'(\n)+', ' ', text)
    text = re.sub(r'&#\d+;', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'#(\w+)', '', text)
    text = re.sub(r'&amp', 'and', text)
    text = re.sub(r'&(\w+);', '', text)
    text = re.sub(r'[^\w\s\d]+', '', text)
    text = text.lower()
    
    return text.strip()

In [7]:
if "processed_sentence" not in df.columns:
    df['processed_sentence'] = df['sentences'].apply(preprocess_tweet)
    df.to_csv(ground_truth_csv, index=False)
else:
    df = pd.read_csv(ground_truth_csv)

In [8]:
df

Unnamed: 0,sentences,labels,processed_sentence
0,@brissamayen @sanluispotoyees estopp I blashhh...,positive,estopp i blashhh lol jk but aww thanks haha x
1,Qué mejor que pasar Valentine 's thirdwheelean...,positive,qué mejor que pasar valentine s thirdwheeleand...
2,#FF y de super #fashionfabolous a mi colega @b...,positive,y de super a mi colega el duo stylist de y mas
3,“ @AZUCENACIERCO : Look de hoy gracias a @Angi...,positive,look de hoy gracias a me encanto flaca de ...
4,RT @andyescalona : #brindando #celebrando #tod...,positive,rt cc
...,...,...,...
1854,Que struggle esto del traje del prom,positive,que struggle esto del traje del prom
1855,@Waltmanzc hahahaha gracias Walteeer !! ! 😄,positive,hahahaha gracias walteeer
1856,#musicvideo #republica #notengodinero @Maffio ...,positive,outfit 1
1857,Comiendo en @carrabbas's!! Very good !,positive,comiendo en s very good


# Sentiment-Analysis

In [9]:
available_models = [
    ("xlmr", "xlm-roberta-base"),
    ("mdeberta", "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"),
    ("labse", "setu4993/LaBSE"),
    ("muril", "google/muril-base-cased")
]
hf_token = "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"

In [10]:
# Encode sentences
def encode_sentences(tokenizer, model, sentences):
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

# Zero-shot prediction
def zero_shot_predict(texts, labels, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModel.from_pretrained(model_name, token=hf_token)

    text_embeddings = encode_sentences(tokenizer, model, texts)
    label_embeddings = encode_sentences(tokenizer, model, labels)

    cosine_similarities = F.cosine_similarity(text_embeddings.unsqueeze(1), label_embeddings.unsqueeze(0), dim=2)
    predicted_indices = torch.argmax(cosine_similarities, dim=1)
    return [labels[i] for i in predicted_indices]

In [11]:
#  Choose the model here (1-based index): 1 = XLM-R, 2 = mDeBERTa, 3 = LaBSE, 4 = MuRIL
for index in range(1,5):
    df = pd.read_csv(ground_truth_csv)
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    if key not in df.columns:
        df[key] = zero_shot_predict(df["sentences"].tolist(), ["positive", "negative","neutral"], model_name=model_name)
        df.to_csv(ground_truth_csv, index=False)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

In [12]:
df

Unnamed: 0,sentences,labels,processed_sentence,xlmr,mdeberta,labse,muril
0,@brissamayen @sanluispotoyees estopp I blashhh...,positive,estopp i blashhh lol jk but aww thanks haha x,negative,neutral,positive,positive
1,Qué mejor que pasar Valentine 's thirdwheelean...,positive,qué mejor que pasar valentine s thirdwheeleand...,positive,neutral,neutral,positive
2,#FF y de super #fashionfabolous a mi colega @b...,positive,y de super a mi colega el duo stylist de y mas,negative,neutral,neutral,positive
3,“ @AZUCENACIERCO : Look de hoy gracias a @Angi...,positive,look de hoy gracias a me encanto flaca de ...,negative,positive,neutral,positive
4,RT @andyescalona : #brindando #celebrando #tod...,positive,rt cc,negative,neutral,neutral,positive
...,...,...,...,...,...,...,...
1854,Que struggle esto del traje del prom,positive,que struggle esto del traje del prom,negative,positive,neutral,positive
1855,@Waltmanzc hahahaha gracias Walteeer !! ! 😄,positive,hahahaha gracias walteeer,positive,neutral,positive,positive
1856,#musicvideo #republica #notengodinero @Maffio ...,positive,outfit 1,negative,neutral,neutral,positive
1857,Comiendo en @carrabbas's!! Very good !,positive,comiendo en s very good,negative,positive,positive,positive


# Evaluation

In [13]:
df = pd.read_csv(ground_truth_csv)

In [14]:
for index in range(1,5):
    choose_model = index
    key, model_name = available_models[choose_model - 1]
    all_true = df["labels"].tolist()
    all_pred = df[key].tolist()
    report = classification_report(all_true, all_pred, digits=4)
    print(f"\nCLASSIFICATION REPORT: {key}")
    print(report)


CLASSIFICATION REPORT: xlmr
              precision    recall  f1-score   support

    negative     0.1509    0.5344    0.2354       305
     neutral     0.0000    0.0000    0.0000       517
    positive     0.5244    0.3934    0.4496      1037

    accuracy                         0.3072      1859
   macro avg     0.2251    0.3093    0.2283      1859
weighted avg     0.3173    0.3072    0.2894      1859


CLASSIFICATION REPORT: mdeberta
              precision    recall  f1-score   support

    negative     0.1282    0.0328    0.0522       305
     neutral     0.2905    0.7041    0.4113       517
    positive     0.5019    0.2555    0.3387      1037

    accuracy                         0.3437      1859
   macro avg     0.3069    0.3308    0.2674      1859
weighted avg     0.3818    0.3437    0.3119      1859


CLASSIFICATION REPORT: labse
              precision    recall  f1-score   support

    negative     0.2493    0.2754    0.2617       305
     neutral     0.2825    0.7776    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
