In [95]:
text = "happy, good, nice, useful, beautiful, stunning, great"

In [96]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
sentiment_scores = outputs.logits.softmax(dim=1).tolist()[0]
sentiment_scores

[0.00011391211592126638, 0.9998860359191895]

In [97]:
bi_lstm_model = TextClassifier.load("en-sentiment")

sentence = Sentence(text)
bi_lstm_model.predict(sentence)
sentiment_scores = sentence.labels[0].score
sentiment_scores

0.9926915764808655

In [98]:
blob = TextBlob(text)
sentiment_scores = blob.sentiment.polarity 
sentiment_scores

0.65

In [99]:
sid = SentimentIntensityAnalyzer()
sentiment_scores = sid.polarity_scores(text)
sentiment_scores

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.9716}

# Sentiment Analysis

In [147]:
import os
import pickle
import pandas as pd
from tqdm import tqdm
from constants import languages
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import BertForSequenceClassification, BertTokenizer
from flair.models import TextClassifier
from flair.data import Sentence
from textblob import TextBlob

from sharechat_scraper.constants import languages
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/shashankgsharma/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [138]:
distilbert_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
bi_lstm_model = TextClassifier.load("en-sentiment")

In [139]:
for lang in languages:
    print()
    print(f"Getting sentiments for language: {lang}")
    print()

    with open(f"translations/translations_{lang}.pickle", "rb") as file:
        translations = pickle.load(file)

    new_translations = {}
    actual_labels_df = pd.DataFrame(columns=["file", "text", "label"])
    file_locs = []
    texts = []

    for folder, files in translations.items():
        new_translations[folder] = {}
        print(f"Analyzing Folder: {folder}")

        for filename, sentence in tqdm(files.items(), desc="Analyzing Sentiment"):
            if sentence["translation"] is not None:
                text = sentence["translation"]
                if text == '':
                    continue

                # VADER
                sid = SentimentIntensityAnalyzer()
                vader_scores = sid.polarity_scores(text)

                # Hugging Face Transformers
                inputs = distilbert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
                try:
                    outputs = distilbert_model(**inputs)
                except:
                    continue
                distilbert_scores = outputs.logits.softmax(dim=1).tolist()[0]

                # Flair Bi-LSTM
                flair_sentence = Sentence(text)
                try:
                    bi_lstm_model.predict(flair_sentence)
                except:
                    continue
                bi_lstm_score = flair_sentence.labels[0].score  # Positive score

                # TextBlob
                blob = TextBlob(text)
                textblob_score = blob.sentiment.polarity  # Between -1 and 1

                sentiment_scores = {
                    "vader": vader_scores,
                    "distilbert": distilbert_scores,
                    "bi_lstm": bi_lstm_score,
                    "textblob": textblob_score
                }

                new_translations[folder][filename] = {}
                new_translations[folder][filename]["scores"] = sentiment_scores

                # print(f"{filename}: {sentiment_scores}")

    with open(f"sentiments/updated_sentiments_{lang}.pickle", "wb") as file:
        pickle.dump(new_translations, file)


Getting sentiments for language: Hindi

Analyzing Folder: sharechat_scraper/updated_tesseract/Hindi/5ZEBpp_Hindi/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85/85 [00:03<00:00, 25.81it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Hindi/lew5Am_Hindi/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 81/81 [00:03<00:00, 24.61it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Hindi/m6d09W_Hindi/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:03<00:00,  9.81it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Hindi/VO6Zjy_Hindi/


Analyzing Sentiment: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 410/410 [00:08<00:00, 50.08it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Hindi/VO6ZVy_Hindi/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [00:01<00:00, 36.29it/s]



Getting sentiments for language: Bengali

Analyzing Folder: sharechat_scraper/updated_tesseract/Bengali/5ZEBpp_Bengali/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52/52 [00:02<00:00, 18.08it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Bengali/lew5Am_Bengali/


Analyzing Sentiment: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 130/130 [00:04<00:00, 28.88it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Bengali/m6d09W_Bengali/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 94/94 [00:06<00:00, 14.79it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Bengali/VO6Zjy_Bengali/


Analyzing Sentiment: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 232/232 [00:04<00:00, 50.34it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Bengali/VO6ZVy_Bengali/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [00:02<00:00, 26.19it/s]



Getting sentiments for language: Punjabi

Analyzing Folder: sharechat_scraper/updated_tesseract/Punjabi/5ZEBpp_Punjabi/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52/52 [00:02<00:00, 20.35it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Punjabi/lew5Am_Punjabi/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:02<00:00, 22.45it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Punjabi/m6d09W_Punjabi/


Analyzing Sentiment: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 139/139 [00:13<00:00, 10.26it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Punjabi/VO6Zjy_Punjabi/


Analyzing Sentiment: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 232/232 [00:03<00:00, 65.31it/s]


Analyzing Folder: sharechat_scraper/updated_tesseract/Punjabi/VO6ZVy_Punjabi/


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [00:01<00:00, 39.56it/s]


# Sentiment Labelling

In [140]:
import pandas as pd

import os
import pickle
import json
from tqdm import tqdm
from math import pow
from constants import languages


leader_mappings = {"5ZEBpp": "Arvind_Kejrival", "lew5Am": "Rahul_Gandhi", "m6d09W": "Narendra_Modi", "VO6Zjy": "Akhilesh_Yadav", "VO6ZVy": "Asaduddin_Owaisi"}


def normalize_list(values):

    min_val = min(values)
    max_val = max(values)
    normalized_values = [(val - min_val) / (max_val - min_val) for val in values]
    
    return normalized_values


def convert_to_integral_number(string):
    parts = string.split()
    numeric_part = parts[0]
    suffix = numeric_part[-1]
    numeric_part = numeric_part[:-1]
    numeric_value = float(numeric_part)

    if suffix == 'K':
        numeric_value *= 1000
    elif suffix == 'M':
        numeric_value *= 1000000
    elif suffix == 'B':
        numeric_value *= 1000000000

    integral_number = int(numeric_value)    
    return integral_number


def read_likes_views(json_file, reqd_post_ph):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = file.readlines()

        for line in data:
            post_data = json.loads(line)
            post_ph = post_data.get('post_ph')

            if reqd_post_ph == post_ph:
                likes = post_data.get('likes')
                views = post_data.get('number_of_views')
                return likes, views
            else:
                continue


def overall_sentiment_not_blob(score):
    if score <= 0.25:
        return -1, "NEGATIVE"
    elif score <= 0.60:
        return 0, "NEUTRAL"
    else:
        return 1, "POSITIVE"

def overall_sentiment_blob(score):
    if score < 0:
        return -1, "NEGATIVE"
    elif score > 0 and score < 0.25:
        return 0, "NEUTRAL"
    else:
        return 1, "POSITIVE"

In [141]:
for lang in languages:
    
    bert_labels = []
    vader_labels = []
    blob_labels = []
    lstm_labels = []
    
    file_paths = []
    
    final_vader_scores = []
    final_blob_scores = []
    final_lstm_scores = []
    final_bert_scores = []
    
    with open(f"sentiments/updated_sentiments_{lang}.pickle", "rb") as file:
        sentiments = pickle.load(file)

        leader_scores = {}
        for folder_path, files in sentiments.items():
            print(f"Processing Folder: {folder_path}")
            leader_hash = folder_path.split('/')[-2][:6]

            vader_scores = []
            blob_scores = []
            lstm_scores = []
            bert_scores = []
            
            likes_lst = []
            views_lst = []
            for filename, file_data in tqdm(files.items(), desc = "Calculating Average Sentiment Scores"):
                file_paths.append(folder_path + filename)
                likes, views = read_likes_views(f"sharechat_scraper/jsonl/output_{leader_hash}_{lang}.jsonl", filename[:-4])
                views = convert_to_integral_number(views)

                likes_lst.append(likes)
                views_lst.append(views)
                scaling_factor = int(likes) + pow(10, -4) / int(views) + pow(10, -4)

                unscaled_score_bert = file_data["scores"]["distilbert"][1]
                unscaled_score_lstm = file_data["scores"]["bi_lstm"]
                unscaled_score_blob = file_data["scores"]["textblob"]
                unscaled_score_vader = file_data["scores"]["vader"]["compound"]

                scaled_score_bert = scaling_factor * unscaled_score_bert
                scaled_score_lstm = scaling_factor * unscaled_score_lstm
                scaled_score_blob = scaling_factor * unscaled_score_blob
                scaled_score_vader = scaling_factor * unscaled_score_vader

                bert_labels.append(overall_sentiment_not_blob(scaled_score_bert)[0])
                lstm_labels.append(overall_sentiment_not_blob(scaled_score_lstm)[0])
                vader_labels.append(overall_sentiment_not_blob(scaled_score_vader)[0])
                blob_labels.append(overall_sentiment_not_blob(scaled_score_blob)[0])

                vader_scores.append(scaled_score_vader)
                blob_scores.append(scaled_score_blob)
                lstm_scores.append(scaled_score_lstm)
                bert_scores.append(scaled_score_bert)
        
            normalized_vader = normalize_list(vader_scores)
            normalized_blob = normalize_list(blob_scores)
            normalized_lstm = normalize_list(lstm_scores)
            normalized_bert = normalize_list(bert_scores)

            final_vader_scores.extend(normalized_vader)
            final_blob_scores.extend(normalized_blob)
            final_lstm_scores.extend(normalized_lstm)
            final_bert_scores.extend(normalized_bert)

            df = pd.DataFrame(columns = ['bert_score', 'vader_score', 'blob_score', 'lstm_score', 'likes', 'views'])
            df['bert_score'] = normalized_bert
            df['vader_score'] = normalized_vader
            df['lstm_score'] = normalized_lstm
            df['blob_score'] = normalized_blob

            file_to_save = f'leader_sentiment_scores/updated_{leader_mappings[leader_hash]}_{lang}.csv'
            df.to_csv(file_to_save)
            print()
            print(f'Successfully saved {file_to_save}!')
            print()

            avg_leader_bert_score = sum(normalized_bert) / len(normalized_bert)
            avg_leader_lstm_score = sum(normalized_lstm) / len(normalized_lstm)
            avg_leader_blob_score = sum(normalized_blob) / len(normalized_blob)
            avg_leader_vader_score = sum(normalized_vader) / len(normalized_vader)

            print("***************************************")
            print(f"Leader: {leader_mappings[leader_hash]}")
            print(f"Avg. Vader Score: {avg_leader_vader_score}")
            print(f"Avg. LSTM Score: {avg_leader_lstm_score}")
            print(f"Avg. BERT Score: {avg_leader_bert_score}")
            print(f"Avg. BLOB Score: {avg_leader_blob_score}")
            print()
            print(f"Overall Vader Sentiment: {overall_sentiment_not_blob(avg_leader_vader_score)[1]}")
            print(f"Overall Bert Sentiment: {overall_sentiment_not_blob(avg_leader_bert_score)[1]}")
            print(f"Overall BLOB Sentiment: {overall_sentiment_blob(avg_leader_blob_score)[1]}")
            print(f"Overall LSTM Sentiment: {overall_sentiment_not_blob(avg_leader_lstm_score)[1]}")
            print("***************************************")
            print()

            leader_scores[leader_hash] = {
                "avg_vader": avg_leader_vader_score, 
                "avg_lstm": avg_leader_lstm_score, 
                "avg_blob": avg_leader_blob_score,
                "avg_bert": avg_leader_bert_score
            }

            with open(f"results/results_{lang}.pickle", "wb") as file:
                pickle.dump(leader_scores, file)
            
        df_labels = pd.DataFrame(columns=['file', 'vader_label', 'lstm_label', 'blob_label', 'bert_label'])
        df_labels['file'] = file_paths
        df_labels['vader_label'] = vader_labels
        df_labels['blob_label'] = blob_labels
        df_labels['lstm_label'] = lstm_labels
        df_labels['bert_label'] = bert_labels
        
        df_labels.to_csv(f'gen_labels/updated_gen_labels_{lang}.csv')

Processing Folder: sharechat_scraper/updated_tesseract/Hindi/5ZEBpp_Hindi/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:00<00:00, 2103.18it/s]



Successfully saved leader_sentiment_scores/updated_Arvind_Kejrival_Hindi.csv!

***************************************
Leader: Arvind_Kejrival
Avg. Vader Score: 0.6038721593066261
Avg. LSTM Score: 0.15247667103560175
Avg. BERT Score: 0.09436771202914229
Avg. BLOB Score: 0.3283270844209462

Overall Vader Sentiment: POSITIVE
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Hindi/lew5Am_Hindi/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 2598.47it/s]



Successfully saved leader_sentiment_scores/updated_Rahul_Gandhi_Hindi.csv!

***************************************
Leader: Rahul_Gandhi
Avg. Vader Score: 0.8134886129910059
Avg. LSTM Score: 0.1567688118224581
Avg. BERT Score: 0.10121712368787521
Avg. BLOB Score: 0.5345288510185691

Overall Vader Sentiment: POSITIVE
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Hindi/m6d09W_Hindi/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 5246.27it/s]



Successfully saved leader_sentiment_scores/updated_Narendra_Modi_Hindi.csv!

***************************************
Leader: Narendra_Modi
Avg. Vader Score: 0.08381353733126995
Avg. LSTM Score: 0.046026146160188026
Avg. BERT Score: 0.0516861115620806
Avg. BLOB Score: 0.049929379272258705

Overall Vader Sentiment: NEGATIVE
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: NEUTRAL
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Hindi/VO6Zjy_Hindi/


Calculating Average Sentiment Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 159/159 [00:00<00:00, 1127.80it/s]



Successfully saved leader_sentiment_scores/updated_Akhilesh_Yadav_Hindi.csv!

***************************************
Leader: Akhilesh_Yadav
Avg. Vader Score: 0.5269149500660629
Avg. LSTM Score: 0.13979002752620318
Avg. BERT Score: 0.11425319767928492
Avg. BLOB Score: 0.4260171507779184

Overall Vader Sentiment: NEUTRAL
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Hindi/VO6ZVy_Hindi/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 4832.01it/s]



Successfully saved leader_sentiment_scores/updated_Asaduddin_Owaisi_Hindi.csv!

***************************************
Leader: Asaduddin_Owaisi
Avg. Vader Score: 0.33824196329077133
Avg. LSTM Score: 0.189699189966758
Avg. BERT Score: 0.08992509127100133
Avg. BLOB Score: 0.3067384191905508

Overall Vader Sentiment: NEUTRAL
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Bengali/5ZEBpp_Bengali/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 1778.44it/s]



Successfully saved leader_sentiment_scores/updated_Arvind_Kejrival_Bengali.csv!

***************************************
Leader: Arvind_Kejrival
Avg. Vader Score: 0.5735315629205829
Avg. LSTM Score: 0.19777810709333213
Avg. BERT Score: 0.1404623313338584
Avg. BLOB Score: 0.4555250073158071

Overall Vader Sentiment: NEUTRAL
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Bengali/lew5Am_Bengali/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 69/69 [00:00<00:00, 1328.29it/s]



Successfully saved leader_sentiment_scores/updated_Rahul_Gandhi_Bengali.csv!

***************************************
Leader: Rahul_Gandhi
Avg. Vader Score: 0.8789778866718118
Avg. LSTM Score: 0.11204448134547236
Avg. BERT Score: 0.08134684861366828
Avg. BLOB Score: 0.5724458891057851

Overall Vader Sentiment: POSITIVE
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Bengali/m6d09W_Bengali/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 1678.48it/s]



Successfully saved leader_sentiment_scores/updated_Narendra_Modi_Bengali.csv!

***************************************
Leader: Narendra_Modi
Avg. Vader Score: 0.08097120303273231
Avg. LSTM Score: 0.031688045157310614
Avg. BERT Score: 0.02550770409033966
Avg. BLOB Score: 0.05552042108783674

Overall Vader Sentiment: NEGATIVE
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: NEUTRAL
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Bengali/VO6Zjy_Bengali/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 1571.18it/s]



Successfully saved leader_sentiment_scores/updated_Akhilesh_Yadav_Bengali.csv!

***************************************
Leader: Akhilesh_Yadav
Avg. Vader Score: 0.5645992310015531
Avg. LSTM Score: 0.17286533322720107
Avg. BERT Score: 0.10510759474866023
Avg. BLOB Score: 0.5392773382156035

Overall Vader Sentiment: NEUTRAL
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Bengali/VO6ZVy_Bengali/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 3852.45it/s]



Successfully saved leader_sentiment_scores/updated_Asaduddin_Owaisi_Bengali.csv!

***************************************
Leader: Asaduddin_Owaisi
Avg. Vader Score: 0.3328918732317972
Avg. LSTM Score: 0.19794211913781617
Avg. BERT Score: 0.08711668473333874
Avg. BLOB Score: 0.3515123282064184

Overall Vader Sentiment: NEUTRAL
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Punjabi/5ZEBpp_Punjabi/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 1994.24it/s]



Successfully saved leader_sentiment_scores/updated_Arvind_Kejrival_Punjabi.csv!

***************************************
Leader: Arvind_Kejrival
Avg. Vader Score: 0.6433879115895101
Avg. LSTM Score: 0.17104766932446616
Avg. BERT Score: 0.1276735599826618
Avg. BLOB Score: 0.1812078572032815

Overall Vader Sentiment: POSITIVE
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: NEUTRAL
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Punjabi/lew5Am_Punjabi/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 2585.57it/s]



Successfully saved leader_sentiment_scores/updated_Rahul_Gandhi_Punjabi.csv!

***************************************
Leader: Rahul_Gandhi
Avg. Vader Score: 0.47703714175205203
Avg. LSTM Score: 0.2517604756105475
Avg. BERT Score: 0.21676766472736825
Avg. BLOB Score: 0.3384281101466617

Overall Vader Sentiment: NEUTRAL
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEUTRAL
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Punjabi/m6d09W_Punjabi/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 91/91 [00:00<00:00, 1666.51it/s]



Successfully saved leader_sentiment_scores/updated_Narendra_Modi_Punjabi.csv!

***************************************
Leader: Narendra_Modi
Avg. Vader Score: 0.06999833650933167
Avg. LSTM Score: 0.029431187540663328
Avg. BERT Score: 0.020954317841425915
Avg. BLOB Score: 0.037231485419511615

Overall Vader Sentiment: NEGATIVE
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: NEUTRAL
Overall LSTM Sentiment: NEGATIVE
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Punjabi/VO6Zjy_Punjabi/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 1567.49it/s]



Successfully saved leader_sentiment_scores/updated_Akhilesh_Yadav_Punjabi.csv!

***************************************
Leader: Akhilesh_Yadav
Avg. Vader Score: 0.501931639457699
Avg. LSTM Score: 0.3408243619976937
Avg. BERT Score: 0.25455931348922956
Avg. BLOB Score: 0.4899796640971234

Overall Vader Sentiment: NEUTRAL
Overall Bert Sentiment: NEUTRAL
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEUTRAL
***************************************

Processing Folder: sharechat_scraper/updated_tesseract/Punjabi/VO6ZVy_Punjabi/


Calculating Average Sentiment Scores: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 3995.72it/s]


Successfully saved leader_sentiment_scores/updated_Asaduddin_Owaisi_Punjabi.csv!

***************************************
Leader: Asaduddin_Owaisi
Avg. Vader Score: 0.3212575540481779
Avg. LSTM Score: 0.20980131544542888
Avg. BERT Score: 0.10151502173491807
Avg. BLOB Score: 0.2997961109508963

Overall Vader Sentiment: NEUTRAL
Overall Bert Sentiment: NEGATIVE
Overall BLOB Sentiment: POSITIVE
Overall LSTM Sentiment: NEGATIVE
***************************************






# Evaluation

In [152]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# from constants import languages
sentiment_labels = [1, 0, -1]
languages = ["Hindi", "Punjabi", "Bengali"]
dataframes = {}

for lang in languages:
    print()
    print(f"Evaluating for language: {lang}")
    print()

    gen_labels = pd.read_csv(f'gen_labels/updated_gen_labels_{lang}.csv')
    actual_labels = pd.read_csv(f'actual_labels/actual_labels_{lang}.csv')

    gen_labels.drop(columns = ['Unnamed: 0'], inplace = True)
    actual_labels.drop(columns = ['Unnamed: 0', 'text'], inplace = True)
    # actual_labels['label'] = np.random.choice(sentiment_labels, size = len(actual_labels))

    merged_df = pd.merge(gen_labels, actual_labels, on='file', how='inner')
    # df.rename(columns={'Unnamed: 0_y': 'label_y'}, inplace=True)
    y_true = merged_df['label']  # Actual labels

    rows = []

    # Calculate evaluation metrics for each model
    for model_label in ['lstm_label', 'bert_label', 'blob_label', 'vader_label']:
        y_pred = merged_df[model_label]  # Predicted labels

        # calculating eval metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='macro', labels=sentiment_labels)
        recall = recall_score(y_true, y_pred, average='macro', labels=sentiment_labels)
        f1 = f1_score(y_true, y_pred, average='macro', labels=sentiment_labels)
        roc_auc = roc_auc_score(y_true, label_binarize(y_pred, classes=sentiment_labels), multi_class='ovo')

        rows.append({
            'Model': model_label,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
            'ROC AUC': roc_auc
        })
        
        # results
        print()
        print(f"Model: {model_label}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"ROC AUC: {roc_auc:.4f}")
        print()

        # saving the results
        with open(f'eval/eval_{lang}_{model_label}.txt', 'w') as f:
            f.write(f"Accuracy: {accuracy:.4f}\n")
            f.write(f"Precision: {precision:.4f}\n")
            f.write(f"Recall: {recall:.4f}\n")
            f.write(f"F1-score: {f1:.4f}\n")
            f.write(f"ROC AUC: {roc_auc:.4f}\n")

        print(f"Evaluation results saved to 'eval/eval_{lang}_{model_label}.txt'")
        print()

    dataframes[lang] = pd.DataFrame(rows)


Evaluating for language: Hindi


Model: lstm_label
Accuracy: 0.3234
Precision: 0.1078
Recall: 0.3333
F1-score: 0.1629
ROC AUC: 0.5000

Evaluation results saved to 'eval/eval_Hindi_lstm_label.txt'


Model: bert_label
Accuracy: 0.3027
Precision: 0.2995
Recall: 0.3848
F1-score: 0.2877
ROC AUC: 0.4197

Evaluation results saved to 'eval/eval_Hindi_bert_label.txt'


Model: blob_label
Accuracy: 0.2611
Precision: 0.3309
Recall: 0.3873
F1-score: 0.2498
ROC AUC: 0.4513

Evaluation results saved to 'eval/eval_Hindi_blob_label.txt'


Model: vader_label
Accuracy: 0.3145
Precision: 0.2285
Recall: 0.4563
F1-score: 0.2857
ROC AUC: 0.4032

Evaluation results saved to 'eval/eval_Hindi_vader_label.txt'


Evaluating for language: Punjabi


Model: lstm_label
Accuracy: 0.3872
Precision: 0.2404
Recall: 0.3353
F1-score: 0.2008
ROC AUC: 0.4986

Evaluation results saved to 'eval/eval_Punjabi_lstm_label.txt'


Model: bert_label
Accuracy: 0.3534
Precision: 0.3411
Recall: 0.4145
F1-score: 0.3287
ROC AUC: 0.4114



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [153]:
dataframes['Hindi']

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,ROC AUC
0,lstm_label,0.323442,0.107814,0.333333,0.16293,0.5
1,bert_label,0.302671,0.299494,0.384774,0.287725,0.419719
2,blob_label,0.261128,0.330936,0.387313,0.249817,0.451305
3,vader_label,0.31454,0.228549,0.456317,0.285686,0.403175


In [154]:
dataframes['Bengali']

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,ROC AUC
0,lstm_label,0.384868,0.128289,0.333333,0.185273,0.5
1,bert_label,0.371711,0.349085,0.395926,0.326433,0.434124
2,blob_label,0.299342,0.285347,0.37271,0.262789,0.459554
3,vader_label,0.319079,0.55485,0.362739,0.268088,0.481883


In [155]:
dataframes['Punjabi']

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,ROC AUC
0,lstm_label,0.387218,0.240389,0.335256,0.20079,0.498558
1,bert_label,0.353383,0.341116,0.414502,0.32872,0.411357
2,blob_label,0.330827,0.324638,0.444735,0.298836,0.397013
3,vader_label,0.357143,0.242777,0.442949,0.297097,0.415385
