# Importing required libraries for Sentiment Analysis

In [3]:
import tweepy
import pandas as pd
import numpy as np
import json
import re
# import vaderSentiment
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.notebook import tqdm
from tqdm import tqdm
from ipywidgets import IntProgress
import seaborn as sns


from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import torch

# Importing Labeled Datasets

The following three datasets are all labeled and categorized based on their political party. The Train dataset is the one that we check the final outcome against: if the model works properly, the Train dataset is expected to reproduce the correct overall outcome of the election. The final expectation is that the model and the sentiment analysis process also lead to the correct outcome for the other two datasets (the Test dataset and the 43rd election dataset).

In [23]:
# Every labeled CSV is loaded the same way: read it, drop the 'Unnamed: 0'
# column, then call reset_index() so the row position becomes an explicit
# 'index' column that later cells use as the row id.

# Train set of the 42nd election
df_train_for_transformer = (
    pd.read_csv('Data/df_train_for_transformer.csv')
    .drop('Unnamed: 0', axis=1)
    .reset_index()
)

# Test set of the 42nd election
df_test_for_transformer = (
    pd.read_csv('Data/df_test_for_transformer.csv')
    .drop('Unnamed: 0', axis=1)
    .reset_index()
)

# Data set of the 43rd election
elxn43_for_transformer = (
    pd.read_csv('Data/elxn43_labeled_for_transformer.csv')
    .drop('Unnamed: 0', axis=1)
    .reset_index()
)

# All of the above datasets are labeled with the ensemble model

# Importing the Sentiment Analysis models

RoBERTa (Robustly Optimized BERT Pretraining Approach) is a large-scale language model developed by Facebook AI Research (FAIR) in 2019. It is built upon the pre-training approach of the popular BERT (Bidirectional Encoder Representations from Transformers) model, with several modifications to improve its performance. RoBERTa has been trained on a massive corpus of text data, which includes BookCorpus and English Wikipedia, with a total of 160 GB of uncompressed text data. It has achieved state-of-the-art results on several natural language processing tasks, such as question answering, sentiment analysis, and text classification. Other well-known transformer language models from the same period include ELECTRA, which uses a new training objective to improve efficiency and performance, and GPT-3, which has 175 billion parameters and was among the largest language models at the time of its release.

Here we download the parameters of this pre-trained model and use it for our sentiment analysis. The import process is as follows:

In [5]:
# Hugging Face model id of the pre-trained RoBERTa sentiment classifier.
# Written as a plain string: there is nothing to interpolate, so no f-prefix.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# Load the tokenizer and the sequence-classification model for that id.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool developed by researchers at the Georgia Institute of Technology. It is designed to analyze the sentiment of text data, such as social media posts, online reviews, and news articles. VADER uses a lexicon of sentiment-related words and phrases, which are rated according to their emotional valence (positive, negative, or neutral) and intensity. It also incorporates a set of rules to handle contextual features such as negation, punctuation, and capitalization. VADER has been shown to perform well on a variety of sentiment analysis tasks, including sentiment classification, emotion detection, and opinion mining. Its accuracy and ease of use have made it a popular choice for researchers and practitioners in the field of natural language processing.

The main approach for the sentiment analysis in this project is the RoBERTa language model. However, to also have a rough comparison between the machine-learning approach and the rule-based approach to sentiment analysis, we compute the VADER sentiment for every text in the same pass that computes the RoBERTa sentiment.


In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single VADER analyzer instance shared by the scoring loops in this notebook,
# which call analyzer.polarity_scores(text) once per row.
analyzer = SentimentIntensityAnalyzer()

In [7]:
def polarity_scores_roberta(example, max_length=512):
    """
    Score a single text with the RoBERTa sentiment model.

    The text is tokenized into PyTorch tensors, run through the model, and the
    first row of the first model output is converted to probabilities with
    softmax. The three probabilities are returned in a dictionary.

    Parameters
    ----------
    example : str
        Text to score.
    max_length : int, optional
        Maximum number of tokens passed to the model; longer inputs are
        truncated instead of making the model raise a RuntimeError.
        The default of 512 is assumed to match the model's input limit
        (TODO confirm if a different checkpoint is used).

    Returns
    -------
    dict
        Keys 'roberta_neg', 'roberta_neu' and 'roberta_pos', holding the
        softmax scores at positions 0, 1 and 2 respectively.
    """
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: no_grad avoids building an autograd graph for every text.
    with torch.no_grad():
        output = model(**encoded_text)
    # output[0][0] is the score vector for the one text in this batch.
    scores = softmax(output[0][0].detach().numpy())
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

# Running Sentiment Transformer on Training Dataset

Here we use the `tqdm` library to display a progress bar below the cell so that the progress of the loop can be followed.
The following code computes the VADER sentiment as well as the RoBERTa sentiment for each text and stores them in dictionary format; in the next cell they are combined with the dataset.

In [18]:
# res maps each row's 'index' value to one dict holding its VADER scores
# (keys prefixed with "vader_") followed by its RoBERTa scores.
res = {}
for _, row in tqdm(df_train_for_transformer.iterrows(), total=len(df_train_for_transformer)):
    try:
        text = row['Full Text2']
        myid = row['index']
        # Rule-based scores from the VADER analyzer, renamed with a "vader_" prefix
        vader_scores = analyzer.polarity_scores(text)
        vader_prefixed = {f"vader_{key}": value for key, value in vader_scores.items()}
        # Model-based scores from polarity_scores_roberta
        roberta_scores = polarity_scores_roberta(text)
        res[myid] = {**vader_prefixed, **roberta_scores}
    except RuntimeError:
        # A row that raises RuntimeError is reported and left out of res
        print(f'Broke for id {myid}')

100%|████████████████████████████████████████████████████████████████████████| 119995/119995 [1:29:31<00:00, 22.34it/s]


In [19]:
# Build one row per scored id from res, turn the ids into an 'index' column,
# then left-join the train dataset's columns on that 'index' key.
df_transformer_result_train = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .merge(df_train_for_transformer, how='left', on='index')
)

# Running Sentiment Transformer on Test Dataset

The same process as above is run on the Test set and the 43rd election dataset.

In [20]:
# res maps each row's 'index' value to one dict holding its VADER scores
# (keys prefixed with "vader_") followed by its RoBERTa scores.
res = {}
for _, row in tqdm(df_test_for_transformer.iterrows(), total=len(df_test_for_transformer)):
    try:
        text = row['Full Text']
        myid = row['index']
        # Rule-based scores from the VADER analyzer, renamed with a "vader_" prefix
        vader_scores = analyzer.polarity_scores(text)
        vader_prefixed = {f"vader_{key}": value for key, value in vader_scores.items()}
        # Model-based scores from polarity_scores_roberta
        roberta_scores = polarity_scores_roberta(text)
        res[myid] = {**vader_prefixed, **roberta_scores}
    except RuntimeError:
        # A row that raises RuntimeError is reported and left out of res
        print(f'Broke for id {myid}')

100%|████████████████████████████████████████████████████████████████████████████| 30078/30078 [28:48<00:00, 17.41it/s]


In [21]:
# Build one row per scored id from res, turn the ids into an 'index' column,
# then left-join the test dataset's columns on that 'index' key.
df_transformer_result_test = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .merge(df_test_for_transformer, how='left', on='index')
)

In [22]:
# Export the resulting train and test datasets as a backup and as input for
# the visualization notebook.
# NOTE(review): to_csv is called without index=False, so the DataFrame index is
# written as well; readers presumably drop it as 'Unnamed: 0' — confirm before
# changing this.
df_transformer_result_train.to_csv('Data/df_transformer_result_train.csv')
df_transformer_result_test.to_csv('Data/df_transformer_result_test.csv')

# Running Sentiment Transformer on 43rd election Dataset

In [26]:
# res maps each row's 'index' value to one dict holding its VADER scores
# (keys prefixed with "vader_") followed by its RoBERTa scores.
res = {}
for _, row in tqdm(elxn43_for_transformer.iterrows(), total=len(elxn43_for_transformer)):
    try:
        text = row['Full Text']
        myid = row['index']
        # Rule-based scores from the VADER analyzer, renamed with a "vader_" prefix
        vader_scores = analyzer.polarity_scores(text)
        vader_prefixed = {f"vader_{key}": value for key, value in vader_scores.items()}
        # Model-based scores from polarity_scores_roberta
        roberta_scores = polarity_scores_roberta(text)
        res[myid] = {**vader_prefixed, **roberta_scores}
    except RuntimeError:
        # A row that raises RuntimeError is reported and left out of res
        print(f'Broke for id {myid}')

100%|████████████████████████████████████████████████████████████████████████████| 35945/35945 [35:01<00:00, 17.10it/s]


In [28]:
# Build one row per scored id from res, turn the ids into an 'index' column,
# then left-join the 43rd election dataset's columns on that 'index' key.
df_transformer_result_elxn43 = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .merge(elxn43_for_transformer, how='left', on='index')
)

In [29]:
# Export the resulting 43rd election dataset as a backup and as input for the
# visualization notebook.
# NOTE(review): to_csv is called without index=False, so the DataFrame index is
# written as well; readers presumably drop it as 'Unnamed: 0' — confirm before
# changing this.
df_transformer_result_elxn43.to_csv('Data/df_transformer_result_elxn43.csv')