### From original paper: polarity = (num_pos + num_neut + 0.1)/(num_neg + num_neut + 0.1)

##### 4 Tools: Vader, Textblob, Bert, Flair

In [3]:
import pandas as pd
import ast
from flair.models import TextClassifier
from flair.data import Sentence
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import wordnet
from tqdm import tqdm
tqdm.pandas()

In [None]:
def is_adj(word):
    synsets = wordnet.synsets(word)
    if not synsets:
        return False
    likely_pos = synsets[0].pos()
    return likely_pos == 'a'

In [None]:
def count_sentiments(counter_dict, sa_tool, use_boost, threshold=0.1, boost = 0.2):
    num_pos = 0
    num_neg = 0
    num_neu = 0
    
    if sa_tool == 'V':
        sid = SentimentIntensityAnalyzer()
        for word, freq in counter_dict.items():
            if (use_boost):
                freq = freq * (1 + boost) if is_adj(word) else freq * (1 - boost)
            if (use_boost and is_adj(word)):
                freq *= boost
            sentiment_scores = sid.polarity_scores(word)
            if sentiment_scores['compound'] >= threshold:
                num_pos += freq
            elif sentiment_scores['compound'] <= -threshold:
                num_neg += freq
            else:
                num_neu += freq
                
    elif sa_tool == 'T':
        for word, freq in counter_dict.items():
            if (use_boost):
                freq = freq * (1 + boost) if is_adj(word) else freq * (1 - boost)           
            analysis = TextBlob(word)
            if analysis.sentiment.polarity > threshold:
                num_pos += freq
            elif analysis.sentiment.polarity < -threshold:
                num_neg += freq
            else:
                num_neu += freq
                
    elif sa_tool == 'B':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        for word, freq in counter_dict.items():
            if (use_boost):
                freq = freq * (1 + boost) if is_adj(word) else freq * (1 - boost)           
            inputs = tokenizer(word, return_tensors='pt')
            outputs = model(**inputs)
            logits = outputs.logits
            softmax = torch.nn.functional.softmax(logits, dim=1)
            label = torch.argmax(softmax, dim=1).item()
            if label == 1:
                num_pos += freq
            elif label == 0:
                num_neu += freq
            else:
                num_neg += freq
                
    elif sa_tool == 'F':
        classifier = TextClassifier.load('en-sentiment')
        for word, freq in counter_dict.items():
            if (use_boost):
                freq = freq * (1 + boost) if is_adj(word) else freq * (1 - boost)      
            sentence = Sentence(word)
            classifier.predict(sentence)
            label = sentence.labels[0]
            if 'POSITIVE' in label.value:
                num_pos += freq
            elif 'NEGATIVE' in label.value:
                num_neg += freq
            else:
                num_neu += freq
    else:
        return "Invalid SA tool specified"
    return {'pos': round(num_pos,1), 'neg': round(num_neg,1), 'neu': round(num_neu,1)}

In [None]:
df = pd.read_csv("../data/AdjVCounter.csv")
df['counter'] = df['counter'].apply(ast.literal_eval)

In [None]:
# df['Vader'] = df['counter'].progress_apply(lambda x: count_sentiments(x, 'V', False))

In [None]:
# df['TextBlob'] = df['counter'].progress_apply(lambda x: count_sentiments(x, 'T', False))

In [None]:
# df['Bert'] = df['counter'].apply_progress(lambda x: count_sentiments(x, 'B', use_boost))

In [None]:
df['Flair'] = df['counter'].progress_apply(lambda x: count_sentiments(x, 'F', False))

In [None]:
# df['VaderBoost'] = df['counter'].progress_apply(lambda x: count_sentiments(x, 'V', True))

In [None]:
# df['TextBlobBoost'] = df['counter'].progress_apply(lambda x: count_sentiments(x, 'T', True))

In [1]:
def dictToPolarity(d):
    return (d['pos'] + d['neu'] + 0.1) / (d['neg'] + d['neu'] + 0.1)

In [4]:
# df['VaderPolarity'] = df['Vader'].apply(dictToPolarity)
# df['TextBlobPolarity'] = df['TextBlob'].apply(dictToPolarity)
# df['VaderBoostPolarity'] = df['VaderBoost'].apply(dictToPolarity)
# df['TextBlobBoostPolarity'] = df['TextBlobBoost'].apply(dictToPolarity)
df = pd.read_csv("../data/AdjVPolarity_withFlair.csv")
df['counter'] = df['counter'].apply(ast.literal_eval)
df['FlairPolarity'] = df['Flair'].apply(dictToPolarity)

TypeError: string indices must be integers, not 'str'

In [None]:
df.to_csv("../data/AdjVPolarityFlairProcessed.csv", index=False)