In [20]:
import pandas as pd

## translation libraries
import translators as ts
from googletrans import Translator as gt
from deep_translator import GoogleTranslator
from translate import Translator as tlr

## sentiment analysis libraries
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# nltk.download('all')

In [35]:
# lemmatizer = WordNetLemmatizer()
def preprocess_text(text, lemmatizer):
    if type(text) == float:
        return str(text)
    tokens = word_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [48]:
## https://www.kaggle.com/datasets/saurabhshahane/twitter-sentiment-dataset
twitter_data = pd.read_csv("/Users/rlzhang1310/Coding/buntain/twitter_dataset/Twitter_Data.csv")
twitter_data["text"] = twitter_data["clean_text"]
twitter_data["score"] = twitter_data["category"]
twitter_data = twitter_data.drop(["clean_text", "category"], axis=1)
twitter_data["processed_text"] = twitter_data["text"].apply(preprocess_text, lemmatizer = WordNetLemmatizer())
twitter_data.dropna(inplace=True)

In [44]:
## nltk sentiment analysis
analyzer = SentimentIntensityAnalyzer()
def get_nltk_sentiment(text, analyzer):
    scores = analyzer.polarity_scores(text)
    return scores

In [45]:
## spacy sentiment analysis
en_nlp = spacy.load('en_core_web_sm')
en_nlp.add_pipe('spacytextblob')

def get_spacy_sentiment(text, en_nlp):
    doc = en_nlp(text)
    return doc._.blob.polarity   

In [49]:
twitter_data["nltk_sentiment"] = twitter_data["text"].apply(get_nltk_sentiment, analyzer=analyzer)
twitter_data["spacy_sentiment"] = twitter_data["text"].apply(get_spacy_sentiment, en_nlp=en_nlp)

twitter_data["processed_nltk_sentiment"] = twitter_data["processed_text"].apply(get_nltk_sentiment, analyzer=analyzer)
twitter_data["processed_spacy_sentiment"] = twitter_data["processed_text"].apply(get_spacy_sentiment, en_nlp=en_nlp)

twitter_data

Unnamed: 0,text,score,processed_text,nltk_sentiment,spacy_sentiment,processed_nltk_sentiment,processed_spacy_sentiment
0,when modi promised “minimum government maximum...,-1.0,modi promised “ minimum government maximum gov...,"{'neg': 0.065, 'neu': 0.781, 'pos': 0.154, 'co...",-0.300000,"{'neg': 0.095, 'neu': 0.682, 'pos': 0.223, 'co...",-0.300000
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi,"{'neg': 0.184, 'neu': 0.816, 'pos': 0.0, 'comp...",0.000000,"{'neg': 0.351, 'neu': 0.649, 'pos': 0.0, 'comp...",0.000000
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...,"{'neg': 0.0, 'neu': 0.772, 'pos': 0.228, 'comp...",0.483333,"{'neg': 0.0, 'neu': 0.651, 'pos': 0.349, 'comp...",0.483333
3,asking his supporters prefix chowkidar their n...,1.0,asking supporter prefix chowkidar name modi gr...,"{'neg': 0.187, 'neu': 0.655, 'pos': 0.158, 'co...",0.150000,"{'neg': 0.219, 'neu': 0.479, 'pos': 0.301, 'co...",0.033333
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...,"{'neg': 0.0, 'neu': 0.808, 'pos': 0.192, 'comp...",0.400000,"{'neg': 0.0, 'neu': 0.763, 'pos': 0.237, 'comp...",0.300000
...,...,...,...,...,...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0,456 crore paid neerav modi recovered congress ...,"{'neg': 0.081, 'neu': 0.919, 'pos': 0.0, 'comp...",-0.291667,"{'neg': 0.104, 'neu': 0.896, 'pos': 0.0, 'comp...",-0.291667
162976,dear rss terrorist payal gawar what about modi...,-1.0,dear r terrorist payal gawar modi killing 1000...,"{'neg': 0.398, 'neu': 0.491, 'pos': 0.111, 'co...",-0.195833,"{'neg': 0.435, 'neu': 0.443, 'pos': 0.123, 'co...",-0.246875
162977,did you cover her interaction forum where she ...,0.0,cover interaction forum left,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000000,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000000
162978,there big project came into india modi dream p...,0.0,big project came india modi dream project happ...,"{'neg': 0.0, 'neu': 0.889, 'pos': 0.111, 'comp...",0.000000,"{'neg': 0.0, 'neu': 0.8, 'pos': 0.2, 'compound...",0.000000


In [101]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vader_analyzer = SentimentIntensityAnalyzer()
def get_vader_sentiment(text, analyzer):
    vs = analyzer.polarity_scores(text)
    return vs


{'neg': 0.0, 'neu': 0.519, 'pos': 0.481, 'compound': 0.5719}

In [102]:
twitter_data["processed_vader_sentiment"] = twitter_data["processed_text"].apply(get_vader_sentiment, analyzer=vader_analyzer)
twitter_data["vader_sentiment"] = twitter_data["text"].apply(get_vader_sentiment, analyzer=vader_analyzer)


In [129]:
def categorize(nltk_output):
    if nltk_output["neu"] > 1:
        return 0
    else:
        return nltk_output["compound"]
    # pos = nltk_output["pos"]
    # neg = nltk_output["neg"]
    # neu = nltk_output["neu"]
    # max_v = max(pos, neg, neu) 
    # if max_v == pos:
    #     return 1
    # elif max_v == neg:
    #     return -1
    # else:
    #     return 0


def calculation_nltk_helper(actual, output, acc):
    diff = abs(output["compound"] - actual)
    acc["rmse"] += diff**2
    acc["mae"] += diff
    output_pred = categorize(output)
    if actual == 0:
        acc["neu_total"] += 1
        if output_pred == 0:
            acc["neu_correct"] += 1
    elif actual > 0:
        acc["pos_total"] += 1
        if output_pred > 0:
            acc["pos_correct"] += 1
    else:
        acc["neg_total"] += 1
        if output_pred < 0:
            acc["neg_correct"] += 1

def calculation_spacy_helper(actual, output, acc):
    diff = abs(output - actual)
    acc["rmse"] += diff**2
    acc["mae"] += diff
    if actual == 0:
        acc["neu_total"] += 1
        if output == 0:
            acc["neu_correct"] += 1
    elif actual > 0:
        acc["pos_total"] += 1
        if output > 0:
            acc["pos_correct"] += 1
    else:
        acc["neg_total"] += 1
        if output < 0:
            acc["neg_correct"] += 1


In [130]:
def acc_dict():
    d = dict()
    d["rmse"] = 0
    d["mae"] = 0
    d["pos_correct"] = 0
    d["neg_correct"] = 0
    d["neu_correct"] = 0
    d["pos_total"] = 0
    d["neg_total"] = 0
    d["neu_total"] = 0
    return d
proc_spacy = acc_dict()
proc_nltk = acc_dict()
proc_vader = acc_dict()
spacy = acc_dict()
nltk = acc_dict()
vader = acc_dict()

In [131]:
twitter_data.apply(lambda row: calculation_nltk_helper(row["score"], row["nltk_sentiment"], nltk), axis=1)
twitter_data.apply(lambda row: calculation_spacy_helper(row["score"], row["spacy_sentiment"], spacy), axis=1)
twitter_data.apply(lambda row: calculation_nltk_helper(row["score"], row["processed_nltk_sentiment"], proc_nltk), axis=1)
twitter_data.apply(lambda row: calculation_spacy_helper(row["score"], row["processed_spacy_sentiment"], proc_spacy), axis=1)
twitter_data.apply(lambda row: calculation_nltk_helper(row["score"], row["vader_sentiment"], vader), axis=1)
twitter_data.apply(lambda row: calculation_nltk_helper(row["score"], row["processed_vader_sentiment"], proc_vader), axis=1)

0         None
1         None
2         None
3         None
4         None
          ... 
162975    None
162976    None
162977    None
162978    None
162979    None
Length: 162969, dtype: object

In [132]:
import math
def print_res(d):
    print(f'rmse: {math.sqrt(d["rmse"])},  mae: {d["mae"]}')
    print(f'positive accuracy: {round(d["pos_correct"] / d["pos_total"], 4)}, negative accuracy: {round(d["neg_correct"] / d["neg_total"], 4)}, neutral accuracy: {round(d["neu_correct"] / d["neu_total"], 4)}')


print("nltk results")
print_res(nltk)
print("")
print("spacy results")
print_res(spacy)
print("")
print("vader results")
print_res(vader)
print("")
print("processed nltk results")
print_res(proc_nltk)
print("")
print("processed spacy results")
print_res(proc_spacy)
print("")
print("processed vader results")
print_res(proc_vader)

nltk results
rmse: 301.9313320406593,  mae: 93184.80140000037
positive accuracy: 0.6814, negative accuracy: 0.6119, neutral accuracy: 0.3976

spacy results
rmse: 248.42326391163456,  mae: 78161.65319124197
positive accuracy: 1.0, negative accuracy: 1.0, neutral accuracy: 1.0

vader results
rmse: 345.0173840142655,  mae: 118321.56489996763
positive accuracy: 1.0, negative accuracy: 0.0, neutral accuracy: 0.0

processed nltk results
rmse: 301.61753802366593,  mae: 93147.5429000006
positive accuracy: 0.6932, negative accuracy: 0.6041, neutral accuracy: 0.3941

processed spacy results
rmse: 254.60815454285597,  mae: 79542.31454811925
positive accuracy: 0.9092, negative accuracy: 0.8754, neutral accuracy: 0.9708

processed vader results
rmse: 345.0173840142655,  mae: 118321.56489996763
positive accuracy: 1.0, negative accuracy: 0.0, neutral accuracy: 0.0
