In [1]:
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score
from textblob import TextBlob
import re

import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# nltk.download('stopwords')
from nltk.corpus import stopwords
# nltk.download('wordnet')
from nltk.corpus import wordnet
# nltk.download('averaged_perceptron_tagger')

# Vader Score

In [2]:
# Location of the labelled tweet data.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
DATA_CSV = 'C:/Users/jrese/capstone_dump/Data/iee_data.csv'
df = pd.read_csv(DATA_CSV)

# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,id,created_at,text,sentiment,label
0,77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive,1
1,661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative,-1
2,413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,positive,1
3,760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,positive,1
4,830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive,1


In [3]:
# Single VADER analyzer instance; get_vader_score reads it as a global
analyzer = SentimentIntensityAnalyzer()

In [4]:
# Turn a sentence's VADER compound score into a -1 / 0 / 1 sentiment label
def get_vader_score(sentence, threshold=0.05):
    """Label a sentence as positive (1), neutral (0) or negative (-1).

    The sentence is scored with the notebook-level ``analyzer`` and the
    'compound' entry of the result is compared against a symmetric band.

    Parameters
    ----------
    sentence : str
        Text to score.
    threshold : float, optional
        Half-width of the neutral band. Compound scores of at least
        ``threshold`` are positive, scores of at most ``-threshold`` are
        negative, anything strictly in between is neutral. Expected to be
        positive; defaults to 0.05.

    Returns
    -------
    int
        1, 0 or -1.
    """
    compound = analyzer.polarity_scores(sentence)['compound']
    if compound >= threshold:
        return 1
    # The negative cut-off is -threshold. The previous code compared against
    # +0.05 here and only produced the right label because the two branches
    # before it had already consumed every score above -0.05.
    if compound <= -threshold:
        return -1
    return 0
    
# Label every tweet. Series.apply hands each text straight to the scorer,
# rather than materialising a whole row object per tweet just to read one column.
df['vader'] = df['text'].apply(get_vader_score)

In [5]:
# Preview the frame, now including the 'vader' label column
df.head()

Unnamed: 0,id,created_at,text,sentiment,label,vader
0,77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive,1,1
1,661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative,-1,1
2,413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,positive,1,-1
3,760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,positive,1,1
4,830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive,1,0


In [6]:
# Score VADER against the ground-truth labels, using only rows that have no
# missing value in any column
complete_rows = df.dropna()
vader_accuracy = accuracy_score(complete_rows["label"].values,
                                complete_rows["vader"].values)
print(f'Accuracy: {vader_accuracy}')

Accuracy: 0.6661538461538462


# TextBlob

In [7]:
# One record per tweet, in the same order as the rows of df
scores_blob = []

# Iterate over the values themselves so the loop does not depend on df having
# a 0..n-1 integer index (df['text'][i] is a label lookup, not a positional one).
for tweet_text in df['text']:
    # Analyse each tweet once; the sentiment result unpacks into its two
    # components (index 0 is polarity, index 1 is subjectivity).
    polarity, subjectivity = TextBlob(tweet_text).sentiment

    scores_blob.append({"Polarity_blob": polarity,
                        "Subjectivity_blob": subjectivity})

In [8]:
# Turn the per-tweet score records into a frame and attach its columns to df.
# NOTE(review): join aligns on the index, so this presumably relies on df still
# having its default 0..n-1 index - confirm. Re-running this cell after df
# already has these columns is expected to fail on the overlapping names.
sentiments_score_blob = pd.DataFrame.from_dict(scores_blob)
df = df.join(sentiments_score_blob)

def getAnalysis(score, threshold=0.05):
    """Map a polarity score to a sentiment label.

    Parameters
    ----------
    score : float
        Polarity value to classify.
    threshold : float, optional
        Half-width of the neutral band. Scores of at least ``threshold`` are
        positive, scores of at most ``-threshold`` are negative, anything
        strictly in between is neutral. Expected to be positive; defaults
        to 0.05.

    Returns
    -------
    int or None
        1 (positive), 0 (neutral) or -1 (negative). None when the score
        compares false against every bound, which is the case for NaN.
    """
    if score >= threshold:
        return 1
    # The negative cut-off is -threshold. The previous code compared against
    # +0.05 here and only produced the right label because the two branches
    # before it had already consumed every score above -0.05.
    if score <= -threshold:
        return -1
    if -threshold < score < threshold:
        return 0
    # Only NaN reaches this point. Keep returning None (as the implicit
    # fall-through always did) so such rows stay "missing" for later dropna().
    return None
    
# Convert each TextBlob polarity into a -1 / 0 / 1 label with getAnalysis
df['blob'] = df['Polarity_blob'].apply(getAnalysis)

In [9]:
# Score TextBlob against the ground-truth labels, using only rows that have no
# missing value in any column
rows_no_missing = df.dropna()
blob_accuracy = accuracy_score(rows_no_missing["label"].values,
                               rows_no_missing["blob"].values)
print(f'Accuracy: {blob_accuracy}')

Accuracy: 0.5261538461538462


# SentiWordNet

TODO: move this section to the data_processing file.

In [10]:
# Text-cleaning helper for the SentiWordNet pipeline
def clean(text):
    """Replace every run of characters other than A-Z / a-z with one space.

    Digits, punctuation, whitespace, emoji and any other non-ASCII-letter
    characters are all collapsed this way; letters keep their case.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The text with only ASCII letters and single separating spaces.
    """
    return re.sub('[^A-Za-z]+', ' ', text)

# Clean the 'text' column into a new letters-only 'text_clean' column
df['text_clean'] = df['text'].apply(clean)

# POS tagger dictionary: first letter of a tagger tag -> WordNet POS constant
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

# Load the English stop-word list once. Building this set inside the token
# loop re-read the list for every single word of every tweet.
english_stopwords = frozenset(stopwords.words('english'))

def token_stop_pos(text):
    """Tokenize and POS-tag ``text``, dropping English stop words.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    list of tuple
        ``(word, pos)`` pairs in token order. ``word`` keeps its original
        casing (only the stop-word test is case-insensitive). ``pos`` is the
        ``pos_dict`` entry for the first letter of the word's tag, or None
        when that letter has no entry.
    """
    tags = pos_tag(word_tokenize(text))
    return [(word, pos_dict.get(tag[0]))
            for word, tag in tags
            if word.lower() not in english_stopwords]



# Tokenize, remove stop words and POS-tag every cleaned tweet
df['POS tagged'] = df['text_clean'].apply(token_stop_pos)
df.head()

Unnamed: 0,id,created_at,text,sentiment,label,vader,Polarity_blob,Subjectivity_blob,blob,text_clean,POS tagged
0,77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive,1,1,1.0,0.4,1,RT RobertBeadles Yo Enter to WIN Monarch Token...,"[(RT, n), (RobertBeadles, n), (Yo, n), (Enter,..."
1,661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative,-1,1,0.0,0.0,0,SriLanka surcharge on fuel removed The surcha...,"[(SriLanka, n), (surcharge, n), (fuel, n), (re..."
2,413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,positive,1,-1,0.128788,0.318182,1,Net issuance increases to fund fiscal programs...,"[(Net, a), (issuance, n), (increases, n), (fun..."
3,760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,positive,1,1,0.2,0.4,1,RT bentboolean How much of Amazon s traffic is...,"[(RT, n), (bentboolean, v), (much, a), (Amazon..."
4,830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive,1,0,0.8,0.75,1,AMD Ryzen desktop CPUs looking great and on t...,"[(AMD, n), (Ryzen, n), (desktop, n), (CPUs, n)..."


In [11]:
# NOTE(review): this import sits mid-notebook; consider moving it up to the
# first cell with the other imports.
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance, read as a global by lemmatize and sentiwordnetanalysis
wordnet_lemmatizer = WordNetLemmatizer()

# Rebuild a sentence from (word, pos) pairs, lemmatizing each word that has a POS
def lemmatize(pos_data):
    """Join the lemmas of POS-tagged words into one string.

    Parameters
    ----------
    pos_data : iterable of (word, pos)
        Words paired with a WordNet POS, or a falsy value when none is known.

    Returns
    -------
    str
        One leading space, followed by every lemma prefixed with a space
        (so a non-empty result starts with two spaces and an empty input
        gives " "). Words without a POS are used unchanged; the others go
        through ``wordnet_lemmatizer`` with their POS.
    """
    pieces = (
        wordnet_lemmatizer.lemmatize(token, pos=wn_pos) if wn_pos else token
        for token, wn_pos in pos_data
    )
    return " " + "".join(" " + piece for piece in pieces)

df['Lemma'] = df['POS tagged'].apply(lemmatize)

In [12]:
# Make sure the SentiWordNet corpus is available locally before importing its reader
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn

def sentiwordnetanalysis(pos_data):
    """Sum the SentiWordNet polarity of the POS-tagged words in ``pos_data``.

    For every word that has a POS, the word is lemmatized, its WordNet synsets
    for that POS are looked up, and the first synset's SentiWordNet
    ``pos_score() - neg_score()`` is added to the total. Words with no POS,
    an empty lemma or no synsets contribute nothing.

    Parameters
    ----------
    pos_data : iterable of (word, pos)
        Words paired with a WordNet POS, or a falsy value when none is known.

    Returns
    -------
    number
        The running total (not averaged over the number of scored words);
        0 when no word could be scored.
    """
    total = 0
    for token, wn_pos in pos_data:
        if wn_pos:
            base_form = wordnet_lemmatizer.lemmatize(token, pos=wn_pos)
            senses = wordnet.synsets(base_form, pos=wn_pos) if base_form else []
            if senses:
                # Only the first listed sense is scored
                first_sense = swn.senti_synset(senses[0].name())
                total += first_sense.pos_score() - first_sense.neg_score()
    return total


# Total SentiWordNet polarity per tweet, computed from its POS-tagged tokens
df['SWN'] = df['POS tagged'].apply(sentiwordnetanalysis)
df.head()

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\jrese\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Unnamed: 0,id,created_at,text,sentiment,label,vader,Polarity_blob,Subjectivity_blob,blob,text_clean,POS tagged,Lemma,SWN
0,77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive,1,1,1.0,0.4,1,RT RobertBeadles Yo Enter to WIN Monarch Token...,"[(RT, n), (RobertBeadles, n), (Yo, n), (Enter,...",RT RobertBeadles Yo Enter WIN Monarch Tokens...,0.125
1,661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative,-1,1,0.0,0.0,0,SriLanka surcharge on fuel removed The surcha...,"[(SriLanka, n), (surcharge, n), (fuel, n), (re...",SriLanka surcharge fuel remove surcharge Rs ...,0.0
2,413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,positive,1,-1,0.128788,0.318182,1,Net issuance increases to fund fiscal programs...,"[(Net, a), (issuance, n), (increases, n), (fun...",Net issuance increase fund fiscal program gt...,-0.25
3,760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,positive,1,1,0.2,0.4,1,RT bentboolean How much of Amazon s traffic is...,"[(RT, n), (bentboolean, v), (much, a), (Amazon...",RT bentboolean much Amazon traffic serve Fas...,0.625
4,830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive,1,0,0.8,0.75,1,AMD Ryzen desktop CPUs looking great and on t...,"[(AMD, n), (Ryzen, n), (desktop, n), (CPUs, n)...",AMD Ryzen desktop CPUs look great track laun...,0.25


In [17]:
def get_swn(score):
    """Reduce a SentiWordNet total to its sign.

    Parameters
    ----------
    score : number
        Summed polarity for one text.

    Returns
    -------
    int
        0 when the score equals zero, 1 when it is greater than zero,
        -1 in every other case.
    """
    if score == 0:
        return 0
    return 1 if score > 0 else -1

In [19]:
# Collapse each SentiWordNet total to a -1 / 0 / 1 label
df['swn_score'] = df['SWN'].apply(get_swn)

# Score SentiWordNet against the ground-truth labels, using only rows that
# have no missing value in any column
rows_without_nan = df.dropna()
swn_accuracy = accuracy_score(rows_without_nan["label"].values,
                              rows_without_nan["swn_score"].values)
print(f'Accuracy: {swn_accuracy}')

Accuracy: 0.4930769230769231
