Analysing a dataset of tweets using:
    1. VADER (developed at Georgia Tech; built into nltk)
    2. SentiWordNet

# 01- Using Vader Sentiment Analyser (Basics):

In [48]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [13]:
# One-time setup: uncomment and run to download the VADER lexicon used by SentimentIntensityAnalyzer.
#nltk.download('vader_lexicon')

In [3]:
# Single shared VADER analyser; the demo cells and add_sentiment() all score text through it.
sia = SentimentIntensityAnalyzer()

In [21]:
# Normal sentence: plain text with no emoticons, emphasis or negation.
# polarity_scores() returns the neg / neu / pos proportions plus an overall 'compound' score.
print(sia.polarity_scores("What a terrible restaurant"))

{'neg': 0.608, 'neu': 0.392, 'pos': 0.0, 'compound': -0.4767}


In [26]:
# Emoticons (first two samples), followed by an idiom (last sample):
print(*(sia.polarity_scores(sample) for sample in (
    ":D",
    ":/",
    "the cumin was the kiss of death",
)), sep="\n")

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.5106}
{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.34}
{'neg': 0.5, 'neu': 0.5, 'pos': 0.0, 'compound': -0.6124}


In [32]:
# Punctuation: the same sentence with zero, one and two exclamation marks.
print(*(sia.polarity_scores("the food was good" + marks)
        for marks in ("", "!", "!!")), sep="\n")

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}
{'neg': 0.0, 'neu': 0.484, 'pos': 0.516, 'compound': 0.4926}
{'neg': 0.0, 'neu': 0.463, 'pos': 0.537, 'compound': 0.5399}


In [35]:
# Negation (first sample) and double negation (second sample):
print(*(sia.polarity_scores(sample) for sample in (
    "the food was not good!!",
    "the food was not the worst!!",
)), sep="\n")

{'neg': 0.428, 'neu': 0.572, 'pos': 0.0, 'compound': -0.457}
{'neg': 0.0, 'neu': 0.563, 'pos': 0.437, 'compound': 0.5964}


In [40]:
# Emphasis & boosters (so, such, really): baseline, ALL-CAPS emphasis, then a booster word.
print(*(sia.polarity_scores(sample) for sample in (
    "the food was good",
    "the food was GOOD",
    "the food was so good",
)), sep="\n")

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}
{'neg': 0.0, 'neu': 0.452, 'pos': 0.548, 'compound': 0.5622}
{'neg': 0.0, 'neu': 0.517, 'pos': 0.483, 'compound': 0.5777}


In [47]:
# Contrasting conjunction ("but") first; then the same sentence joined with "and",
# which is the incorrect prediction (see the sign of its compound score).
print(*(sia.polarity_scores(sample) for sample in (
    "I usually hate sea food but I liked this",
    "I usually hate sea food and I liked this",
)), sep="\n")

{'neg': 0.213, 'neu': 0.452, 'pos': 0.335, 'compound': 0.3291}
{'neg': 0.322, 'neu': 0.435, 'pos': 0.243, 'compound': -0.2263}


# 02 - Using Vader on the data and checking its accuracy

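Here I think the data we used does not have accurate labels, which would explain the low accuracy below. (An accuracy well under 50% on a binary task also suggests the label encoding may be inverted relative to the 1 = positive convention used by the predictions, e.g. label 1 marking negative tweets; this is worth checking against the dataset description.)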

In [155]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
def acc(a,b):
    """Print accuracy, confusion matrix and classification report.

    `a` and `b` are forwarded, in that order, to scikit-learn's
    accuracy_score, confusion_matrix and classification_report, so `a`
    plays the role of the true labels and `b` the predictions.
    Nothing is returned; the three summaries are written to stdout.
    """
    overall = accuracy_score(a,b)
    print('Accuracy(test Data): ',overall)

    # Wrapped in a DataFrame so the matrix prints with row/column indices.
    matrix = pd.DataFrame(confusion_matrix(a,b))
    print('Confusion Matrix: \n',matrix)

    report = classification_report(a,b,digits=3)
    print('Classification Repost:\n',report)
    
def add_sentiment(text):
    """Classify `text` with VADER: 1 if the compound score is above zero, else 0.

    Uses the module-level `sia` analyser. A compound score of exactly zero
    (neutral) is mapped to 0, the same as a negative score.
    """
    compound = sia.polarity_scores(text)['compound']
    return 1 if compound > 0 else 0

In [169]:
# Load the labelled tweets. Later cells read the `tweet` and `label` columns.
df = pd.read_csv("data2.csv",engine='python')
tweets = list(df.tweet)

# Tokens dropped during preprocessing: English stop-words, single punctuation
# characters and a few extra tokens.
# word_tokenize() splits possessives/contractions into a separate "'s" token,
# so "'s" is listed explicitly; the original "'s'" entry could never match a
# token and is kept only so the set remains a superset of the old one.
_stopwords = set(stopwords.words('english') + list(punctuation) +
                 ["million","billion","year","millions","billions",
                  "y/y","'s","'s'","''"])

In [198]:
# Build a cleaned version of every tweet: lower-case it, tokenise it, drop the
# tokens listed in `_stopwords`, and glue the survivors back into one
# space-separated string. The results are stored in the `new_twt` column.
texts = []
for tweet in tweets:
    words = ' '.join(
        token
        for token in word_tokenize(tweet.lower())
        if token not in _stopwords
    )
    texts.append(words)
df['new_twt'] = texts

In [172]:
# VADER predictions on the raw tweets (pred1) and on the cleaned tweets (pred2).
df['pred1'] = df['tweet'].apply(add_sentiment)
df['pred2'] = df['new_twt'].apply(add_sentiment)

# Only the cleaned-text predictions are evaluated against the labels here.
acc(df['label'], df['pred2'])

Accuracy(test Data):  0.37199542159481114
Confusion Matrix: 
       0     1
0  1308  1692
1  1600   642
Classification Repost:
               precision    recall  f1-score   support

           0      0.450     0.436     0.443      3000
           1      0.275     0.286     0.281      2242

    accuracy                          0.372      5242
   macro avg      0.362     0.361     0.362      5242
weighted avg      0.375     0.372     0.373      5242



# 03-Using Sentiwordnet Basics:

In [179]:
from nltk.corpus import sentiwordnet as swn

In [176]:
# One-time setup: uncomment and run to download the SentiWordNet corpus used by `swn`.
#nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\Starkx\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\sentiwordnet.zip.


True

In [197]:
# List every SentiWordNet synset (word sense) recorded for 'Dog'.
# These are the different meanings of the word, not a list of similar words.
list(swn.senti_synsets('Dog'))

[SentiSynset('dog.n.01'),
 SentiSynset('frump.n.01'),
 SentiSynset('dog.n.03'),
 SentiSynset('cad.n.01'),
 SentiSynset('frank.n.02'),
 SentiSynset('pawl.n.01'),
 SentiSynset('andiron.n.01'),
 SentiSynset('chase.v.01')]

In [243]:
def get_senti(tweet):
    """Classify `tweet` with SentiWordNet: 1 if its net polarity is positive, else 0.

    The tweet is lower-cased, tokenised with word_tokenize and filtered against
    the module-level `_stopwords`. For every remaining token, the
    (positive - negative) score of each of its SentiWordNet synsets is added
    to a running total; the tweet is labelled 1 when that total is above zero
    and 0 otherwise (including when no token has any synset).

    Tokens that SentiWordNet fails on are skipped (best effort), but
    KeyboardInterrupt / SystemExit are no longer swallowed, so a long
    `.map(get_senti)` run can still be interrupted.
    """
    weight = 0.0
    tokens = [token for token in word_tokenize(tweet.lower())
              if token not in _stopwords]
    for token in tokens:
        try:
            for meaning in swn.senti_synsets(token):
                # Net polarity of this sense: positive when pos > neg, negative
                # otherwise. One expression covers both original branches.
                weight += meaning.pos_score() - meaning.neg_score()
            # TODO: average the weight per token (in a separate variable) so
            # that words with many senses do not dominate the total.
        except Exception:
            # Best effort: ignore a token the corpus reader cannot handle.
            continue
    return 1 if weight > 0 else 0

In [248]:
# SentiWordNet prediction for each raw tweet (get_senti does its own tokenising and stop-word filtering).
df['pred3'] = df['tweet'].map(get_senti)

In [249]:
# Evaluate the SentiWordNet predictions against the dataset labels.
acc(df.label,df.pred3)

Accuracy(test Data):  0.3992750858450973
Confusion Matrix: 
      0     1
0  846  2154
1  995  1247
Classification Repost:
               precision    recall  f1-score   support

           0      0.460     0.282     0.350      3000
           1      0.367     0.556     0.442      2242

    accuracy                          0.399      5242
   macro avg      0.413     0.419     0.396      5242
weighted avg      0.420     0.399     0.389      5242



In [242]:
# Inspect the cleaned tweets.
# NOTE(review): the saved output shows lists of tokens, whereas the preprocessing cell
# joins the tokens into one string per tweet, so the output looks stale; re-run to refresh.
df.new_twt

0       [user, father, dysfunctional, selfish, drags, ...
1       [user, user, thanks, lyft, credit, ca, n't, us...
2                                       [bihday, majesty]
3                  [model, love, u, take, u, time, ur, ñ]
4                       [factsguide, society, motivation]
                              ...                        
5237    [lady, banned, kentucky, mall, user, jcpenny, ...
5238    [user, omfg, 'm, offended, 'm, mailbox, 'm, pr...
5239    [user, user, n't, balls, hashtag, say, weasel,...
5240     [makes, ask, anybody, ..., .god, oh, thank, god]
5241    [user, sikh, temple, vandalised, calgary, wso,...
Name: new_twt, Length: 5242, dtype: object