In [2]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

testing TextBlob

In [20]:
analysis = TextBlob("TextBlob has some interesting features!")

In [4]:
print(dir(analysis))

['__add__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cmpkey', '_compare', '_create_sentence_objects', '_strkey', 'analyzer', 'classifier', 'classify', 'correct', 'ends_with', 'endswith', 'find', 'format', 'index', 'join', 'json', 'lower', 'ngrams', 'noun_phrases', 'np_counts', 'np_extractor', 'parse', 'parser', 'polarity', 'pos_tagger', 'pos_tags', 'raw', 'raw_sentences', 'replace', 'rfind', 'rindex', 'sentences', 'sentiment', 'sentiment_assessments', 'serialized', 'split', 'starts_with', 'startswith', 'string', 'strip', 'stripped', 'subjectivity', 'tags', 'title', 'to_json', 'tokenize', 'tokenizer', 'tokens', 'upper', 'word_counts',

In [9]:
!python3 -m textblob.download_corpora

[nltk_data] Downloading package brown to /home/sakshi/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /home/sakshi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sakshi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/sakshi/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /home/sakshi/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/sakshi/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [12]:
print(analysis.tags)

[('TextBlob', 'NNP'), ('has', 'VBZ'), ('some', 'DT'), ('interesting', 'JJ'), ('features', 'NNS')]


In [11]:
print(analysis.sentiment)

Sentiment(polarity=0.5, subjectivity=0.5)


assume positive has only positive and negative has only negative sentences  
use TextBlob to analyze sentiment polarity  
count correct predictions where polarity > 0.1 for positive and ≤ 0.1 for negative  

In [15]:
pos_count = 0
pos_correct = 0

with open("positive","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity > 0.1:
            pos_correct += 1
        pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.polarity <= 0.1:
            neg_correct += 1
        neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 60.49128070504407% via 5333 samples
Negative accuracy = 68.38552409525596% via 5333 samples


filter sentences that have subjectivity greater than 0.5  
but this reduces the sample size

In [18]:
from textblob import TextBlob

pos_count = 0
pos_correct = 0

with open("positive","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)

        if analysis.sentiment.subjectivity > 0.5:
            if analysis.sentiment.polarity > 0:
                pos_correct += 1
            pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative","r") as f:
    for line in f.read().split('\n'):
        analysis = TextBlob(line)
        if analysis.sentiment.subjectivity > 0.5:
            if analysis.sentiment.polarity <= 0:
                neg_correct += 1
            neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 82.02557898375389% via 2893 samples
Negative accuracy = 52.00506542845083% via 2369 samples


## testing VADER

gives positive, negative, neutral and compound score
compound score is between -1 (most extreme negative) to +1 (most extreme positive)

In [24]:
analyzer = SentimentIntensityAnalyzer()
vs = analyzer.polarity_scores("VADER Sentiment looks interesting, I have high hopes!")
print(vs)

{'neg': 0.0, 'neu': 0.509, 'pos': 0.491, 'compound': 0.6996}


In [25]:
pos_count = 0
pos_correct = 0

with open("positive","r") as f:
    for line in f.read().split('\n'):
        vs = analyzer.polarity_scores(line)
        if vs['compound'] > 0:
            pos_correct += 1
        pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative","r") as f:
    for line in f.read().split('\n'):
        vs = analyzer.polarity_scores(line)
        if vs['compound'] <= 0:
            neg_correct += 1
        neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 70.16688543033939% via 5333 samples
Negative accuracy = 57.39733733358335% via 5333 samples


using a threshold is recommended by the documentation

In [26]:
pos_count = 0
pos_correct = 0

threshold = 0.5

with open("positive","r") as f:
    for line in f.read().split('\n'):
        vs = analyzer.polarity_scores(line)

        if vs['compound'] >= threshold or vs['compound'] <= -threshold:
            if vs['compound'] > 0:
                pos_correct += 1
            pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative","r") as f:
    for line in f.read().split('\n'):
        vs = analyzer.polarity_scores(line)
        if vs['compound'] >= threshold or vs['compound'] <= -threshold:
            if vs['compound'] <= 0:
                neg_correct += 1
            neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 87.66037735849056% via 2650 samples
Negative accuracy = 49.56140350877193% via 1824 samples


looking for no conflict  
so to classify something as positive, the negative score should be less than 0.1  
and vice versa

In [28]:
pos_count = 0
pos_correct = 0

with open("positive","r") as f:
    for line in f.read().split('\n'):
        vs = analyzer.polarity_scores(line)
        if not vs['neg'] > 0.1:
            if vs['pos']-vs['neg'] > 0:
                pos_correct += 1
            pos_count +=1


neg_count = 0
neg_correct = 0

with open("negative","r") as f:
    for line in f.read().split('\n'):
        vs = analyzer.polarity_scores(line)
        if not vs['pos'] > 0.1:
            if vs['pos']-vs['neg'] <= 0:
                neg_correct += 1
            neg_count +=1

print("Positive accuracy = {}% via {} samples".format(pos_correct/pos_count*100.0, pos_count))
print("Negative accuracy = {}% via {} samples".format(neg_correct/neg_count*100.0, neg_count))

Positive accuracy = 81.01049548450085% via 4097 samples
Negative accuracy = 89.27229244960711% via 2927 samples
