# Import Necessary Packages

In [1]:
import nltk
from random import shuffle
from statistics import mean
from nltk.sentiment import SentimentIntensityAnalyzer

# Single module-level analyzer instance; the scoring helpers in the cells
# below all call `sia.polarity_scores(...)` on it.
sia = SentimentIntensityAnalyzer()

# Sentiment Analysis on Twitter Dataset

In [2]:
# Load every tweet from the NLTK twitter_samples corpus and apply the single
# preprocessing step: rewrite "://" as "//" inside each tweet.
# NOTE(review): presumably this keeps the ":/" inside URLs from being scored
# as an emoticon -- confirm against the analyzer's lexicon.
tweets = [
    raw_tweet.replace("://", "//")
    for raw_tweet in nltk.corpus.twitter_samples.strings()
]

# Binary classifier for a single tweet: positive vs. not positive.
def is_positive_twitter(tweet):
    """Return True when the analyzer's compound score for `tweet` is above zero.

    A compound score of exactly 0 or below yields False.
    """
    compound = sia.polarity_scores(tweet)["compound"]
    return compound > 0

# Randomise the tweet order in place, then preview the first ten tweets, each
# prefixed with its predicted label wrapped in ANSI colour escape codes.
shuffle(tweets)
for tweet in tweets[:10]:
    label = "\033[31m" + str(is_positive_twitter(tweet)) + "\033[0m"
    print(">", label, tweet)
    print("\n")

> [31mFalse[0m It's true... :-( http//t.co/G3gV2f73Bh


> [31mFalse[0m @Cheeky_rob123 @Rviver1979 just said I wouldn't vote Tory you melt! 😂


> [31mFalse[0m @Channel4News @Jag_Paw_Jack well labours just lost Scotland u fuckin Tories in red ties what happened to the working class????????


> [31mTrue[0m Off to the USA! Rosh and I on our travels again :) (@ Glasgow International Airport (GLA) - @gla_airport) https//t.co/FS71kc8FZe


> [31mFalse[0m miliband the bawless wonder . http//t.co/3pyI73g0Rt


> [31mFalse[0m junmyeon looks so d*ddy here :( LOOK https//t.co/xSRggfOijW


> [31mTrue[0m RT @Nigel_Farage: I'm proud of #UKIP's health policy, which the public has voted as the most popular #AskNigelFarage http//t.co/qEeaFWexC6


> [31mFalse[0m @OwenJones84 Let's also remember, Owen, that Ed Miliband doesn't believe the last Labour govt over-spent. #bbcqt


> [31mFalse[0m RT @JASEMARKRUTTER: David Cameron says no more tax rises until 2020. Why don't we believe him? Let's

# Sentiment Analysis on Movies Dataset

In [3]:
# Collect the movie-review file ids for each sentiment category, plus the
# combined list (positive ids first, then negative ids).
positive_review_ids, negative_review_ids = (
    nltk.corpus.movie_reviews.fileids(categories=[label])
    for label in ("pos", "neg")
)
all_review_ids = positive_review_ids + negative_review_ids

# Binary classifier for a single movie review: positive vs. not positive.
def is_positive_movies(review_id):
    """Classify one movie review as positive or not.

    The raw text of `review_id` is read from the NLTK movie_reviews corpus,
    split into sentences, and every sentence is scored with the shared
    analyzer `sia`.

    Args:
        review_id: File id of a review in the movie_reviews corpus.

    Returns:
        True if the mean of the per-sentence compound scores is greater than
        0; False otherwise, including when the review yields no sentences.
    """
    text = nltk.corpus.movie_reviews.raw(review_id)
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(text)
    ]
    # statistics.mean raises StatisticsError on an empty sequence, so a review
    # without any sentence is reported as "not positive" instead of crashing.
    if not scores:
        return False
    return mean(scores) > 0

# shuffle dataset
shuffle(all_review_ids)

# Label lookups as frozensets: `in` on the original id lists is a linear scan,
# which the scoring loop would otherwise repeat for every single review.
positive_id_lookup = frozenset(positive_review_ids)
negative_id_lookup = frozenset(negative_review_ids)

# calculate % of correct response from model: a prediction counts as correct
# when the review id belongs to the category the model predicted.
correct = 0
for review_id in all_review_ids:
    if is_positive_movies(review_id):
        expected_ids = positive_id_lookup
    else:
        expected_ids = negative_id_lookup
    if review_id in expected_ids:
        correct += 1
print(F"{correct / len(all_review_ids):.2%} correct")

64.05% correct


# Custom NLTK Sentiment Analysis

In [4]:
# Words to exclude from the analysis: the English stop words plus every entry
# of the NLTK names corpus, lower-cased.
# Built as a set because it is only used for membership tests, and
# `skip_unwanted` performs one such test per corpus token; scanning a list
# for every token is needlessly slow.
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted.update(w.lower() for w in nltk.corpus.names.words())

def skip_unwanted(pos_tuple):
    """Filter predicate for (word, tag) pairs: True means the pair is KEPT.

    Despite the name, a True result marks a pair that survives filtering.
    A pair is rejected (False) when the word is not purely alphabetic, when
    the word appears in the module-level `unwanted` collection, or when the
    tag starts with "NN".
    """
    word, tag = pos_tuple
    is_candidate = word.isalpha() and word not in unwanted
    return is_candidate and not tag.startswith("NN")

# POS-tag every token of the positive reviews and keep only the words whose
# (word, tag) pair passes skip_unwanted.
positive_words = [
    tagged[0]
    for tagged in nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
    if skip_unwanted(tagged)
]

# Same tagging and filtering for the tokens of the negative reviews.
negative_words = [
    tagged[0]
    for tagged in nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
    if skip_unwanted(tagged)
]

# Count how often each retained word occurs within each class.
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

# Words seen in both classes are removed from both distributions, leaving
# only the words that are exclusive to one class.
common_set = positive_fd.keys() & negative_fd.keys()
for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

# The (up to) 100 most frequent remaining words of each class, as sets.
top_100_positive = {entry[0] for entry in positive_fd.most_common(100)}
top_100_negative = {entry[0] for entry in negative_fd.most_common(100)}

# English stop words plus lower-cased names-corpus entries, held in a set so
# that the per-token `not in` checks below are hash lookups rather than scans
# of a list with several thousand entries.
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted.update(w.lower() for w in nltk.corpus.names.words())


def _bigram_finder_for(category):
    """Build a bigram finder over one review category's alphabetic words that are not in `unwanted`."""
    return nltk.collocations.BigramCollocationFinder.from_words([
        w for w in nltk.corpus.movie_reviews.words(categories=[category])
        if w.isalpha() and w not in unwanted
    ])


# positive and negative bigram finders
positive_bigram_finder = _bigram_finder_for("pos")
negative_bigram_finder = _bigram_finder_for("neg")

# Training and Using a Classifier

In [5]:
def extract_features(text):
    """Turn one review text into a feature dict for the classifier.

    The text is split into sentences; each sentence is tokenised to count
    words and scored once with the shared analyzer `sia`.

    Args:
        text: Raw review text.

    Returns:
        dict with three entries:
            "mean_compound": mean per-sentence compound score, shifted by +1.
            "mean_positive": mean per-sentence "pos" score.
            "wordcount": number of tokens whose lower-cased form is in
                `top_100_positive`.
        When `text` yields no sentences the means fall back to a score of 0,
        giving {"mean_compound": 1.0, "mean_positive": 0.0, "wordcount": 0}.
    """
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()

    for sentence in nltk.sent_tokenize(text):
        wordcount += sum(
            1
            for word in nltk.word_tokenize(sentence)
            if word.lower() in top_100_positive
        )
        # Score the sentence once and read both values from the same result
        # instead of running the analyzer twice on identical input.
        sentence_scores = sia.polarity_scores(sentence)
        compound_scores.append(sentence_scores["compound"])
        positive_scores.append(sentence_scores["pos"])

    # statistics.mean raises StatisticsError on an empty sequence, so text
    # without any sentence falls back to a mean of 0.0.
    mean_compound = mean(compound_scores) if compound_scores else 0.0
    mean_positive = mean(positive_scores) if positive_scores else 0.0

    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers you'll use later don't work with negative numbers.
    features["mean_compound"] = mean_compound + 1
    features["mean_positive"] = mean_positive
    features["wordcount"] = wordcount

    return features

# Build the labelled feature set: one (feature dict, label) pair per review,
# with all "pos" reviews first and all "neg" reviews after them.
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), label)
    for label in ("pos", "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=[label])
]

In [6]:
# Shuffle the labelled examples in place, then split them: the first quarter
# trains the classifier and the remaining three quarters are held out.
shuffle(features)
train_count = len(features) // 4
training_examples = features[:train_count]
held_out_examples = features[train_count:]

classifier = nltk.NaiveBayesClassifier.train(training_examples)
classifier.show_most_informative_features(10)

# Last expression of the cell, so the notebook displays the held-out accuracy.
nltk.classify.accuracy(classifier, held_out_examples)

Most Informative Features
               wordcount = 4                 pos : neg    =      3.9 : 1.0
               wordcount = 2                 pos : neg    =      3.8 : 1.0
               wordcount = 0                 neg : pos    =      1.7 : 1.0
               wordcount = 1                 pos : neg    =      1.7 : 1.0
           mean_positive = 0.1245            pos : neg    =      1.0 : 1.0


0.6626666666666666