# Sentiment Analysis using Naive Bayes.

First, the necessary imports.

In [39]:
import re, math, collections, itertools, os
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import precision
from nltk.metrics import recall
import random

The source code for NLTK Naive Bayes is at http://www.nltk.org/_modules/nltk/classify/naivebayes.html

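Next, set the paths to the dataset. The data consists of sentences from movie reviews, split into one file of positive examples and one file of negative examples.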

In [40]:
# Directory holding the polarity corpus. The path is relative, so it is
# resolved against the current working directory when the files are opened.
POLARITY_DATA_DIR = os.path.join('polarityData', 'rt-polaritydata')
# Text files of positive and negative examples; the cells below read them
# line by line and treat each line as one example.
RT_POLARITY_POS_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-pos.txt')
RT_POLARITY_NEG_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-neg.txt')

In [41]:
def _read_labeled_examples(path, label, feature_select):
    """Return one ``(featureset, label)`` pair for every line of ``path``.

    Each line is split into word and punctuation tokens, and the token list
    is handed to ``feature_select`` to build that example's featureset.
    """
    # Token pattern taken from
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    examples = []
    with open(path, 'r') as sentences:
        for sentence in sentences:
            tokens = re.findall(r"[\w']+|[.,!?;]", sentence.rstrip())
            examples.append((feature_select(tokens), label))
    return examples


def _as_percent(value):
    """Format a 0-1 metric as a percentage, or 'n/a' when it is undefined.

    NLTK's precision/recall return None when the set they divide by is empty
    (for example, when a label is never predicted).
    """
    if value is None:
        return 'n/a'
    return '%.2f' % (100 * value)


def evaluate_features(feature_select, trainingPercent=0.70, seed=None):
    """Train and evaluate a Naive Bayes classifier for one feature selector.

    Reads the positive and negative sentence files, converts every sentence
    into a featureset with ``feature_select``, trains on the first
    ``trainingPercent`` of each (shuffled) class and tests on the rest.
    Accuracy, per-label precision/recall and the ten most informative
    features are printed; nothing is returned.

    Parameters:
        feature_select: callable taking a list of tokens and returning the
            featureset passed to the classifier.
        trainingPercent: fraction of each class used for training. Typical
            splits are 0.70 or 0.75; the remainder is used for testing.
        seed: optional seed for the shuffle. When None (the default) the
            module-level ``random`` state is used, as before, so results
            vary from run to run; pass a value for a reproducible split.
    """
    posExamples = _read_labeled_examples(RT_POLARITY_POS_FILE, 'pos', feature_select)
    negExamples = _read_labeled_examples(RT_POLARITY_NEG_FILE, 'neg', feature_select)

    # Shuffle each class separately so the train/test cut is not tied to file order.
    # A private Random instance keeps a seeded run from disturbing global state.
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    shuffle(posExamples)
    shuffle(negExamples)

    # Cut each class at the same fraction so both splits keep the class balance.
    posCutoff = int(math.floor(len(posExamples) * trainingPercent))
    negCutoff = int(math.floor(len(negExamples) * trainingPercent))
    trainExamples = posExamples[:posCutoff] + negExamples[:negCutoff]
    testExamples = posExamples[posCutoff:] + negExamples[negCutoff:]

    # Trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainExamples)

    # referenceSets holds the indices of test examples by true label,
    # testSets holds the same indices by predicted label.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testExamples):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # Prints metrics to show how well the feature selection did.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('train on %d instances, test on %d instances' % (len(trainExamples), len(testExamples)))
    print('accuracy: %.2f' % (100 * nltk.classify.util.accuracy(classifier, testExamples)))
    print('pos precision: %s' % _as_percent(precision(referenceSets['pos'], testSets['pos'])))
    print('pos recall: %s' % _as_percent(recall(referenceSets['pos'], testSets['pos'])))
    print('neg precision: %s' % _as_percent(precision(referenceSets['neg'], testSets['neg'])))
    print('neg recall: %s' % _as_percent(recall(referenceSets['neg'], testSets['neg'])))
    classifier.show_most_informative_features(10)

In [42]:
def make_full_dict(words):
    """Feature selector that keeps every word.

    Returns a dict mapping each distinct word in ``words`` to True.
    """
    return dict.fromkeys(words, True)
    
# Baseline run: every token in a sentence becomes a feature.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print('using all words as features')
evaluate_features(make_full_dict)

using all words as features
train on 7462 instances, test on 3200 instances
accuracy: 76.53
pos precision: 77.02
pos recall: 75.62
neg precision: 76.06
neg recall: 77.44
Most Informative Features
                    flat = True              neg : pos    =     21.0 : 1.0
                    warm = True              pos : neg    =     16.3 : 1.0
              delightful = True              pos : neg    =     13.0 : 1.0
                mediocre = True              neg : pos    =     13.0 : 1.0
                    dull = True              neg : pos    =     12.3 : 1.0
                  stupid = True              neg : pos    =     11.7 : 1.0
               inventive = True              pos : neg    =     11.7 : 1.0
              refreshing = True              pos : neg    =     11.0 : 1.0
               affecting = True              pos : neg    =     11.0 : 1.0
            refreshingly = True              pos : neg    =     11.0 : 1.0


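What this means: each row compares how likely a feature is under one label versus the other. In the output above, for example, the classifier found the word "flat" to be 21.0 times more indicative of a negative review than of a positive one, and the word "warm" to be 16.3 times more indicative of a positive review than of a negative one. Because the examples are shuffled randomly before the train/test split, the exact words and ratios change from run to run. For more details, look at http://www.nltk.org/book/ch06.html#document-classify-smif. The NLTK code is at http://www.nltk.org/_modules/nltk/classify/naivebayes.html.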

In [43]:
def create_word_scores():
    """Score every word by how strongly it is associated with a label.

    Reads the positive and negative sentence files, lowercases every token
    and counts it overall and per label. Each word's score is the sum of its
    chi-squared association with the 'pos' label and with the 'neg' label.

    Returns:
        dict mapping each lowercased word to its combined chi-squared score.
    """
    # Overall word frequencies, and word frequencies within each label.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()

    # Count tokens while reading instead of first materialising every token
    # in intermediate lists. The positive file is read before the negative one.
    labeled_files = ((RT_POLARITY_POS_FILE, 'pos'), (RT_POLARITY_NEG_FILE, 'neg'))
    for path, label in labeled_files:
        with open(path, 'r') as sentences:
            for sentence in sentences:
                for token in re.findall(r"[\w']+|[.,!?;]", sentence.rstrip()):
                    word = token.lower()
                    word_fd[word] += 1
                    cond_word_fd[label][word] += 1

    # Number of positive and negative tokens, and the total number of tokens.
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # chi_sq takes (count of word within label, (count of word overall,
    # count of all tokens in label), count of all tokens).
    # .items() is used because it exists on both Python 2 and Python 3.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores

In [44]:
# Finds word scores. They are computed once here and reused by the loop below
# for every feature count, rather than being recomputed on each iteration.
word_scores = create_word_scores()

def find_best_words(word_scores, number):
    """Return the set of the ``number`` highest-scoring words.

    Parameters:
        word_scores: dict mapping word -> score.
        number: how many of the top-scoring words to keep. If it exceeds the
            vocabulary size, every word is returned.

    Returns:
        set of the selected words.
    """
    # Sort the words by score via word_scores.get. This avoids the
    # tuple-unpacking lambda and iteritems(), neither of which exists on
    # Python 3. The sort is stable, so ties keep the dict's iteration order.
    ranked = sorted(word_scores, key=word_scores.get, reverse=True)
    return set(ranked[:number])

def best_word_features(words, selected=None):
    """Feature selector that keeps only the words in a chosen vocabulary.

    Parameters:
        words: iterable of tokens from one sentence.
        selected: container of words to keep. When omitted, the module-level
            ``best_words`` is used, which must already be bound; this
            preserves the original one-argument call made by
            ``evaluate_features``.

    Returns:
        dict mapping each kept word to True.
    """
    # NOTE(review): create_word_scores lowercases tokens before scoring, while
    # this lookup uses the tokens as given, so a token containing uppercase
    # characters will never match a vocabulary built from those scores.
    chosen = best_words if selected is None else selected
    return dict((word, True) for word in words if word in chosen)

# Numbers of features to select
numbers_to_test = [10, 100, 1000, 10000, 15000]
# Tries the best_word_features mechanism with each of the numbers_to_test of features
for num in numbers_to_test:
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    print('evaluating best %d word features' % (num))
    # best_word_features reads the module-level best_words when called with
    # only the token list, so it has to be rebound before every evaluation.
    best_words = find_best_words(word_scores, num)
    evaluate_features(best_word_features)

evaluating best 10 word features
train on 7462 instances, test on 3200 instances
accuracy: 56.66
pos precision: 54.36
pos recall: 83.06
neg precision: 64.11
neg recall: 30.25
Most Informative Features
                    dull = True              neg : pos    =     12.8 : 1.0
                  boring = True              neg : pos    =     10.1 : 1.0
                     bad = True              neg : pos    =      5.3 : 1.0
                     too = True              neg : pos    =      3.6 : 1.0
                       ? = True              neg : pos    =      2.2 : 1.0
                      no = True              neg : pos    =      2.2 : 1.0
                    just = True              neg : pos    =      1.9 : 1.0
                   movie = True              neg : pos    =      1.6 : 1.0
                     and = True              pos : neg    =      1.2 : 1.0
                     and = None              neg : pos    =      1.2 : 1.0
evaluating best 100 word features
train on 7462 i