In [5]:
import nltk
from nltk.sentiment import vader

### Opening positive and negative reviews files

In [6]:
positiveReviewsFileName = "rt-polarity.pos"
with open (positiveReviewsFileName, 'r') as f:
    positiveReviews = f.readlines()

In [7]:
negativeReviewsFileName = "rt-polarity.neg"
with open (negativeReviewsFileName, 'r') as f:
    negativeReviews = f.readlines()

In [8]:
testTrainingSplitIndex = 2500

In [9]:
testNegativeReviews = negativeReviews[testTrainingSplitIndex+1:]
testPositiveReviews = positiveReviews[testTrainingSplitIndex+1:]

In [10]:
trainingNegativeReviews = negativeReviews[:testTrainingSplitIndex]
trainingPositiveReviews = positiveReviews[:testTrainingSplitIndex]

### Making lists with positive and negative words, then join them in a set with all the words

In [11]:
positiveWordList = [word for line in trainingPositiveReviews for word in line.split()]
negativeWordList = [word for line in trainingNegativeReviews for word in line.split()]
allWordList = [item for sublist in[positiveWordList, negativeWordList] for item in sublist]
vocabulary = list(set(allWordList))

### Function for extracting the features

In [12]:
def extract_features(review):
    review_words = set(review)
    features = {}
    for word in vocabulary:
        features[word] = (word in review_words)
    return features

### Defining the training data.

In [13]:
netTaggedTrainingReviewList = [{'review':oneReview.split(), 'label': 'negative'}for oneReview in trainingNegativeReviews]
posTaggedTrainingReviewList = [{'review':oneReview.split(), 'label': 'positive'}for oneReview in trainingPositiveReviews]
fullTaggedTrainingData = [item for sublist in[netTaggedTrainingReviewList, posTaggedTrainingReviewList] for item in sublist]
trainingData = [(review['review'], review['label']) for review in fullTaggedTrainingData]

In [14]:
trainingFeatures = nltk.classify.apply_features(extract_features, trainingData)

### Function that classifies a sentence in negative or positive using naive Bayes.

In [16]:
def naibeBayesSentimentCalculator(review):
    problemInstance = review.split()
    problemFeatures = extract_features(problemInstance)
    return trainedNBClassifier.classify(problemFeatures)

In [17]:
trainedNBClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)

In [19]:
naibeBayesSentimentCalculator("Best movie ever")

'negative'

### Functions to get the accuracy of the naive bayes model.

In [20]:
def getTestReviewSentiments(naibeBayesSentimentCalculator):
    testNegResults = [naibeBayesSentimentCalculator(review) for review in testNegativeReviews]
    testPosResults = [naibeBayesSentimentCalculator(review) for review in testPositiveReviews]
    labelToNum = {'positive':1, 'negative':-1}
    numericNegResults = [labelToNum[x] for x in testNegResults]
    numericPosResults = [labelToNum[x] for x in testPosResults]
    return {'results-on-positive': numericPosResults, 'results-on-negative':numericNegResults}

In [21]:
def runDiagnostics(reviewResult):
    positiveReviewsResult = reviewResult['results-on-positive']
    negativeReviewsResult = reviewResult['results-on-negative']
    numTruePositive = sum(x > 0 for x in positiveReviewsResult)
    numTrueNegative = sum(x < 0 for x in negativeReviewsResult)
    pctTruePositive = float(numTruePositive)/len(positiveReviewsResult)
    pctTrueNegative = float(numTrueNegative)/len(negativeReviewsResult)
    totalAccurate = numTrueNegative + numTruePositive
    total = len(positiveReviewsResult) + len(negativeReviewsResult)
    print("Accuracy on positive reviews = " + "%.2f" % (pctTruePositive * 100) + "%")
    print("Accuracy on negative reviews = " + "%.2f" % (pctTrueNegative * 100) + "%")
    print("Overall accuracy =" + "%.2f" % (totalAccurate * 100/total) + "%")
    

In [22]:
runDiagnostics(getTestReviewSentiments(naibeBayesSentimentCalculator))

Accuracy on positive reviews = 73.39%
Accuracy on negative reviews = 77.07%
Overall accuracy =75.23%
