In [1]:
#Naive Bayes classification algorithm for classifying movie reviews into positive and negative labels

In [2]:
#splitting our data corpus into training and test datasets

In [1]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
positiveReviewsFileName = "rt-polaritydata/rt-polaritydata/rt-polarity.pos"

In [4]:
with open(positiveReviewsFileName, 'r') as f: positiveReviews = f.readlines()

In [5]:
positiveReviews[0]

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n'

In [6]:
negativeReviewsFileName = "rt-polaritydata/rt-polaritydata/rt-polarity.neg"

In [7]:
with open(negativeReviewsFileName, 'r') as f: negativeReviews = f.readlines()

In [8]:
negativeReviews[0]

'simplistic , silly and tedious . \n'

In [9]:
testTrainingSplitIndex = 2500 #splitting the entire corpus of data into equal parts for test data and training data

In [10]:
testNegativeReviews = negativeReviews[testTrainingSplitIndex + 1]
testPositiveReviews = positiveReviews[testTrainingSplitIndex + 1]
trainingNegativeReviews = negativeReviews[:testTrainingSplitIndex]
trainingPositiveReviews = positiveReviews[:testTrainingSplitIndex]

In [11]:
#defining a vocabulary of entire training dataset
def getVocabulary():
    positiveWordList = [word for line in trainingPositiveReviews for word in line.split()]
    negativeWordList = [word for line in trainingNegativeReviews for word in line.split()]
    allWordList = [item for sublist in [positiveWordList, negativeWordList] for item in sublist]
    vocabulary = list(set(allWordList))
    return vocabulary

In [12]:
vocabulary = getVocabulary()
vocabulary[0], len(vocabulary)

('writings', 14102)

In [13]:
#extracting features 
def extract_features(review):
    review_words = set(review)
    features = {}
    for word in vocabulary:
        features[word] = (word in review_words)
    return features

In [14]:
def getTrainingData():
    negTaggedTrainingReviewList = [{'review': oneReview.split(), 'label':'negative'} for oneReview in trainingNegativeReviews]
    posTaggedTrainingReviewList = [{'review': oneReview.split(), 'label':'positive'} for oneReview in trainingPositiveReviews]
    fullTaggedTrainingData = [item for sublist in [negTaggedTrainingReviewList,posTaggedTrainingReviewList] for item in sublist]
    trainingData = [(review['review'], review['label']) for review in fullTaggedTrainingData ]
    return  trainingData

In [15]:
trainingData = getTrainingData()
trainingData[0]

(['simplistic', ',', 'silly', 'and', 'tedious', '.'], 'negative')

In [16]:
len(trainingData)

5000

In [19]:
def getTrainedNaiveBayesClassifier(extract_features, trainingData):
    print 'Training the classifier'
    trainingFeatures = nltk.classify.apply_features(extract_features, trainingData)
    trainedNBClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)
    print 'Training the classifier 6'
    return trainedNBClassifier

In [18]:
trainedNBClassifier = getTrainedNaiveBayesClassifier(extract_features, trainingData)

Training the classifier
Training the classifier 6


In [20]:
#classifying the test data as our trained model is ready
def naiveBayesSentimentCalculator(review):
    problemInstance = review.split()
    problemFeatures = extract_features(problemInstance)
    return trainedNBClassifier.classify(problemFeatures)

In [21]:
naiveBayesSentimentCalculator("What an awesome movie")

'positive'

In [22]:
def getTestReviewSentiments(naiveBayesSentimentCalculator):
    testNegResults = [naiveBayesSentimentCalculator(review) for review in testNegativeReviews]
    testPosResults = [naiveBayesSentimentCalculator(review) for review in testPositiveReviews]
    labelToNum = {'positive':1, 'negative': -1}
    numericNegResults = [labelToNum[x] for x in testNegResults]
    numericPosResults = [labelToNum[x] for x in testPosResults]
    return {'results-on-positive': numericPosResults, 'results-on-negative': numericNegResults }

In [23]:
def runDiagnostics(reviewResult):
    positiveReviewsResult = reviewResult['results-on-positive']
    negativeReviewsResult = reviewResult['results-on-negative']
    pctTruePositive = float(sum(x>0 for x in positiveReviewsResult))/len(positiveReviewsResult)
    pctTrueNegative = float(sum(x<0 for x in negativeReviewsResult))/len(negativeReviewsResult)
    totalAccurate = float(sum(x>0 for x in positiveReviewsResult)) + float(sum(x<0 for x in negativeReviewsResult))
    total = len(positiveReviewsResult) + len(negativeReviewsResult)
    Accuracy = float(totalAccurate)/total
    print 'Accuracy on positive reviews = '+'%.2f'% (pctTruePositive*100)+'%'
    print 'Accuracy on negative reviews = '+'%.2f'% (pctTrueNegative*100)+'%'
    print 'Overall Accuracy = '+'%.2f'% (Accuracy*100)+'%'

In [25]:
runDiagnostics(getTestReviewSentiments(naiveBayesSentimentCalculator))

Accuracy on positive reviews = 44.70%
Accuracy on negative reviews = 64.58%
Overall Accuracy = 50.00%


In [26]:
naiveBayesSentimentCalculator("What a terrible movie")

'negative'