In [17]:
import nltk
import random
import pandas as pd
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

In [19]:
# Download and Load Dataset
nltk.download('movie_reviews')

# Load data
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle data
random.shuffle(documents)


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Shrushti\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [21]:
# Text Preprocessing
# Create a frequency distribution of all words
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]  # top 2000 words

def document_features(doc):
    words = set(doc)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in words)
    return features

featuresets = [(document_features(d), c) for (d, c) in documents]


In [23]:
# Train and Test the Model
# Train-Test Split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate
print("Accuracy:", accuracy(classifier, test_set))
classifier.show_most_informative_features(10)

Accuracy: 0.88
Most Informative Features
   contains(outstanding) = True              pos : neg    =     13.3 : 1.0
         contains(mulan) = True              pos : neg    =      7.7 : 1.0
        contains(seagal) = True              neg : pos    =      7.4 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
         contains(damon) = True              pos : neg    =      5.9 : 1.0
         contains(flynt) = True              pos : neg    =      5.7 : 1.0
         contains(awful) = True              neg : pos    =      5.7 : 1.0
        contains(wasted) = True              neg : pos    =      5.5 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.2 : 1.0
          contains(lame) = True              neg : pos    =      5.1 : 1.0
