In [1]:
!pip install nltk




In [2]:
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import movie_reviews
import random


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Create a list of tuples (review words, sentiment)
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the data
random.shuffle(documents)


In [4]:
from nltk.corpus import stopwords
from nltk import FreqDist

# Get all words in the dataset
all_words = [w.lower() for w in movie_reviews.words() if w.isalpha() and w.lower() not in stopwords.words('english')]

# Frequency distribution of words
all_words_freq = FreqDist(all_words)

# Top 2000 most common words
word_features = list(all_words_freq)[:2000]

# Function to extract features
def document_features(document):
    words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in words)
    return features


In [5]:
# Extract features for all documents
featuresets = [(document_features(d), c) for (d, c) in documents]

# Split into training and test data
train_set, test_set = featuresets[100:], featuresets[:100]


In [6]:
from nltk import NaiveBayesClassifier, classify

# Train the classifier
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate the model
accuracy = classify.accuracy(classifier, test_set)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Show most informative features
classifier.show_most_informative_features(10)


Model Accuracy: 75.00%
Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.9 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
         contains(mulan) = True              pos : neg    =      7.7 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.0 : 1.0
         contains(awful) = True              neg : pos    =      5.9 : 1.0
        contains(wasted) = True              neg : pos    =      5.6 : 1.0
         contains(damon) = True              pos : neg    =      5.5 : 1.0
          contains(lame) = True              neg : pos    =      5.2 : 1.0
         contains(waste) = True              neg : pos    =      5.2 : 1.0
       contains(unfunny) = True              neg : pos    =      5.0 : 1.0
