In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier

In [2]:
# Tiny hand-labelled corpus: two example sentences per sentiment class.
positive_documents = [
    "The movie was great and highly enjoyable.",
    "I loved the book; it was fantastic.",
]
negative_documents = [
    "The concert was terrible and disappointing.",
    "The service at the restaurant was awful.",
]

# Pair every text with its label: all positives first, then all negatives.
documents = [
    (text, label)
    for label, texts in (("Positive", positive_documents), ("Negative", negative_documents))
    for text in texts
]

In [6]:
# Fetch the tokenizer model and the stop-word lists used below.
nltk.download('punkt')
nltk.download('stopwords')
# NOTE(review): newer NLTK releases may additionally require the 'punkt_tab'
# resource for word_tokenize — confirm against the installed NLTK version.

# Upper bound on how many of the most frequent words become features.
MAX_WORD_FEATURES = 100

# Load the English stop words once. stopwords.words() returns a list, so
# calling it per token (as this cell used to) repeats the load and does a
# linear membership scan every time; a set built up front avoids both.
english_stopwords = set(stopwords.words('english'))

# Tokenize and preprocess the documents: keep purely alphabetic tokens,
# lower-case them, and drop English stop words.
all_words = []
for document, sentiment in documents:
    words = [
        token.lower()
        for token in word_tokenize(document)
        if token.isalpha() and token.lower() not in english_stopwords
    ]
    all_words.extend(words)

# Extract the most common words as features. most_common() yields
# (word, count) pairs; only the words are kept, in frequency order.
word_features = [
    word for word, _count in FreqDist(all_words).most_common(MAX_WORD_FEATURES)
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
def document_features(document):
    """Turn a raw text into a dict of boolean ``contains(<word>)`` features.

    The text is lower-cased and tokenized, and the distinct tokens are
    collected into a set. For every entry of the notebook-level
    ``word_features`` list, the result holds one key ``'contains(<word>)'``
    whose value is True when that word is among the document's tokens.

    Args:
        document: The text to featurize, as a string.

    Returns:
        A dict mapping ``'contains(<word>)'`` to a bool, with one entry per
        word in ``word_features``.
    """
    tokens_present = set(word_tokenize(document.lower()))
    return {
        'contains({})'.format(term): (term in tokens_present)
        for term in word_features
    }

# Featurize every labelled document, giving one (feature dict, label) pair
# per entry of `documents`, in the same order.
feature_sets = [
    (document_features(text), label)
    for text, label in documents
]

In [10]:
# Fit a Naive Bayes classifier on the labelled feature dicts in `feature_sets`.
classifier = NaiveBayesClassifier.train(feature_sets)

In [11]:
# Unseen example text to classify.
new_document = "The movie was terrible and boring."

# Featurize it the same way as the training documents: document_features
# lower-cases and tokenizes the text, then flags which words from
# `word_features` occur in it. Words outside that vocabulary (such as
# "boring" here, which appears in none of the training sentences) produce
# no feature at all.
new_features = document_features(new_document)

# Ask the trained classifier for the most likely label and report it.
classification = classifier.classify(new_features)
print("Document sentiment:", classification)

Document sentiment: Negative
