In [2]:
import nltk
from nltk.corpus import stopwords, movie_reviews
from nltk.tokenize import word_tokenize
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [26]:
nltk.download('stopwords')
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\U765123\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\U765123\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\U765123\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)
stop_words = set(stopwords.words('english'))


In [17]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(tokens)

In [18]:
documents = [(preprocess_text(' '.join(doc)), category) for doc, category in documents]

In [19]:
# Separate the data and labels
texts, labels = zip(*documents)

In [20]:
# this is a bag of words cassifier
vectorizer = CountVectorizer(max_features=2000)
X = vectorizer.fit_transform(texts)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=42)


In [22]:
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [23]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 82.20%


In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.82      0.83      0.82       252
         pos       0.82      0.82      0.82       248

    accuracy                           0.82       500
   macro avg       0.82      0.82      0.82       500
weighted avg       0.82      0.82      0.82       500



In [25]:
# Function to predict sentiment
def predict_sentiment(text):
    preprocessed_text = preprocess_text(text)
    text_features = vectorizer.transform([preprocessed_text])
    sentiment = model.predict(text_features)
    return sentiment[0]

# Test the function
sample_text = "This movie was an absolute masterpiece with brilliant acting and storyline."
print(f"Sentiment: {predict_sentiment(sample_text)}")

Sentiment: pos
