# Step 1: Data Preparation and Preprocessing

In [3]:
import nltk
nltk.download('movie_reviews')

from nltk.corpus import movie_reviews
import random

# Single seed shared by the shuffle and the train/test split so that
# Restart Kernel -> Run All reproduces the same partition and metrics.
RANDOM_SEED = 42

# Load the NLTK movie_reviews corpus as (token_list, category) pairs,
# one pair per file, iterating category by category.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator. An unseeded random.shuffle
# would feed a different ordering into train_test_split on every run, so
# random_state alone could not make the split repeatable. Using a private
# random.Random instance also leaves the global RNG state untouched.
random.Random(RANDOM_SEED).shuffle(documents)

# Separate into features (X) and labels (y). Each token list is re-joined
# with single spaces so the vectorizer receives one string per document.
X = [' '.join(tokens) for tokens, _ in documents]
y = [label for _, label in documents]

# Hold out 20% of the documents as the test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED)


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\HARIKA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


# Step 2: Feature Extraction with TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer whose vocabulary is capped at 5000 features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are then projected with that already-fitted vectorizer.
# Tuple elements are evaluated left to right, so the fit happens first.
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)


# Step 3: Model Training - Multinomial Naive Bayes Classifier

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Build the Multinomial Naive Bayes classifier and fit it on the TF-IDF
# training matrix in one step (fit returns the fitted estimator itself).
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out test matrix
y_pred = nb_classifier.predict(X_test_tfidf)

# Report overall accuracy, then the per-class precision/recall/F1 table
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(test_report)


Accuracy: 0.8175
              precision    recall  f1-score   support

         neg       0.76      0.90      0.82       187
         pos       0.89      0.75      0.81       213

    accuracy                           0.82       400
   macro avg       0.83      0.82      0.82       400
weighted avg       0.83      0.82      0.82       400

