In [13]:
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# Fetch every NLTK resource the notebook relies on:
#   movie_reviews - the labelled review corpus
#   stopwords     - English stop-word list used during preprocessing
#   punkt         - tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab     - tokenizer tables that newer NLTK releases load instead of
#                   'punkt'; without it word_tokenize raises LookupError there
#   wordnet, omw-1.4 - data behind WordNetLemmatizer
# nltk.download skips packages that are already up to date.
for resource in ('movie_reviews', 'stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [14]:
# Pair each review's raw text with its category label. The label is the
# first path component of the corpus file id (the part before the '/').
reviews = [
    (movie_reviews.raw(file_id), file_id.split('/')[0])
    for file_id in movie_reviews.fileids()
]

    

# Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words('english'))

def preprocess(text):
    """Normalise one review into a space-separated string of lemmas.

    Steps: lowercase, tokenize, keep purely alphabetic tokens, drop English
    stop words, lemmatize what is left.

    Args:
        text: Raw review text.

    Returns:
        A single string of the surviving lemmas joined by spaces (empty
        string when nothing survives).
    """
    tokens = word_tokenize(text.lower())
    # Stop words must be removed BEFORE lemmatizing: the default (noun)
    # lemmatizer mangles some of them (e.g. 'was' -> 'wa', 'has' -> 'ha'),
    # and the mangled forms no longer match the stop list.
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )
    # Keep the post-lemmatization check too, so a lemma that happens to be
    # a stop word is still dropped.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

# Clean each review's text; its label is carried along unchanged.
preprocessed_reviews = [
    (preprocess(raw_text), label)
    for raw_text, label in reviews
]

In [15]:
import random

# Fixed seed so the train/test split (and therefore the reported accuracy)
# is the same on every Restart & Run All.
SEED = 42
TRAIN_FRACTION = 0.8

# Random.sample with k == len(...) returns a shuffled COPY, so
# preprocessed_reviews itself is left untouched and re-running this cell
# always yields the same split.
shuffled_reviews = random.Random(SEED).sample(
    preprocessed_reviews, k=len(preprocessed_reviews)
)

train_size = int(TRAIN_FRACTION * len(shuffled_reviews))
train_reviews = shuffled_reviews[:train_size]
test_reviews = shuffled_reviews[train_size:]

In [16]:
# Split each (text, label) pair list into parallel text and label lists.
train_texts = [text for text, _ in train_reviews]
test_texts = [text for text, _ in test_reviews]
y_train = [label for _, label in train_reviews]
y_test = [label for _, label in test_reviews]

# Fit the TF-IDF vectorizer on the training texts only, then apply the
# fitted vectorizer to the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)


In [17]:
# Fit a multinomial Naive Bayes classifier on the training features
# (fit returns the estimator itself, so it can be chained).
clf = MultinomialNB().fit(X_train, y_train)

# Score predictions on the held-out split against its labels.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.80
