In [None]:
import random
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Fetch the movie_reviews corpus that later cells read via nltk.corpus.movie_reviews.
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
# Build one (tokens, label) pair per review file; the label is the corpus
# category the file is listed under.
cats = movie_reviews.categories()
reviews = [
    (list(movie_reviews.words(fid)), cat)
    for cat in cats
    for fid in movie_reviews.fileids(cat)
]
# Shuffle with a dedicated, seeded generator so the train/test split that is
# later sliced from this list is identical on every Restart & Run All.
random.Random(42).shuffle(reviews)


In [None]:
# Display the category labels returned by the corpus reader.
cats

['neg', 'pos']

In [None]:
# Case-folded frequency table over every token in the corpus.
all_wd_in_reviews = nltk.FreqDist(wd.lower() for wd in movie_reviews.words())
# Feature vocabulary: the 2000 most frequent tokens, most frequent first.
# most_common() yields (token, count) pairs; only the token is kept.
# NOTE: nothing is filtered here, so punctuation and stop words are included.
top_wd_in_reviews = [wd for wd, _count in all_wd_in_reviews.most_common(2000)]


In [None]:
# Inspect the selected vocabulary (up to 2000 entries, so the output is long).
top_wd_in_reviews

[',',
 'the',
 '.',
 'a',
 'and',
 'of',
 'to',
 "'",
 'is',
 'in',
 's',
 '"',
 'it',
 'that',
 '-',
 ')',
 '(',
 'as',
 'with',
 'for',
 'his',
 'this',
 'film',
 'i',
 'he',
 'but',
 'on',
 'are',
 't',
 'by',
 'be',
 'one',
 'movie',
 'an',
 'who',
 'not',
 'you',
 'from',
 'at',
 'was',
 'have',
 'they',
 'has',
 'her',
 'all',
 '?',
 'there',
 'like',
 'so',
 'out',
 'about',
 'up',
 'more',
 'what',
 'when',
 'which',
 'or',
 'she',
 'their',
 ':',
 'some',
 'just',
 'can',
 'if',
 'we',
 'him',
 'into',
 'even',
 'only',
 'than',
 'no',
 'good',
 'time',
 'most',
 'its',
 'will',
 'story',
 'would',
 'been',
 'much',
 'character',
 'also',
 'get',
 'other',
 'do',
 'two',
 'well',
 'them',
 'very',
 'characters',
 ';',
 'first',
 '--',
 'after',
 'see',
 '!',
 'way',
 'because',
 'make',
 'life',
 'off',
 'too',
 'any',
 'does',
 'really',
 'had',
 'while',
 'films',
 'how',
 'plot',
 'little',
 'where',
 'people',
 'over',
 'could',
 'then',
 'me',
 'scene',
 'man',
 'bad',
 '

In [None]:
def ext_ft(review, top_words):
    """Build a word-presence feature dict for a single review.

    Args:
        review: Iterable of hashable tokens that make up the review.
        top_words: Iterable of vocabulary words to look for.

    Returns:
        A dict with one ``'word_present(<word>)'`` key per entry of
        ``top_words``, set to ``True`` when that word occurs in ``review``
        and ``False`` otherwise.
    """
    # Hash the review tokens once so each vocabulary lookup is a set probe.
    seen = frozenset(review)
    return {'word_present({})'.format(word): word in seen for word in top_words}


In [None]:
# NOTE(review): this cell repeats the ext_ft definition from the cell directly
# above it; one of the two cells can be deleted.
def ext_ft(review, top_words):
    """Report, for every vocabulary word, whether the review contains it.

    Args:
        review: Iterable of hashable tokens from one review.
        top_words: Iterable of words to test for membership.

    Returns:
        Dict keyed by ``'word_present(<word>)'`` with boolean values, in the
        order the words appear in ``top_words``.
    """
    tokens_in_review = set(review)
    key_template = 'word_present({})'
    return dict(
        (key_template.format(word), word in tokens_in_review)
        for word in top_words
    )


In [None]:
# Turn every (tokens, label) review into a (feature dict, label) pair.
featuresets = [
    (ext_ft(tokens, top_wd_in_reviews), label)
    for tokens, label in reviews
]
# Hold out the first 200 examples for testing; train on everything after them.
test_set = featuresets[:200]
train_set = featuresets[200:]


In [None]:
# Fit NLTK's Naive Bayes classifier on the training feature sets.
nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
# Evaluate the model trained on the line above against the held-out set.
# (No `classifier` variable is defined by any cell, so scoring must use
# nb_classifier or the cell fails with NameError on a fresh kernel.)
print(nltk.classify.accuracy(nb_classifier, test_set))


0.825


In [None]:
# Inspect the 30 features the Naive Bayes model considers most informative.
# NOTE(review): the recorded output below is a classifier repr rather than a
# feature listing — it looks stale; re-run this cell to refresh it.
nb_classifier.show_most_informative_features(30)

<nltk.classify.naivebayes.NaiveBayesClassifier at 0x7d70e9e0cbb0>

In [None]:
# scikit-learn estimators need a numeric matrix, so separate the feature dicts
# from their labels and let DictVectorizer encode the dicts as dense rows.
dict_vectorizer = DictVectorizer(sparse=False)
train_dicts, y_train = zip(*train_set)
test_dicts, y_test = zip(*test_set)
# Learn the feature -> column mapping from the training data only, then apply
# that same mapping to the test data.
X_train = dict_vectorizer.fit_transform(train_dicts)
X_test = dict_vectorizer.transform(test_dicts)

# Fixed random_state so that, for a given train/test split, the fitted forest
# and the accuracy printed below are the same on every run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
preds = rf_classifier.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, preds)}")

Random Forest Accuracy: 0.83


In [None]:
def classify_review(review_text, classifier, vectorizer=None):
    """Predict the sentiment label of a raw review string.

    Args:
        review_text: The review as plain text.
        classifier: A trained model. When ``vectorizer`` is given it must
            expose ``predict`` (scikit-learn style); otherwise it must expose
            ``classify`` (NLTK style).
        vectorizer: Optional fitted vectorizer whose ``transform`` turns a
            list of feature dicts into the matrix ``classifier`` expects.
            ``None`` (the default) selects the NLTK code path.

    Returns:
        The label predicted by ``classifier`` for the review.
    """
    # Tokenize like the training corpus: NLTK's plaintext corpus readers
    # default to the word/punctuation tokenizer, so "bad." must become
    # ["bad", "."] here too. A plain str.split() would leave punctuation glued
    # to words and those words would never match the vocabulary.
    review_words = nltk.wordpunct_tokenize(review_text.lower())
    review_features = ext_ft(review_words, top_wd_in_reviews)

    # Compare against None explicitly rather than relying on the truthiness of
    # an arbitrary vectorizer object.
    if vectorizer is None:
        return classifier.classify(review_features)

    feature_matrix = vectorizer.transform([review_features])
    # predict() returns one label per row; a single row was passed in.
    return classifier.predict(feature_matrix)[0]

In [None]:
# NOTE(review): leftover debugging cell. `classifier` and `review_features` are
# not assigned at module level by any cell shown in this notebook, and the
# recorded output is an AttributeError because NaiveBayesClassifier has no
# `predict` method (classify_review calls `.classify` for that model).
# Delete this cell so Restart & Run All succeeds.
prediction = classifier.predict(review_features)[0]

AttributeError: 'NaiveBayesClassifier' object has no attribute 'predict'

In [None]:
# Sample review text passed to the classifiers in the following cells.
user_review = "bad"


In [None]:
# Classify the sample review with the Naive Bayes model; no vectorizer is
# passed, so classify_review takes its `.classify` branch.
nb_prediction = classify_review(user_review, nb_classifier)
print(f"Naive Bayes Prediction: {nb_prediction}")

Naive Bayes Prediction: neg


In [None]:
# Classify the sample review with the random forest; the fitted DictVectorizer
# is passed so the feature dict is encoded the same way as the training data.
rf_prediction = classify_review(user_review, rf_classifier, dict_vectorizer)
print(f"Random Forest Prediction: {rf_prediction}")

Random Forest Prediction: neg


In [None]:
# Dump the full feature dict for the sample review (one entry per vocabulary
# word, so the output is very long).
# NOTE(review): the recorded output marks words such as 'and' and 'movie' as
# present, which cannot come from user_review = "bad" — it appears to be from
# an earlier run with a longer review; re-run to refresh.
print("Features extracted from the review:")
print(ext_ft(user_review.lower().split(), top_wd_in_reviews))

Features extracted from the review:
{'word_present(,)': False, 'word_present(the)': False, 'word_present(.)': False, 'word_present(a)': False, 'word_present(and)': True, 'word_present(of)': False, 'word_present(to)': False, "word_present(')": False, 'word_present(is)': False, 'word_present(in)': False, 'word_present(s)': False, 'word_present(")': False, 'word_present(it)': False, 'word_present(that)': False, 'word_present(-)': False, 'word_present())': False, 'word_present(()': False, 'word_present(as)': False, 'word_present(with)': True, 'word_present(for)': False, 'word_present(his)': False, 'word_present(this)': True, 'word_present(film)': False, 'word_present(i)': False, 'word_present(he)': False, 'word_present(but)': False, 'word_present(on)': False, 'word_present(are)': False, 'word_present(t)': False, 'word_present(by)': False, 'word_present(be)': False, 'word_present(one)': False, 'word_present(movie)': True, 'word_present(an)': True, 'word_present(who)': False, 'word_present(n

In [None]:

import random
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline


# Make sure the movie_reviews corpus is available locally before reading it.
nltk.download('movie_reviews')


# Collect one (text, label) pair per review file. The corpus tokens are joined
# with single spaces so the TF-IDF pipeline below receives plain strings.
cats = movie_reviews.categories()
reviews = [
    (" ".join(movie_reviews.words(fid)), cat)
    for cat in cats
    for fid in movie_reviews.fileids(cat)
]
# Shuffle with a dedicated, seeded generator so the split below is identical
# on every run.
random.Random(42).shuffle(reviews)

# The first 200 shuffled reviews are held out for testing; the rest train.
train_reviews, train_labels = zip(*reviews[200:])
test_reviews, test_labels = zip(*reviews[:200])

# TF-IDF features feeding a random forest, wrapped as a single estimator so
# raw review strings can be passed straight to fit() and predict().
# Fixed random_state so that, for a given split, the fitted forest and the
# accuracy printed below are the same on every run.
pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=42))

pipeline.fit(train_reviews, train_labels)

# Score on the held-out reviews.
preds = pipeline.predict(test_reviews)
print(f"Random Forest Accuracy: {accuracy_score(test_labels, preds)}")


# NOTE(review): an earlier cell defines classify_review(review_text, classifier,
# vectorizer=None); running this cell replaces it with a two-argument version.
def classify_review(review_text, model):
    """Return the label ``model`` predicts for a single raw review string.

    Args:
        review_text: The review as plain text.
        model: Object whose ``predict`` accepts a list of texts and returns
            an indexable sequence of labels.

    Returns:
        The first (and only) element of ``model.predict([review_text])``.
    """
    # predict() works on batches, so wrap the text in a one-item list and
    # unwrap the single result.
    return model.predict([review_text])[0]


# Free-text example review to run through the fitted TF-IDF + random forest pipeline.
user_review = "This movie was absolutely an horror, lags without any story and only screenplay made sense."

# The pipeline vectorizes the raw string itself, so no manual feature extraction is needed.
rf_prediction = classify_review(user_review, pipeline)
print(f"Random Forest Prediction: {rf_prediction}")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Random Forest Accuracy: 0.78
Random Forest Prediction: neg
