<h1> Movie review classification with NLTK </h1>

In [1]:
import random
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
cats = movie_reviews.categories()
reviews = []
for cat in cats:
    for fid in movie_reviews.fileids(cat):
        review = (list(movie_reviews.words(fid)),cat)
        reviews.append(review)
random.shuffle(reviews)

In [3]:
all_wd_in_reviews = nltk.FreqDist(wd.lower() for wd in movie_reviews.words())
top_wd_in_reviews = [list(wds) for wds in zip(*all_wd_in_reviews.most_common(2000))][0]

In [4]:
def ext_ft(review,top_words):
    review_wds = set(review)
    ft = {}
    for wd in top_words:
        ft['word_present({})'.format(wd)] = (wd in review_wds)
    return ft

In [5]:
featuresets = [(ext_ft(d,top_wd_in_reviews), c) for (d,c) in reviews]
train_set, test_set = featuresets[200:], featuresets[:200]

In [6]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.82


In [7]:
classifier.show_most_informative_features(10)

Most Informative Features
    word_present(seagal) = True              neg : pos    =     12.9 : 1.0
word_present(outstanding) = True              pos : neg    =     10.2 : 1.0
     word_present(mulan) = True              pos : neg    =      7.0 : 1.0
word_present(wonderfully) = True              pos : neg    =      6.5 : 1.0
     word_present(damon) = True              pos : neg    =      5.7 : 1.0
word_present(ridiculous) = True              neg : pos    =      5.6 : 1.0
     word_present(awful) = True              neg : pos    =      5.6 : 1.0
      word_present(lame) = True              neg : pos    =      5.5 : 1.0
       word_present(era) = True              pos : neg    =      5.4 : 1.0
     word_present(waste) = True              neg : pos    =      5.3 : 1.0


In [8]:
d_vect=None
def get_train_test(tr_set,te_set):
    global d_vect
    d_vect = DictVectorizer(sparse=False)
    X_tr, y_tr = zip(*tr_set)
    X_tr = d_vect.fit_transform(X_tr)
    X_te,y_te = zip(*te_set)
    X_te = d_vect.transform(X_te)
    return X_tr,X_te,y_tr,y_te

In [9]:
X_train,X_test,y_train,y_test = get_train_test(train_set,test_set)
rf = RandomForestClassifier(n_estimators=100,n_jobs=4,random_state=10)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [10]:
preds = rf.predict(X_test)
print(accuracy_score(y_test,preds))

0.825


In [11]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
all_words_in_reviews = nltk.FreqDist(word.lower() for word in movie_reviews.words() if word not in stopwords_list)
top_words_in_reviews = [list(words) for words in zip(*all_words_in_reviews.most_common(2000))][0]

In [12]:
featuresets = [(ext_ft(d,top_words_in_reviews), c) for (d,c) in reviews]
train_set, test_set = featuresets[200:], featuresets[:200]
X_train,X_test,y_train,y_test = get_train_test(train_set,test_set)

In [13]:
rf = RandomForestClassifier(n_estimators=100,n_jobs=4,random_state=10)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [14]:
preds = rf.predict(X_test)
print(accuracy_score(y_test,preds))

0.855


In [15]:
features_list = zip(d_vect.get_feature_names(),rf.feature_importances_)
features_list = sorted(features_list, key=lambda x: x[1], reverse=True)
print(features_list[0:20])

[('word_present(bad)', 0.012709101728574842), ('word_present(worst)', 0.007818498310639065), ('word_present(boring)', 0.005936688132114715), ('word_present(stupid)', 0.005494943898958303), ('word_present(waste)', 0.005091693110187572), ('word_present(awful)', 0.004958501019474184), ('word_present(life)', 0.004823972835890037), ('word_present(mess)', 0.004702269754432544), ('word_present(plot)', 0.004651200149051172), ('word_present(ridiculous)', 0.004302553379352463), ('word_present(lame)', 0.004211512878104104), ('word_present(wasted)', 0.003801230103927904), ('word_present(perfectly)', 0.003652949879118064), ('word_present(supposed)', 0.0036088721850700816), ('word_present(excellent)', 0.0034587221482947376), ('word_present(script)', 0.0033971166771779187), ('word_present(dull)', 0.0033936795346628922), ('word_present(great)', 0.0031970652208558547), ('word_present(?)', 0.0031860349055431846), ('word_present(outstanding)', 0.003175814254058592)]
