# Loading the Data Set

In [135]:
import nltk
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

# Movie Reviews has 2 categories

In [136]:
movie_reviews.categories()

['neg', 'pos']

In [137]:
len(movie_reviews.fileids())

2000

# Access the file id of 5th document

In [138]:
movie_reviews.fileids()[5] 

'neg/cv005_29357.txt'

# Access word written in a review using the review's file id

In [139]:
movie_reviews.words(movie_reviews.fileids()[5]) ## accessing the 5th review and array of words is present

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [140]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category)) ## storing a tuple for each document
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [141]:
import random
random.shuffle(documents)
documents[0:5]

[(['shakespeare', '.', 'you', 'hardly', 'understood', ...], 'pos'),
 (['synopsis', ':', 'an', 'aging', 'master', 'art', ...], 'neg'),
 (['take', 'two', 'old', 'and', 'dying', 'men', ',', 'a', ...], 'neg'),
 (['in', 'chocolat', ',', 'a', 'chocolate', 'shop', ...], 'pos'),
 (['i', 'must', 'admit', 'that', 'i', 'was', 'a', 'tad', ...], 'pos')]

In [142]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [143]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J') :
        return wordnet.ADJ
    elif tag.startswith('V') :
        return wordnet.VERB
    elif tag.startswith('N') :
        return wordnet.NOUN
    elif tag.startswith('R') :
        return wordnet.ADV
    else :
        return wordnet.NOUN

# Cleaning stop words and lemmatizing to find the root word

In [144]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [145]:
from nltk import pos_tag
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            ## To lemmatize the word, we need to pass the pos, here from tuple we want the first entry
            clean_word = lemmatizer.lemmatize(w , pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [146]:
documents = [(clean_review(document),category) for document, category in documents]

# Splitting data into train and test

In [167]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

# Preparing data as NLTK classifier require data in this format
Format is array of tuples , where each tuple has a dictionary which is feature and feature value(T/F) and the category

In [168]:
# Example
a = [1,2]
b = [3,4]
a += b
a

[1, 2, 3, 4]

In [169]:
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [103]:
import nltk

In [170]:
freq = nltk.FreqDist(all_words) ## gives a frequency distribution object
common = freq.most_common(3000)

features = [i[0] for i in common]

In [173]:
def get_features_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features :
        current_features[w] = w in words_set
    return current_features

In [174]:
training_data = [(get_features_dict(doc),category) for doc, category in training_documents ] ## array of tuple of dict and category
testing_data = [(get_features_dict(doc),category) for doc, category in testing_documents ]

# NLTK 

In [175]:
from nltk import NaiveBayesClassifier

In [176]:
classfier  = NaiveBayesClassifier.train(training_data)

In [177]:
nltk.classify.accuracy(classfier, testing_data)

0.814

In [178]:
classfier.show_most_informative_features()

Most Informative Features
             outstanding = True              pos : neg    =     13.7 : 1.0
               ludicrous = True              neg : pos    =     12.0 : 1.0
                 idiotic = True              neg : pos    =     11.6 : 1.0
                  seagal = True              neg : pos    =     10.9 : 1.0
                  sloppy = True              neg : pos    =     10.8 : 1.0
              schumacher = True              neg : pos    =     10.2 : 1.0
                   anger = True              pos : neg    =      9.1 : 1.0
                    coen = True              pos : neg    =      8.5 : 1.0
            breathtaking = True              pos : neg    =      8.3 : 1.0
             magnificent = True              pos : neg    =      7.4 : 1.0


# SKLearn

In [179]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [180]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [181]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))>

In [182]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.846

In [183]:
from sklearn.ensemble import RandomForestClassifier

In [184]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)

In [185]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False))>

In [186]:
nltk.classify.accuracy(classifier_sklearn1, testing_data)

0.832