In [1]:
import nltk,random
from nltk.corpus import movie_reviews

Loading classifiers

In [2]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

Loading modules for text cleaning

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [4]:
# Text-normalisation helpers shared by the cells below.
stop_words = set(stopwords.words("english"))  # a set gives fast membership tests
lemmatiser = WordNetLemmatizer()
stemmer = PorterStemmer()

-----------------------------

Converting the movie reviews corpus into documents

In [5]:
# One (token_list, label) pair per review file, grouped by category.
documents = [
    (list(movie_reviews.words(fileids=file_id)), label)
    for label in movie_reviews.categories()
    for file_id in movie_reviews.fileids(categories=label)
]

In [6]:
print(len(documents) == len(movie_reviews.fileids()))

True


In [7]:
# Seed the global RNG first so the shuffle -- and therefore the train/test
# split and every accuracy derived from it -- is the same on each
# Restart & Run All.
random.seed(42)
random.shuffle(documents)

In [8]:
# Materialise every token of the corpus into a plain list.
all_words = list(movie_reviews.words())

Extracting keywords from the bag of words, to be used as features for the entire corpus

In [9]:
# normalize, remove stopwords, lemmatize, pos tagging
def extract_keywords(word_list):
    """Return the adjectives and adverbs found in ``word_list``.

    Pipeline: keep alphabetic tokens, lowercase them, drop English stop
    words, lemmatize, POS-tag, and finally keep the tokens tagged ``JJ``
    (adjective) or ``RB`` (adverb).

    Args:
        word_list: iterable of string tokens.

    Returns:
        list of lowercased, lemmatized tokens whose tag is ``JJ`` or ``RB``,
        in input order with duplicates kept.
    """
    # Lowercase BEFORE the stop-word test. NLTK's English stop-word list is
    # lowercase, so testing the raw token would let capitalised stop words
    # ("The", "This", ...) from free-text input slip through.
    lowered = (w.lower() for w in word_list if w.isalpha())
    kept = [w for w in lowered if w not in stop_words]
    lemmas = [lemmatiser.lemmatize(w) for w in kept]

    # selecting adjectives and adverbs
    return [token for token, tag in nltk.pos_tag(lemmas) if tag in ("JJ", "RB")]

In [10]:
# Candidate feature words for the whole corpus: every keyword that
# extract_keywords returns for all_words (duplicates are kept).
feature_keywords = extract_keywords(all_words)

In [11]:
print(feature_keywords[:100])

['teen', 'drive', 'accident', 'nightmare', 'critique', 'fuck', 'teen', 'touch', 'cool', 'bad', 'even', 'generally', 'highway', 'memento', 'good', 'bad', 'type', 'pretty', 'neat', 'terribly', 'well', 'main', 'simply', 'normal', 'fantasy', 'dream', 'back', 'dead', 'dead', 'strange', 'looooot', 'chase', 'weird', 'simply', 'personally', 'unravel', 'obviously', 'big', 'secret', 'want', 'completely', 'final', 'even', 'meantime', 'really', 'sad', 'actually', 'half', 'strangeness', 'start', 'little', 'still', 'guess', 'bottom', 'always', 'sure', 'even', 'secret', 'mean', 'melissa', 'away', 'lazy', 'okay', 'really', 'need', 'u', 'different', 'insight', 'apparently', 'away', 'decent', 'teen', 'fuck', 'somewhere', 'guess', 'little', 'pretty', 'good', 'exact', 'character', 'american', 'new', 'entire', 'actually', 'overall', 'rarely', 'pretty', 'redundant', 'pretty', 'cool', 'oh', 'apparently', 'still', 'hot', 'also', 'ever', 'skip', 'nightmare', 'elm', 'highway']


In [12]:
# Count how often each extracted keyword occurs.
feature_keywords_freq_dist = nltk.FreqDist(feature_keywords)

In [13]:
print(feature_keywords_freq_dist.most_common(100))

[('even', 2568), ('good', 2388), ('much', 2045), ('also', 1967), ('well', 1895), ('first', 1828), ('really', 1558), ('little', 1492), ('bad', 1395), ('never', 1374), ('new', 1292), ('many', 1268), ('great', 1150), ('u', 1072), ('big', 1064), ('still', 1053), ('however', 989), ('back', 935), ('real', 915), ('enough', 902), ('old', 887), ('last', 852), ('actually', 837), ('long', 835), ('almost', 820), ('ever', 776), ('funny', 750), ('young', 743), ('right', 735), ('original', 705), ('quite', 649), ('far', 635), ('high', 631), ('rather', 621), ('american', 608), ('yet', 605), ('always', 586), ('special', 572), ('hard', 569), ('instead', 565), ('black', 542), ('probably', 539), ('human', 538), ('away', 531), ('together', 521), ('pretty', 510), ('sure', 491), ('whole', 482), ('perhaps', 464), ('second', 457), ('especially', 456), ('completely', 440), ('different', 430), ('small', 429), ('simply', 428), ('several', 419), ('give', 411), ('true', 410), ('entire', 408), ('dead', 408), ('soon',

In [14]:
# Every distinct keyword, ordered from most to least frequent.
word_features = [
    keyword
    for keyword, _count in feature_keywords_freq_dist.most_common()
]

In [15]:
print(word_features[:100])

['even', 'good', 'much', 'also', 'well', 'first', 'really', 'little', 'bad', 'never', 'new', 'many', 'great', 'u', 'big', 'still', 'however', 'back', 'real', 'enough', 'old', 'last', 'actually', 'long', 'almost', 'ever', 'funny', 'young', 'right', 'original', 'quite', 'far', 'high', 'rather', 'american', 'yet', 'always', 'special', 'hard', 'instead', 'black', 'probably', 'human', 'away', 'together', 'pretty', 'sure', 'whole', 'perhaps', 'second', 'especially', 'completely', 'different', 'small', 'simply', 'several', 'give', 'true', 'entire', 'dead', 'soon', 'main', 'comic', 'else', 'final', 'unfortunately', 'wrong', 'next', 'full', 'often', 'alien', 'certainly', 'finally', 'interesting', 'maybe', 'able', 'later', 'top', 'nice', 'open', 'white', 'classic', 'short', 'screen', 'evil', 'nearly', 'early', 'major', 'exactly', 'close', 'obvious', 'already', 'deep', 'beautiful', 'live', 'perfect', 'sometimes', 'strong', 'quickly', 'truly']


Finding features for a given document

In [16]:
def find_features(document):
    """Map every entry of ``word_features`` to whether it occurs in ``document``.

    ``document`` is a list of word tokens (the first element of a
    ``documents`` tuple). It is passed through ``extract_keywords`` and
    de-duplicated into a set, so only presence matters, not frequency.

    Returns a dict ``{feature_word: bool}`` with one key per entry of
    ``word_features``.
    """
    present = set(extract_keywords(document))
    return {feature: (feature in present) for feature in word_features}

Creating labelled featureset for each document

In [17]:
# Pair each review's boolean feature dict with its category label.
featuresets = [
    (find_features(tokens), label)
    for tokens, label in documents
]

In [18]:
print(len(featuresets) == len(movie_reviews.fileids()))

True


In [19]:
# random.sample needs a sequence; a dict items view is rejected with a
# TypeError on Python 3.11+, so copy it into a list first.
random.sample(list(featuresets[0][0].items()), 2)

[('retracing', False), ('rant', False)]

In [20]:
featuresets[0][1]

'neg'

--------------

In [21]:
def cutoff(split=0.75):
    """Return the index that divides ``featuresets`` into train and test parts.

    ``split`` is the fraction of feature sets that fall before the index
    (0.75 by default); the product is truncated to an int.
    """
    total = len(featuresets)
    return int(total * split)

In [22]:
# Everything before the cutoff index (the first 75% with the default split).
training_set = featuresets[:cutoff()]

In [23]:
# Everything from the cutoff index onward (the remaining 25% with the default split).
testing_set = featuresets[cutoff():]

In [24]:
# training various models

def _train_and_report(label, estimator):
    """Wrap a scikit-learn estimator, train it, print its test accuracy, return it.

    Args:
        label: text printed in front of " accuracy percent:".
        estimator: an unfitted scikit-learn estimator instance.

    Returns:
        The trained ``SklearnClassifier`` wrapper around ``estimator``.
    """
    classifier = SklearnClassifier(estimator)
    classifier.train(training_set)
    print(label + " accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
    return classifier


# NLTK's own Naive Bayes is trained through a class method rather than the
# SklearnClassifier wrapper, so it is handled separately.
NB_classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(NB_classifier, testing_set))*100)
NB_classifier.show_most_informative_features(25)

# Each classifier keeps its own module-level name because later cells
# reference these variables directly.
MNB_classifier = _train_and_report("MNB_classifier", MultinomialNB())

BernoulliNB_classifier = _train_and_report("BernoulliNB_classifier", BernoulliNB())

LogisticRegression_classifier = _train_and_report("LogisticRegression_classifier", LogisticRegression())

SGDClassifier_classifier = _train_and_report("SGDClassifier_classifier", SGDClassifier())

SVC_classifier = _train_and_report("SVC_classifier", SVC())

LinearSVC_classifier = _train_and_report("LinearSVC_classifier", LinearSVC())

NuSVC_classifier = _train_and_report("NuSVC_classifier", NuSVC())

Original Naive Bayes Algo accuracy percent: 77.8
Most Informative Features
                   inept = True              neg : pos    =     14.6 : 1.0
               marvelous = True              pos : neg    =     11.5 : 1.0
                flawless = True              pos : neg    =     10.2 : 1.0
                  annual = True              pos : neg    =      9.5 : 1.0
               ludicrous = True              neg : pos    =      9.4 : 1.0
            inexplicably = True              neg : pos    =      9.1 : 1.0
                   oscar = True              pos : neg    =      8.9 : 1.0
                 idiotic = True              neg : pos    =      8.6 : 1.0
              derivative = True              neg : pos    =      8.5 : 1.0
             outstanding = True              pos : neg    =      8.3 : 1.0
                      na = True              pos : neg    =      8.2 : 1.0
                   elite = True              pos : neg    =      8.2 : 1.0
                thematic 

-----------

* Creating a VoteClassifier, which is basically a voting system. An odd number of classifiers is used so that a two-class vote cannot tie.
* Each algorithm gets one vote, and the classification that has the most votes is the chosen one.

In [25]:
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over already-trained classifiers.

    Every wrapped classifier casts one vote (the result of its
    ``classify`` call) and the most common label wins.

    NOTE(review): with an even number of voters a tie is possible, and how
    ``statistics.mode`` resolves a tie depends on the Python version --
    confirm before using an even number of classifiers.
    """

    def __init__(self, *classifiers):
        """Store the voters; each one must provide ``classify(features)``."""
        self._classifiers = classifiers

    def _votes(self, features):
        """Return the label predicted by each wrapped classifier, in order."""
        return [c.classify(features) for c in self._classifiers]

    def classify(self, features):
        """Return the label that received the most votes for ``features``."""
        return mode(self._votes(features))

    def confidence(self, features):
        """Return the fraction of votes that went to the winning label.

        For example 3 of 5 votes gives 0.6, which is weaker than 5 of 5 (1.0).
        """
        votes = self._votes(features)
        choice_votes = votes.count(mode(votes))
        return choice_votes / len(votes)

In [26]:
# Build the ensemble from seven of the trained classifiers (SVC_classifier
# is trained above but is not passed in here).
voted_classifier = VoteClassifier(NB_classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

# Accuracy of the majority vote on the held-out testing_set, as a percentage.
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

voted_classifier accuracy percent: 79.2


### Examples for the confidence percent

In [27]:
# random.sample needs a sequence; a dict items view is rejected with a
# TypeError on Python 3.11+, so copy it into a list first.
print(random.sample(list(testing_set[0][0].items()), 15))

[('kenneth', False), ('however', False), ('tarkovskian', False), ('generously', False), ('sabbatical', False), ('plotline', False), ('countryish', False), ('jamileh', False), ('armstrong', False), ('applegate', False), ('actually', False), ('english', False), ('australia', False), ('serum', False), ('vocally', False)]


In [28]:
# random.sample needs a sequence; a dict items view is rejected with a
# TypeError on Python 3.11+, so copy it into a list first.
print(random.sample(list(testing_set[1][0].items()), 15))

[('pacific', False), ('grungy', False), ('convent', False), ('undermining', False), ('scamper', False), ('actual', False), ('exemplified', False), ('wholly', False), ('hideous', False), ('form', False), ('disappointingly', False), ('tourist', False), ('continue', False), ('provocatively', False), ('genetic', False)]


In [29]:
# Show the ensemble's label and its share of the votes for the first six
# items of the test set (one loop instead of six copy-pasted statements).
for sample_features, _true_label in testing_set[:6]:
    print("Classification:", voted_classifier.classify(sample_features), "Confidence %:",voted_classifier.confidence(sample_features)*100)

Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 85.71428571428571
Classification: pos Confidence %: 57.14285714285714
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 85.71428571428571


# Examples

In [30]:
def classify_review(review_input):
    """Return a sentiment label for the raw review text ``review_input``.

    The text is tokenized with ``word_tokenize``, turned into a feature
    dict by ``find_features`` and classified by ``voted_classifier``.

    Returns "Positive Sentiment" when the vote is 'pos', otherwise
    "Negative Sentiment".
    """
    tokens = list(word_tokenize(review_input))
    predicted = voted_classifier.classify(features=find_features(tokens))
    return "Positive Sentiment" if predicted == 'pos' else "Negative Sentiment"

In [31]:
input1 = input("Please enter the review paragraphs here : \n\n")

Please enter the review paragraphs here : 

The last few Scorsese pics left me a little disappointed. I had begun to think Marty had become a 'gun for hire' and that his brilliance may have been spent (his earlier works were some of the best movies ever made). I attended a screening of The Wolf of Wall Street this evening, and was expecting to be unimpressed. I am happy to say I was completely blown away. This pic is Marty at his best. I laughed, I cringed, I related (with fond memories as well as a bit of guilt) and I TOTALLY believed every unbelievable moment. A good book, a great screenplay and a delightful cast were formed and molded into what I believe should get Scorsese a best director Oscar, and likely a Best Picture Award for the movie. Leo DiCaprio has grown into a versatile actor and his creation of this super hero dirtbag's roller coaster ride in this crazy (true) story is really honest and delightfully entertaining. Jonah Hill pulled out all the stops too and this is defin

In [32]:
classify_review(input1)

'Positive Sentiment'

In [33]:
input2 = input("Please enter the review paragraphs here : \n\n")

Please enter the review paragraphs here : 

It's hard to find the words to explain how TRULY AWFUL this film is. I'll try to do a list:  1) There's no context: They never show the victims of the fraud. We see the sales effort but not the people they're selling to. How can you do a movie about people perpetrating a fraud without showing the fraud & its effects???   2) There's no character development: They all start out as disgusting creeps and they all end up being disgusting creeps.   3) There's not much of a plot: It's 2 and 1/2 hours of debauchery and then 1/2 hour of getting caught. The debauchery part goes on forever and gets boring really fast. Not to mention disturbing & disgusting. Did Scorsese really make this movie just to show all this debauchery? What's the point of showing 2 and 1/2 hours of it? We get the point that they are gross lunatics pretty fast. Why keep going with seemingly endless variations of it? There is no point to it.   So, when all is said & done, this is b

In [34]:
classify_review(input2)

'Negative Sentiment'