In [1]:
import nltk, inspect, os, random, pickle
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, movie_reviews
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
# Shared NLP helpers used by the cells below.
# NOTE(review): `stemmer` is not referenced by any cell visible in this notebook — confirm whether it is still needed.
stemmer = PorterStemmer()
# Lemmatiser applied to every surviving token inside extract_keywords.
lemmatiser = WordNetLemmatizer()
# Held as a set because extract_keywords tests membership once per token.
stop_words = set(stopwords.words("english"))

In [3]:
# One (token list, category label) pair per review file, category by category.
documents = [
    (list(movie_reviews.words(fileids=fileid)), label)
    for label in movie_reviews.categories()
    for fileid in movie_reviews.fileids(categories=label)
]

In [4]:
# Shuffle with a fixed seed so the train/test split (and therefore the reported
# accuracy) is reproducible after Restart Kernel -> Run All. A private Random
# instance is used so the global `random` state is left untouched.
random.Random(42).shuffle(documents)

In [5]:
# Every token of the movie_reviews corpus, materialised as a plain list.
all_words = list(movie_reviews.words())

In [6]:
def extract_keywords(word_list):
    """Return the adjectives and adverbs found in ``word_list``.

    Steps, in order: keep purely alphabetic tokens, lower-case them, drop
    English stop words, lemmatise, POS-tag with ``nltk.pos_tag`` and keep the
    tokens tagged ``JJ`` (adjective) or ``RB`` (adverb).

    Parameters
    ----------
    word_list : iterable of str
        Tokens of a document (or of the whole corpus).

    Returns
    -------
    list of str
        Lower-cased, lemmatised keywords in their original order; duplicates
        are preserved so the result can be fed to a frequency count.
    """
    lowered = (w.lower() for w in word_list if w.isalpha())
    # Stop words must be filtered AFTER lower-casing: the NLTK stop-word list is
    # lower-case, so testing the raw token let "The", "It", "I", ... through
    # for free-text input. Already lower-case input is unaffected.
    content_words = [w for w in lowered if w not in stop_words]
    lemmas = [lemmatiser.lemmatize(w) for w in content_words]
    return [token for token, tag in nltk.pos_tag(lemmas) if tag in ("JJ", "RB")]

In [7]:
# Keyword candidates (JJ/RB-tagged lemmas) extracted from the full corpus token list.
feature_keywords = extract_keywords(all_words)

In [8]:
print(feature_keywords[:100])

['teen', 'drive', 'accident', 'nightmare', 'critique', 'fuck', 'teen', 'touch', 'cool', 'bad', 'even', 'generally', 'highway', 'memento', 'good', 'bad', 'type', 'pretty', 'neat', 'terribly', 'well', 'main', 'simply', 'normal', 'fantasy', 'dream', 'back', 'dead', 'dead', 'strange', 'looooot', 'chase', 'weird', 'simply', 'personally', 'unravel', 'obviously', 'big', 'secret', 'want', 'completely', 'final', 'even', 'meantime', 'really', 'sad', 'actually', 'half', 'strangeness', 'start', 'little', 'still', 'guess', 'bottom', 'always', 'sure', 'even', 'secret', 'mean', 'melissa', 'away', 'lazy', 'okay', 'really', 'need', 'u', 'different', 'insight', 'apparently', 'away', 'decent', 'teen', 'fuck', 'somewhere', 'guess', 'little', 'pretty', 'good', 'exact', 'character', 'american', 'new', 'entire', 'actually', 'overall', 'rarely', 'pretty', 'redundant', 'pretty', 'cool', 'oh', 'apparently', 'still', 'hot', 'also', 'ever', 'skip', 'nightmare', 'elm', 'highway']


In [9]:
# Occurrence count per keyword; its most_common() ordering is what word_features is built from.
feature_keywords_freq_dist = nltk.FreqDist(feature_keywords)

In [10]:
feature_keywords_freq_dist

FreqDist({'tempted': 1,
          'kinetic': 10,
          'lot': 41,
          'patti': 3,
          'garth': 5,
          'summary': 11,
          'inappropriately': 1,
          'mere': 45,
          'thematic': 17,
          'maltese': 1,
          'ensemble': 57,
          'back': 935,
          'though': 3,
          'navigate': 4,
          'blanchett': 3,
          'faye': 2,
          'overwight': 1,
          'embark': 4,
          'saw': 26,
          'theory': 7,
          'unethical': 1,
          'unchanged': 3,
          'fx': 4,
          'recreational': 1,
          'ungar': 3,
          'oavs': 1,
          'scripture': 2,
          'investigate': 21,
          'leviathan': 3,
          'debatable': 2,
          'stretched': 1,
          'occasion': 1,
          'inhospitable': 1,
          'grail': 1,
          'genital': 2,
          'saddened': 1,
          'dancer': 4,
          'desperatly': 1,
          'meier': 1,
          'vintage': 1,
          'sniff': 4,
 

In [11]:
print(feature_keywords_freq_dist.most_common(100))

[('even', 2568), ('good', 2388), ('much', 2045), ('also', 1967), ('well', 1895), ('first', 1828), ('really', 1558), ('little', 1492), ('bad', 1395), ('never', 1374), ('new', 1292), ('many', 1268), ('great', 1150), ('u', 1072), ('big', 1064), ('still', 1053), ('however', 989), ('back', 935), ('real', 915), ('enough', 902), ('old', 887), ('last', 852), ('actually', 837), ('long', 835), ('almost', 820), ('ever', 776), ('funny', 750), ('young', 743), ('right', 735), ('original', 705), ('quite', 649), ('far', 635), ('high', 631), ('rather', 621), ('american', 608), ('yet', 605), ('always', 586), ('special', 572), ('hard', 569), ('instead', 565), ('black', 542), ('probably', 539), ('human', 538), ('away', 531), ('together', 521), ('pretty', 510), ('sure', 491), ('whole', 482), ('perhaps', 464), ('second', 457), ('especially', 456), ('completely', 440), ('different', 430), ('small', 429), ('simply', 428), ('several', 419), ('give', 411), ('true', 410), ('entire', 408), ('dead', 408), ('soon',

In [12]:
# Full keyword vocabulary, ordered from most to least frequent (counts discarded).
word_features = [
    keyword
    for keyword, _count in feature_keywords_freq_dist.most_common()
]

In [13]:
print(word_features[:100])

['even', 'good', 'much', 'also', 'well', 'first', 'really', 'little', 'bad', 'never', 'new', 'many', 'great', 'u', 'big', 'still', 'however', 'back', 'real', 'enough', 'old', 'last', 'actually', 'long', 'almost', 'ever', 'funny', 'young', 'right', 'original', 'quite', 'far', 'high', 'rather', 'american', 'yet', 'always', 'special', 'hard', 'instead', 'black', 'probably', 'human', 'away', 'together', 'pretty', 'sure', 'whole', 'perhaps', 'second', 'especially', 'completely', 'different', 'small', 'simply', 'several', 'give', 'true', 'entire', 'dead', 'soon', 'main', 'comic', 'else', 'final', 'unfortunately', 'wrong', 'next', 'full', 'often', 'alien', 'certainly', 'finally', 'interesting', 'maybe', 'able', 'top', 'later', 'nice', 'open', 'white', 'classic', 'short', 'screen', 'evil', 'nearly', 'early', 'major', 'exactly', 'close', 'obvious', 'already', 'deep', 'beautiful', 'live', 'sometimes', 'perfect', 'strong', 'quickly', 'truly']


In [14]:
len(set(feature_keywords))

16468

In [15]:
len(word_features)

16468

In [16]:
def find_features(document):
    """Build the boolean feature dictionary for one document.

    ``document`` is a list of tokens (the first element of a ``documents``
    tuple). Its keywords are extracted with ``extract_keywords`` and reduced
    to a set, so only presence matters, not how often a keyword occurs.

    Returns a dict with one key per entry of ``word_features``; the value is
    True when that word is among the document's keywords, False otherwise.
    """
    present = set(extract_keywords(document))
    return {feature: (feature in present) for feature in word_features}

In [17]:
# Pair each document's feature dictionary with its category label.
featuresets = [
    (find_features(tokens), label)
    for tokens, label in documents
]

In [18]:
# NOTE(review): this displays 100 (feature dict, label) pairs, and each dict has one
# entry per word in word_features — the rendered output is very large; consider a smaller slice.
featuresets[:100]

[({'depite': False,
   'tempted': False,
   'leftover': False,
   'kinetic': False,
   'ensemble': False,
   'carbon': False,
   'hood': False,
   'myra': False,
   'origin': False,
   'jew': False,
   'patti': False,
   'garth': False,
   'summary': False,
   'inappropriately': False,
   'mere': False,
   'thematic': False,
   'alex': False,
   'fluid': False,
   'sable': False,
   'deepti': False,
   'maltese': False,
   'lot': False,
   'route': False,
   'distraught': False,
   'schwartz': False,
   'back': False,
   'thankful': False,
   'salaam': False,
   'dippy': False,
   'navigate': False,
   'agonizingly': False,
   'blanchett': False,
   'hairy': False,
   'naughtiness': False,
   'sterile': False,
   'beau': False,
   'faye': False,
   'pan': False,
   'thereof': False,
   'breakneck': False,
   'intense': False,
   'embark': False,
   'sensually': False,
   'turkey': False,
   'saw': False,
   'crucial': False,
   'earth': False,
   'hey': False,
   'theory': False,
   'u

In [19]:
len(featuresets)

2000

In [20]:
def cutoff(split = 0.75):
    """Return the index at which ``featuresets`` is divided into train and test.

    ``split`` is the fraction of ``featuresets`` that goes before the index;
    the product is truncated to an int.
    """
    total = len(featuresets)
    return int(total * split)

In [21]:
# Everything before the cutoff index (cutoff() defaults to a 0.75 split) is used for training.
training_set = featuresets[:cutoff()]

In [22]:
# Everything from the cutoff index onward is held out for evaluation.
testing_set = featuresets[cutoff():]

In [23]:
# Train NLTK's Naive Bayes classifier on the (feature dict, label) training pairs.
myClassifier = nltk.NaiveBayesClassifier.train(training_set)

In [24]:
# Accuracy on the held-out testing_set, reported as a percentage.
print(
    "Classifier accuracy percent:",
    100 * nltk.classify.accuracy(myClassifier, testing_set),
)

Classifier accuracy percent: 77.8


In [25]:
myClassifier.show_most_informative_features(100)

Most Informative Features
                  batman = True              neg : pos    =     12.2 : 1.0
               ludicrous = True              neg : pos    =     11.7 : 1.0
                   lousy = True              neg : pos    =     10.9 : 1.0
              unbearable = True              neg : pos    =      9.6 : 1.0
                thematic = True              pos : neg    =      9.1 : 1.0
              derivative = True              neg : pos    =      8.9 : 1.0
             outstanding = True              pos : neg    =      8.3 : 1.0
             wonderfully = True              pos : neg    =      8.3 : 1.0
              accessible = True              pos : neg    =      7.7 : 1.0
                   fairy = True              pos : neg    =      7.7 : 1.0
                    echo = True              pos : neg    =      7.7 : 1.0
                  seagal = True              neg : pos    =      7.6 : 1.0
          excruciatingly = True              neg : pos    =      7.6 : 1.0

# Examples

In [26]:
def classify_review(review_input):
    """Classify a free-text review and return a human-readable sentiment label.

    The text is tokenised with ``word_tokenize``, converted to a feature
    dictionary by ``find_features`` and passed to ``myClassifier``.

    Returns "Positive Sentiment" when the classifier answers 'pos', and
    "Negative Sentiment" for any other label.
    """
    tokens = list(word_tokenize(review_input))
    label = myClassifier.classify(find_features(tokens))
    return "Positive Sentiment" if label == 'pos' else "Negative Sentiment"

In [27]:
# Interactive prompt: execution pauses here until a review is typed in, so an
# unattended Restart & Run All will stall at this cell.
input1 = input("Please enter the review paragraphs here : \n\n")

Please enter the review paragraphs here : 

The last few Scorsese pics left me a little disappointed. I had begun to think Marty had become a 'gun for hire' and that his brilliance may have been spent (his earlier works were some of the best movies ever made). I attended a screening of The Wolf of Wall Street this evening, and was expecting to be unimpressed. I am happy to say I was completely blown away. This pic is Marty at his best. I laughed, I cringed, I related (with fond memories as well as a bit of guilt) and I TOTALLY believed every unbelievable moment. A good book, a great screenplay and a delightful cast were formed and molded into what I believe should get Scorsese a best director Oscar, and likely a Best Picture Award for the movie. Leo DiCaprio has grown into a versatile actor and his creation of this super hero dirtbag's roller coaster ride in this crazy (true) story is really honest and delightfully entertaining. Jonah Hill pulled out all the stops too and this is defin

In [28]:
classify_review(input1)

'Positive Sentiment'

In [29]:
input2 = input("Please enter the review paragraphs here : \n\n")

Please enter the review paragraphs here : 

It's hard to find the words to explain how TRULY AWFUL this film is. I'll try to do a list:  1) There's no context: They never show the victims of the fraud. We see the sales effort but not the people they're selling to. How can you do a movie about people perpetrating a fraud without showing the fraud & its effects???   2) There's no character development: They all start out as disgusting creeps and they all end up being disgusting creeps.   3) There's not much of a plot: It's 2 and 1/2 hours of debauchery and then 1/2 hour of getting caught. The debauchery part goes on forever and gets boring really fast. Not to mention disturbing & disgusting. Did Scorsese really make this movie just to show all this debauchery? What's the point of showing 2 and 1/2 hours of it? We get the point that they are gross lunatics pretty fast. Why keep going with seemingly endless variations of it? There is no point to it.   So, when all is said & done, this is b

In [30]:
classify_review(input2)

'Negative Sentiment'

[Input Reviews](http://www.imdb.com/title/tt0993846/reviews?ref_=tt_urv)