## Natural Language Toolkit

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
sample = "This is the thing which i always wanted to do! Whooo Wait what? nothing"
# sent to seperate as a sentences
print(sent_tokenize(sample.lower()))
# seperate as word wise
words = word_tokenize(sample.lower())

['this is the thing which i always wanted to do!', 'whooo wait what?', 'nothing']


In [3]:
import string
punct = list(string.punctuation)
print(punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [4]:
# to remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english') # contain all in small letters means not contain This but contain this
stop = stop + punct
clean_words = [w for w in words if w not in stop]
clean_words

['thing', 'always', 'wanted', 'whooo', 'wait', 'nothing']

## Stemming (player, playing, played, play)

In [5]:
from nltk.stem import PorterStemmer

In [6]:
ps = PorterStemmer()
lis = ['Player', 'Play', 'Playing', 'played', 'rahul', 'happii', 'happier'] # player and play bith have diff context
stemmed = [ps.stem(w) for w in lis]
print(stemmed) # this can produce even those words which are even not part of the english dictionary

['player', 'play', 'play', 'play', 'rahul', 'happii', 'happier']


## Part of Speech (whether the word used is a nou,adjctive,verb,pronoun etc,,)

In [7]:
from nltk import pos_tag
from nltk.corpus import state_union

In [8]:
text = state_union.raw("2006-GWBush.txt")
pos = pos_tag(word_tokenize(text)) # needed to convert in the list
# print(pos) # to see each word

In [9]:
# nltk.help.upenn_tagset() # to see what tag meaning is.

In [10]:
print(pos_tag(["raj", "went", "for", "a", "walk"]))
print(pos_tag(["This", "painting", "is", "beautiful", "."]))
str1 = "I have been painting since morning."
print(pos_tag(word_tokenize(str1)))

[('raj', 'NN'), ('went', 'VBD'), ('for', 'IN'), ('a', 'DT'), ('walk', 'NN')]
[('This', 'DT'), ('painting', 'NN'), ('is', 'VBZ'), ('beautiful', 'JJ'), ('.', '.')]
[('I', 'PRP'), ('have', 'VBP'), ('been', 'VBN'), ('painting', 'VBG'), ('since', 'IN'), ('morning', 'NN'), ('.', '.')]


## Lemmitizer(more powerfull way than stemming)

In [11]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

In [12]:
print(lem.lemmatize("better", pos = 'a'))
print(lem.lemmatize("good", pos = 'a'))
print(lem.lemmatize("excellent", pos = 'n'))
print(lem.lemmatize("painting", pos = 'n')) # This painting is awesome
print(lem.lemmatize("painting", pos = 'v')) # i am painting this wall

good
good
excellent
painting
paint


In [13]:
from nltk.corpus import wordnet
# lemmatizer need part of speech in a diff format like for a noun we need to pass 'n'
def simpler_pos(word):
    if word.startswith('J'):
        return wordnet.ADJ
    elif word.startswith('V'):
        return wordnet.VERB # this constant defines in word net  from which we used lemmatizer
    elif word.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN #not actually just use it as for simplicity

## Movie Reviews Dataset Project

In [14]:
from nltk.corpus import movie_reviews

In [15]:
print(movie_reviews.categories()) # all categories  in the dataset
print(len(movie_reviews.fileids()))
print(len(movie_reviews.fileids('neg'))) # lenth of negative reviews
print(movie_reviews.words(movie_reviews.fileids()[10])) # accessing the words written in a review

['neg', 'pos']
2000
1000
['best', 'remembered', 'for', 'his', 'understated', ...]


In [16]:
documents = [] # store all 2000 reviews with category in a tuple
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [17]:
import random
random.shuffle(documents) # to shuffle t randomly for easy train and test split
documents[0:5]

[(['house', 'on', 'haunted', 'hill', '(', '1999', ')', ...], 'neg'),
 (['here', "'", 's', 'a', 'concept', '--', 'jean', '-', ...], 'neg'),
 (['dr', '.', 'alan', 'grant', '(', 'sam', 'neill', ',', ...], 'neg'),
 (['old', 'soldiers', 'never', 'die', ',', 'they', ...], 'neg'),
 (['this', 'movie', 'is', 'written', 'by', 'the', 'man', ...], 'neg')]

In [18]:
def clean_review(words):
    output = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])#pass word as array elem nt as strg becz then it consider each char of string as seprate elem
            clean_word = lem.lemmatize(w, pos = simpler_pos(pos[0][1])) # as pos_tag will give an tuple 1st te answer 2nd its tag
            output.append(clean_word.lower())
    return output

In [19]:
documen = [(clean_review(document), category) for document, category in documents]

In [20]:
train_doc = documen[0:1500]
test_doc = documen[1500:]

In [21]:
all_words = []
for doc in train_doc:
    all_words += doc[0] # doc is a tuple with all words at its 0th entry

In [22]:
import nltk
freq = nltk.FreqDist(all_words) # to store all frequencies
common = freq.most_common(3000)
features = [i[0] for i in common] #  it is an array of tuples each doc has there feat and category 

In [23]:
def get_features_dict(words):
    current_feat = {}
    words_set = set(words)
    for w in features:
        current_feat[w] = w in words_set
    return current_feat

In [24]:
training_data = [(get_features_dict(doc), category) for doc, category in train_doc]
testing_data = [(get_features_dict(doc), category) for doc, category in test_doc]

In [25]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(training_data)
nltk.classify.accuracy (classifier, testing_data)

0.746

In [26]:
classifier.show_most_informative_features(15)

Most Informative Features
             magnificent = True              pos : neg    =     11.2 : 1.0
                    anna = True              pos : neg    =     10.6 : 1.0
              schumacher = True              neg : pos    =      9.5 : 1.0
                    zeta = True              neg : pos    =      9.5 : 1.0
            breathtaking = True              pos : neg    =      8.6 : 1.0
             outstanding = True              pos : neg    =      8.1 : 1.0
                religion = True              pos : neg    =      8.0 : 1.0
               ludicrous = True              neg : pos    =      7.7 : 1.0
                   anger = True              pos : neg    =      7.4 : 1.0
             wonderfully = True              pos : neg    =      7.3 : 1.0
                  seagal = True              neg : pos    =      7.2 : 1.0
                  random = True              neg : pos    =      7.1 : 1.0
                 balance = True              pos : neg    =      7.0 : 1.0