## Natural Language Toolkit

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
sample = "This is the thing which i always wanted to do! Whooo Wait what? nothing"
# sent to seperate as a sentences
print(sent_tokenize(sample.lower()))
# seperate as word wise
words = word_tokenize(sample.lower())

['this is the thing which i always wanted to do!', 'whooo wait what?', 'nothing']


In [4]:
import string
punct = list(string.punctuation)
print(punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [5]:
# to remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english') # contain all in small letters means not contain This but contain this
stop = stop + punct
clean_words = [w for w in words if w not in stop]
clean_words

['thing', 'always', 'wanted', 'whooo', 'wait', 'nothing']

## Stemming (player, playing, played, play)

In [6]:
from nltk.stem import PorterStemmer

In [7]:
ps = PorterStemmer()
lis = ['Player', 'Play', 'Playing', 'played', 'rahul', 'happii', 'happier'] # player and play bith have diff context
stemmed = [ps.stem(w) for w in lis]
print(stemmed) # this can produce even those words which are even not part of the english dictionary

['player', 'play', 'play', 'play', 'rahul', 'happii', 'happier']


## Part of Speech (whether the word used is a nou,adjctive,verb,pronoun etc,,)

In [8]:
from nltk import pos_tag
from nltk.corpus import state_union

In [9]:
text = state_union.raw("2006-GWBush.txt")
pos = pos_tag(word_tokenize(text)) # needed to convert in the list
# print(pos) # to see each word

In [10]:
# nltk.help.upenn_tagset() # to see what tag meaning is.

In [11]:
print(pos_tag(["raj", "went", "for", "a", "walk"]))
print(pos_tag(["This", "painting", "is", "beautiful", "."]))
str1 = "I have been painting since morning."
print(pos_tag(word_tokenize(str1)))

[('raj', 'NN'), ('went', 'VBD'), ('for', 'IN'), ('a', 'DT'), ('walk', 'NN')]
[('This', 'DT'), ('painting', 'NN'), ('is', 'VBZ'), ('beautiful', 'JJ'), ('.', '.')]
[('I', 'PRP'), ('have', 'VBP'), ('been', 'VBN'), ('painting', 'VBG'), ('since', 'IN'), ('morning', 'NN'), ('.', '.')]


## Lemmitizer(more powerfull way than stemming)

In [12]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

In [13]:
print(lem.lemmatize("better", pos = 'a'))
print(lem.lemmatize("good", pos = 'a'))
print(lem.lemmatize("excellent", pos = 'n'))
print(lem.lemmatize("painting", pos = 'n')) # This painting is awesome
print(lem.lemmatize("painting", pos = 'v')) # i am painting this wall

good
good
excellent
painting
paint


In [14]:
from nltk.corpus import wordnet
# lemmatizer need part of speech in a diff format like for a noun we need to pass 'n'
def simpler_pos(word):
    if word.startswith('J'):
        return wordnet.ADJ
    elif word.startswith('V'):
        return wordnet.VERB # this constant defines in word net  from which we used lemmatizer
    elif word.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN #not actually just use it as for simplicity

## Movie Reviews Dataset Project

In [15]:
from nltk.corpus import movie_reviews

In [16]:
print(movie_reviews.categories()) # all categories  in the dataset
print(len(movie_reviews.fileids()))
print(len(movie_reviews.fileids('neg'))) # lenth of negative reviews
print(movie_reviews.words(movie_reviews.fileids()[10])) # accessing the words written in a review

['neg', 'pos']
2000
1000
['best', 'remembered', 'for', 'his', 'understated', ...]


In [17]:
documents = [] # store all 2000 reviews with category in a tuple
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [18]:
import random
random.shuffle(documents) # to shuffle t randomly for easy train and test split
documents[0:5]

[(['despite', 'its', 'exceedingly', 'well', '-', 'done', ...], 'neg'),
 (['well', 'if', 'you', 'are', 'up', 'for', 'stellar', ...], 'neg'),
 (['by', 'the', 'time', 'dennis', 'quaid', ',', 'the', ...], 'neg'),
 (['the', 'last', 'steve', 'martin', 'film', 'i', 'saw', ...], 'pos'),
 (['it', "'", 's', 'tough', 'to', 'really', 'say', ...], 'pos')]

In [19]:
def clean_review(words):
    output = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])#pass word as array elem nt as strg becz then it consider each char of string as seprate elem
            clean_word = lem.lemmatize(w, pos = simpler_pos(pos[0][1])) # as pos_tag will give an tuple 1st te answer 2nd its tag
            output.append(clean_word.lower())
    return output

In [20]:
documen = [(clean_review(document), category) for document, category in documents]

In [21]:
train_doc = documen[0:1500]
test_doc = documen[1500:]

In [22]:
all_words = []
for doc in train_doc:
    all_words += doc[0] # doc is a tuple with all words at its 0th entry

In [23]:
import nltk
freq = nltk.FreqDist(all_words) # to store all frequencies
common = freq.most_common(3000)
features = [i[0] for i in common] #  it is an array of tuples each doc has there feat and category 

In [24]:
def get_features_dict(words):
    current_feat = {}
    words_set = set(words)
    for w in features:
        current_feat[w] = w in words_set
    return current_feat

In [25]:
training_data = [(get_features_dict(doc), category) for doc, category in train_doc]
testing_data = [(get_features_dict(doc), category) for doc, category in test_doc]

In [26]:
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(training_data)
nltk.classify.accuracy (classifier, testing_data)

0.818

In [27]:
classifier.show_most_informative_features(15)

Most Informative Features
                  seagal = True              neg : pos    =     10.9 : 1.0
                 freddie = True              neg : pos    =     10.9 : 1.0
             outstanding = True              pos : neg    =     10.7 : 1.0
                  sloppy = True              neg : pos    =     10.5 : 1.0
                  prinze = True              neg : pos    =     10.2 : 1.0
              schumacher = True              neg : pos    =      9.5 : 1.0
                   inept = True              neg : pos    =      9.2 : 1.0
                   mulan = True              pos : neg    =      9.1 : 1.0
             wonderfully = True              pos : neg    =      8.6 : 1.0
                   ideal = True              pos : neg    =      7.8 : 1.0
            respectively = True              pos : neg    =      7.4 : 1.0
                   anger = True              pos : neg    =      6.2 : 1.0
                 idiotic = True              neg : pos    =      6.0 : 1.0

## Using SkLearn's Classifiers within NLTK

In [28]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [29]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [30]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.786

In [31]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [32]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [33]:
nltk.classify.accuracy(classifier_sklearn1, testing_data)

0.694