In [265]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords, state_union
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import gutenberg, movie_reviews, wordnet
import random
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


### Tokenizing words and Sentences

In [3]:
#nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Tokenizers

Form of grouping stuff

* Word Tokenizers - separates by word
* Sentence Tokenizers - separates by sentence

### Corpora

body of text. eg medical journals, presidential speeches

### Lexicon
dictionary - words and their meanings
investor speak vs english
investor speak - slangs "bull on the market"
english - bull: an animal.



In [18]:
example_text = "Liverpool will annihilate Chelsea. Easily a 4-0 victory for Liverpool."

#### Word Tokenization
Split by space.

#### Sentence Tokenization
Could use split punctuation, but could trip you out. Regex will be a pain. 

#### Example

In [19]:
print(sent_tokenize(example_text))
print(word_tokenize(example_text))

['Liverpool will annihilate Chelsea.', 'Easily a 4-0 victory for Liverpool.']
['Liverpool', 'will', 'annihilate', 'Chelsea', '.', 'Easily', 'a', '4-0', 'victory', 'for', 'Liverpool', '.']


#### Stop Words

English has a lot of filler words that appear very frequently like “and”, “the”, and “a”. When doing statistics on text, these words introduce a lot of noise since they appear way more frequently than other word. Stop words are usually identified by just by checking a hardcoded list of known stop words. But there’s no standard list of stop words that is appropriate for all applications. The list of words to ignore can vary depending on your application.

In [21]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{"it's", 'if', 'needn', "shouldn't", 'hadn', 'no', "didn't", 'on', 'how', 'too', 'has', 're', 'couldn', 'd', 'an', 'by', "shan't", "should've", "you're", 'we', 'her', 'as', 'to', 'll', 'these', 'while', 'is', "haven't", "that'll", "aren't", 'that', 'did', 'weren', 'between', 'can', "hadn't", 'had', 'itself', 'but', 'being', 'the', 'are', 'over', 'same', "weren't", 'who', 'yourself', 'both', 'below', 'then', 'ours', 'with', 'after', 'theirs', 'do', 'down', 'will', 'should', 'ma', 'its', 'until', 'any', "mightn't", 'doing', 'because', 'most', 'again', 'not', 'having', 'them', 'am', "isn't", 'shouldn', 'for', 'more', 'were', 'you', "wouldn't", 'won', 'a', 'been', 'so', 'into', 'isn', 'where', 'don', 'those', 'this', 'about', 'your', 'than', 'and', 'it', 'which', 'above', 'what', 'nor', "hasn't", 'all', 'few', 'me', 'further', 'such', 'o', 'up', 'our', 'haven', "mustn't", "don't", 'their', 'here', 'whom', 'he', 'she', "you've", 'him', 'of', 'against', 'there', "couldn't", 'why', 'yourselve

In [24]:
filtered_sentence = [w for w in word_tokenize(example_text) if not w in stop_words]
filtered_sentence

['Liverpool',
 'annihilate',
 'Chelsea',
 '.',
 'Easily',
 '4-0',
 'victory',
 'Liverpool',
 '.']

#### Stemming

Form of "normalization". Take words then take the stem of the word.
for example - riding, ridden -- **root** is ride.

We do this because we might have a variation of words but really the meaning of the sentence is really unchanged.

I was taking a ride in the car

I was riding in the car.

two words having the same definition. Pointless, causes redundancy. 

PorterStemmer (circa 1979) used 

In [31]:
ps = PorterStemmer()
example_words = ["destroy","destroyed","destroying","destroys"]
stemmed_words = [ps.stem(w) for w in example_words]
stemmed_words

['destroy', 'destroy', 'destroy', 'destroy']

#### Part of Speech Tagging
labelling a part of speech to every word 


PunktSentenceTokenizer is an sentence boundary detection algorithm that must be trained to be used [1]. NLTK already includes a pre-trained version of the PunktSentenceTokenizer.

So if you use initialize the tokenizer without any arguments, it will default to the pre-trained version

You can also provide your own training data to train the tokenizer before using it. Punkt tokenizer uses an unsupervised algorithm, meaning you just train it with regular text.

https://stackoverflow.com/questions/35275001/use-of-punktsentencetokenizer-in-nltk

In [66]:
train_text = state_union.raw("2006-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
## sentence tokenizer
tokenized = custom_sent_tokenizer.tokenize(sample_text)


In [83]:
tagged = [nltk.pos_tag(nltk.word_tokenize(i)) for i in tokenized]


#### Chunking

Who is the sentence talking about. Named enity (many nouns) in the account? Words that modify that noun. Descriptive bunch of words surrounding that noun.

Chunking is a process of extracting phrases from unstructured text. Instead of just simple tokens which may not represent the actual meaning of the text, its advisable to use phrases such as “South Africa” as a single word instead of ‘South’ and ‘Africa’ separate words.

Can chunk to find noun phrases. United States of America needs to be together, President Bush should be together. Chunks help it keep it together.



https://rikenshah.github.io/articles/natural-language-

https://medium.com/greyatom/learning-pos-tagging-chunking-in-nlp-85f7f811a8cb


In [78]:
sentence = "President Obama Barack White House barked at the cat"
grammar = ('''
    NP: {<DT>?<JJ>*<NN>} # NP
    ''')
chunkParser = nltk.RegexpParser(grammar)
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
tree = chunkParser.parse(tagged)


In [74]:
tree.draw()

### Chinking
A chink is what we wish to remove from the chunk. Can Use regular expressions to remove unwanted 

#### Name Entity Recoginition

In [82]:
words = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(words)
namedEnt = nltk.ne_chunk(tagged, binary=True)

namedEnt.draw()

#### Lemmatizing

Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma 

#### Caveat
 The only major thing to note is that lemmatize takes a part of speech parameter, "pos." If not supplied, the default is "noun." This means that an attempt will be made to find the closest noun, which can create trouble for you. 

In [108]:
lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("better",'a'))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("runs"))


cat
cactus
goose
rock
better
good
best
run


#### Wordnet

In [124]:
syns = wordnet.synsets("good")
syns

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

In [125]:
syns[0].lemmas()[0].name()

'good'

In [121]:
syns[0].definition()

'a series of steps to be carried out or goals to be accomplished'

In [123]:
syns[0].examples()

['they drew up a six-step plan', 'they discussed plans for a new bond issue']

In [134]:
syns[0].lemmas()[0].name()

'good'

#### Semantic Similarity

In [139]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")
w1.wup_similarity(w2)

0.32

In [144]:
w1 = wordnet.synset("trap.n.01")
w2 = wordnet.synset("trap.n.01")
w1.wup_similarity(w2)

1.0

#### Text classifier for Sentiment Analysis

In [271]:
documents = []


for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append(tuple([movie_reviews.words(fileid),category]))


In [272]:
### create training and testing set.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]


In [273]:
random.shuffle(documents)

#### Bag of Words

 A way of extracting features from the text for use in machine learning algorithms. We use the tokenized words for each observation and find out the frequency of each token.
 
 1. We get a list of all possible words from all our documents including the punctuation.
 2. Create vectors Counting the number of times each word appears in a document.



In [274]:
### Massive word list

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())


In [275]:
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]

In [276]:
def find_features(document):
    ## find all unique words
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [277]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]


In [278]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]


In [279]:
classifier = nltk.NaiveBayesClassifier.train(training_set)


### Saving classifier using pickle


In [253]:
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [254]:
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

In [280]:
print("Naive Bayes Algo accuracy: ",(nltk.classify.accuracy(classifier, testing_set))*100)

Naive Bayes Algo accuracy:  75.0


In [281]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =     10.5 : 1.0
                 frances = True              pos : neg    =      9.1 : 1.0
                  annual = True              pos : neg    =      9.1 : 1.0
                     ugh = True              neg : pos    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      8.2 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
              schumacher = True              neg : pos    =      7.3 : 1.0
                 idiotic = True              neg : pos    =      7.1 : 1.0
                  regard = True              pos : neg    =      7.1 : 1.0
                  suvari = True              neg : pos    =      6.9 : 1.0
                    mena = True              neg : pos    =      6.9 : 1.0
               atrocious = True              neg : pos    =      6.5 : 1.0
                obstacle = True              pos : neg    =      6.4 : 1.0

#### NLTK and Sci-kit

In [282]:
#### MultinomialNB
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testing_set))

MultinomialNB accuracy percent: 0.83


In [283]:
#### BernoulliNB
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, testing_set))

BernoulliNB accuracy percent: 0.78


In [284]:
#### Logistic Regression

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)


LogisticRegression_classifier accuracy percent: 90.0


In [285]:
#### Stochastic Gradient Descent
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)



SGDClassifier_classifier accuracy percent: 81.0


In [286]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

SVC_classifier accuracy percent: 82.0


In [270]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)


NuSVC_classifier accuracy percent: 81.0


#### Refractoring the above code in a function and writing a function 