# Sentiment Classifier Using NLTK

In [1]:
import numpy

In [2]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

Using TensorFlow backend.


In [3]:
numpy.random.seed(42)

In [4]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = top_words)

In [5]:
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen = max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen = max_review_length)

In [6]:
embedding_vector_length = 35
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length = max_review_length ))
model.add(LSTM(100))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss= 'binary_crossentropy', optimizer = 'adam', metrics= ['accuracy'])
print(model.summary())

model.fit(X_train, y_train, validation_data=(X_test, y_test) , epochs=3, batch_size=64)
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 35)           175000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               54400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 229,501
Trainable params: 229,501
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 76.77%


# Starting with Latent Semantic Analysis

Its a text classification technique which create word as vectors for further computation

In [12]:
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
import nltk
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Swayanshu\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [14]:
tfidf = TfidfVectorizer()

In [15]:
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/Swayanshu'  # Mac users should leave out C:
inaug = PlaintextCorpusReader(corpus_root, '.*txt')  # all files ending in 'txt' 

# Tokenization

In [19]:
import nltk
from nltk import word_tokenize
r = input("write any text ")
print("The length of text is", len(word_tokenize(r)), "words")

write any text this is a sunny day and very charming
The length of text is 8 words


In [20]:
from nltk import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize('Have an analytical day to everyone. Hope you guys will be fascinated towards Data science')

['Have',
 'an',
 'analytical',
 'day',
 'to',
 'everyone.',
 'Hope',
 'you',
 'guys',
 'will',
 'be',
 'fascinated',
 'towards',
 'Data',
 'science']

In [23]:
text = nltk.word_tokenize("ask as much doubt you have in a particular subject, don't hesitate")
print(text)

['ask', 'as', 'much', 'doubt', 'you', 'have', 'in', 'a', 'particular', 'subject', ',', 'do', "n't", 'hesitate']


In [25]:
from nltk import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("ask as much doubt you have in a particular subject, don't hesitate")

['ask',
 'as',
 'much',
 'doubt',
 'you',
 'have',
 'in',
 'a',
 'particular',
 'subject',
 ',',
 'don',
 "'",
 't',
 'hesitate']

In [26]:
import nltk
sent = "she secured 9.04cgpa in her under graduation"
from nltk.tokenize import WhitespaceTokenizer
WhitespaceTokenizer().tokenize(sent)

['she', 'secured', '9.04cgpa', 'in', 'her', 'under', 'graduation']

In [29]:
import nltk
sent = "she secured 9.04cgpa in her under graduation"
print(sent.split('\n'))
print (sent.split(' '))
print (sent.split())

['she secured 9.04cgpa in her under graduation']
['she', 'secured', '9.04cgpa', 'in', 'her', 'under', 'graduation']
['she', 'secured', '9.04cgpa', 'in', 'her', 'under', 'graduation']


# Normalization

In [30]:
text = "HaRd WoRK is the KEY for SucCess"
print(text.lower())

hard work is the key for success


In [32]:
print(text.upper())

HARD WORK IS THE KEY FOR SUCCESS


In [34]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [36]:
def para_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    para = [w for w in text if w.lower() not in stopwords]
    return len(para) / len(text)
    

In [40]:
para_fraction(nltk.corpus.inaugural.words())

0.5228599855902837

replacer

from replacers import RepeatReplacer
replace = RepeatReplacer()
replacer.replace('Lottttttt')

# Understanding Word Frequency

In [49]:
corpus = nltk.corpus.brown.tagged_sents(categories = 'adventure')[:700]


In [50]:
print(len(corpus))

700


In [51]:
from nltk.util import unique_list
tag_set = unique_list(tag for sent in corpus for(word,tag) in sent)
print(len(tag_set))

104


In [53]:
symbols = unique_list(word for sent in corpus for (word,tag) in sent)

In [54]:
trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
train_corpus = []
test_corpus = []
for i in range(len(corpus)):
    if i % 10:
        train_corpus += [corpus[i]]
    else:
        test_corpus += [corpus[i]]
            

In [56]:
print(len(train_corpus))

630


In [57]:
print(len(test_corpus))

70


In [58]:
def train_and_test(est):
    hmm = trainer.train_supervised(train_corpus, estimator= est)
    print('%.2f%%' % (100*hmm.evaluate(test_corpus)))
    

# Introducing Parts-of-Speech Tagging

In [60]:
import nltk
text1 = nltk.word_tokenize("What a pleasant day today")
nltk.pos_tag(text1)

[('What', 'WP'),
 ('a', 'DT'),
 ('pleasant', 'JJ'),
 ('day', 'NN'),
 ('today', 'NN')]

In [61]:
taggedword = nltk.tag.str2tuple('day/NN')
taggedword

('day', 'NN')

In [62]:
taggedword[0]

'day'

In [63]:
taggedword[1]

'NN'

# Statistical Modeling Involving the n-gram Approach

In [64]:
from nltk.tag import UnigramTagger 
from nltk.corpus import treebank
training = treebank.tagged_sents()[:7000]
unitagger = UnigramTagger(training)
treebank.sents()[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [65]:
testing = treebank.tagged_sents()[:2000]
unitagger.evaluate(testing)

0.9581593179237475

In [68]:
from nltk.tag import UnigramTagger 
from nltk.corpus import treebank

unitag = UnigramTagger(model={'Vinken' : 'NN'})
unitag.tag(treebank.sents()[0])

[('Pierre', None),
 ('Vinken', 'NN'),
 (',', None),
 ('61', None),
 ('years', None),
 ('old', None),
 (',', None),
 ('will', None),
 ('join', None),
 ('the', None),
 ('board', None),
 ('as', None),
 ('a', None),
 ('nonexecutive', None),
 ('director', None),
 ('Nov.', None),
 ('29', None),
 ('.', None)]

In [69]:
from nltk.tag import AffixTagger 
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]
prefixtagger = AffixTagger(training, affix_length = 4)
prefixtagger.evaluate(testing)

0.2094751318841472

In [70]:
prefixtagger3 = AffixTagger(training, affix_length = 3, backoff = prefixtagger)
prefixtagger3.evaluate(testing)

0.25841082168442225

In [72]:
suffixtagger3 = AffixTagger(training, affix_length = -3, backoff = prefixtagger)
suffixtagger3.evaluate(testing)

0.29166410082722666

In [74]:
suffixtagger4 = AffixTagger(training, affix_length = -4, backoff = suffixtagger3)
suffixtagger4.evaluate(testing)

0.3288379826343987

In [75]:
from nltk.tag import tnt 
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]
tnt_tagger = tnt.TnT()
tnt_tagger.train(training)
tnt_tagger.evaluate(testing)

0.9882176652913768