## Introduction to NLTK and text classification

In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Word and Sentence Tokenization

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

text = "This is a sample text. We are learning nltk, text-classification using Python. Very excited!"

print(sent_tokenize(text))

['This is a sample text.', 'We are learning nltk, text-classification using Python.', 'Very excited!']


In [4]:
print(word_tokenize(text))

['This', 'is', 'a', 'sample', 'text', '.', 'We', 'are', 'learning', 'nltk', ',', 'text-classification', 'using', 'Python', '.', 'Very', 'excited', '!']


### Stopwords in English

In [6]:
from nltk.corpus import stopwords
print(set(stopwords.words('english')))

set([u'all', u'just', u"don't", u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'don', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u"should've", u"haven't", u'do', u'them', u'his', u'very', u"you've", u'they', u'not', u'during', u'now', u'him', u'nor', u"wasn't", u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u"won't", u'where', u"mustn't", u"isn't", u'few', u'because', u"you'd", u'doing', u'some', u'hasn', u"hasn't", u'are', u'our', u'ourselves', u'out', u'what', u'for', u"needn't", u'below', u're', u'does', u"shouldn't", u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u"mightn't", u"doesn't", u'were', u'here', u'shouldn', u'hers', u"aren't", u'by', u'on', u'about', u'couldn', u'of', u"wouldn't", u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u"hadn't", u'mightn', u"couldn't", u'wasn', u'your', u"you're", u'from', u'her', u'their', u'aren', u"it's",

In [8]:
example_sent = "This is some sample text, showing off the stop words filtration."
stop_set = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)
filtered_sent = [word for word in word_tokens if word not in stop_set]
print(example_sent)
print(filtered_sent)

This is some sample text, showing off the stop words filtration.
['This', 'sample', 'text', ',', 'showing', 'stop', 'words', 'filtration', '.']


### Stemming in English

In [9]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
sample_text = "When riders are riding their horses, they often think of how cowboys rode horses."
sample_words = word_tokenize(sample_text)
stem_words = [ps.stem(w) for w in sample_words]
print(stem_words)

['when', u'rider', 'are', u'ride', 'their', u'hors', ',', 'they', 'often', 'think', 'of', 'how', u'cowboy', 'rode', u'hors', '.']


### POS tagging with NLTK

In [11]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_sample = state_union.raw('2005-GWBush.txt')
test_sample = state_union.raw('2006-GWBush.txt')
custom_tokenizer = PunktSentenceTokenizer(train_sample)
tokenized_test = custom_tokenizer.tokenize(test_sample)

In [14]:
tagged = [nltk.pos_tag(word_tokenize(sent)) for sent in tokenized_test]

In [15]:
print(tagged[0])

[(u'PRESIDENT', 'NNP'), (u'GEORGE', 'NNP'), (u'W.', 'NNP'), (u'BUSH', 'NNP'), (u"'S", 'POS'), (u'ADDRESS', 'NNP'), (u'BEFORE', 'IN'), (u'A', 'NNP'), (u'JOINT', 'NNP'), (u'SESSION', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'CONGRESS', 'NNP'), (u'ON', 'NNP'), (u'THE', 'NNP'), (u'STATE', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'UNION', 'NNP'), (u'January', 'NNP'), (u'31', 'CD'), (u',', ','), (u'2006', 'CD'), (u'THE', 'NNP'), (u'PRESIDENT', 'NNP'), (u':', ':'), (u'Thank', 'NNP'), (u'you', 'PRP'), (u'all', 'DT'), (u'.', '.')]


## Text Classification using NLTK

In [17]:
import random
from nltk.corpus import movie_reviews
docs = [(list(movie_reviews.words(fileid)), category)
       for category in movie_reviews.categories()
       for fileid in movie_reviews.fileids(category)]
random.shuffle(docs)
vocab = [w.lower() for w in movie_reviews.words()]
vocab_dist = nltk.FreqDist(vocab)

In [20]:
print(len(docs))
print(docs[0])

2000
([u'in', u'the', u'series', u'of', u'the', u'erotic', u'thrillers', u'that', u'flooded', u'the', u'videoshelves', u'in', u'the', u'early', u'1990s', u'came', u'this', u'french', u'-', u'canadian', u'co', u'-', u'production', u'by', u'max', u'fischer', u'.', u'the', u'movie', u'is', u'set', u'in', u'paris', u'where', u'its', u'hero', u',', u'struggling', u'american', u'author', u'david', u'mirkine', u'(', u'judd', u'nelson', u',', u'at', u'the', u'time', u'specialised', u'in', u'playing', u'losers', u'and', u'people', u'at', u'the', u'edge', u'of', u'sanity', u')', u'suffers', u'a', u'terrible', u'writers', u'bloc', u'.', u'he', u'manages', u'to', u'overcome', u'crisis', u'after', u'beginning', u'romantic', u'relationship', u'with', u'beautiful', u'model', u'anabelle', u'(', u'laurence', u'treill', u')', u'.', u'unfortunately', u',', u'she', u'hangs', u'out', u'in', u'jet', u'set', u'circles', u',', u'which', u'gradually', u'makes', u'mirkine', u'pathologically', u'jealous', u'.', 

In [21]:
word_features = list(vocab_dist.keys())[:4000]
print(len(word_features))

4000


In [22]:
featuresets = [({w:(w in rev) for w in word_features},category) for (rev, category) in docs]

In [25]:
from sklearn.model_selection import train_test_split
seed = 1
train_data, test_data = train_test_split(featuresets, test_size=0.25, random_state=seed)

In [27]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel='linear'))
model.train(train_data)
accuracy = nltk.classify.accuracy(model, test_data)
print(accuracy)

0.678
