## Data preprocessing tools

In [89]:
# Tokenize the words

In [90]:
text = "Does this thing really work? Let's see."

In [91]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [92]:
sent_tokenize(text)

['Does this thing really work?', "Let's see."]

In [93]:
word_tokenize(text)

['Does', 'this', 'thing', 'really', 'work', '?', 'Let', "'s", 'see', '.']

In [94]:
# Remove unneccessary words

In [95]:
from nltk.corpus import stopwords
import string
stops = stopwords.words('english')
punctuations = list(string.punctuation)
stops += punctuations
stops = set(stops)

In [96]:
[w for w in word_tokenize(text.lower()) if w not in stops]

['thing', 'really', 'work', 'let', "'s", 'see']

In [97]:
# Stemming

In [98]:
words = ['play', 'playing', 'played', 'player']

In [99]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [100]:
stemmed_words = [ps.stem(w) for w in words]
stemmed_words

['play', 'play', 'play', 'player']

In [101]:
# Part of speech

In [102]:
from nltk import pos_tag
from nltk.corpus import state_union

In [103]:
# load the speech
text = state_union.raw('2006-GWBush.txt')
text[: 500]

"PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all. Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream. Tonight we are comforted by the hope of a glad reunion with the hus"

In [104]:
pos = pos_tag(word_tokenize(text))
pos[: 10]

[('PRESIDENT', 'NNP'),
 ('GEORGE', 'NNP'),
 ('W.', 'NNP'),
 ('BUSH', 'NNP'),
 ("'S", 'POS'),
 ('ADDRESS', 'NNP'),
 ('BEFORE', 'IN'),
 ('A', 'NNP'),
 ('JOINT', 'NNP'),
 ('SESSION', 'NNP')]

In [105]:
len(pos)

6461

In [106]:
# Lemmatization

In [107]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [108]:
lemmatizer.lemmatize('better', pos='a')

'good'

In [109]:
lemmatizer.lemmatize('painting', pos='v')

'paint'

In [110]:
lemmatizer.lemmatize('painting', pos='n')

'painting'

## Actual work

In [111]:
# Load the data

In [112]:
from nltk.corpus import movie_reviews
movie_reviews.categories()

['neg', 'pos']

In [113]:
movie_reviews.fileids()[: 5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [114]:
print(len(movie_reviews.fileids()))
print(len(movie_reviews.fileids('neg')))

2000
1000


In [115]:
words = movie_reviews.words(movie_reviews.fileids()[0])
print(words)
print(len(words))

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
879


In [116]:
# Extracting the words array and categories

In [117]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
        
documents[: 5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [127]:
# shuffle them
import random
random.shuffle(documents)
documents[: 5], len(documents[0][0])

([(['ladies', 'and', 'gentlemen', ',', '1997', "'", 's', ...], 'pos'),
  (['a', 'suave', ',', 'cool', ',', 'collected', ',', ...], 'neg'),
  (['bruce', 'willis', 'needs', 'to', 'stay', 'away', ...], 'neg'),
  (['plot', ':', 'a', 'little', 'boy', 'born', 'in', ...], 'neg'),
  (['susan', 'granger', "'", 's', 'review', 'of', '"', ...], 'pos')],
 1809)

### Data preprocessing

In [131]:
from nltk.corpus import wordnet

# simple pos tags will help to lemmatize the words
def getSimplePosTag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    
    elif tag.startswith('V'):
        return wordnet.VERB
    
    elif tag.startswith('N'):
        return wordnet.NOUN
    
    elif tag.startswith('R'):
        return wordnet.ADV
    
    else:
        return wordnet.NOUN

In [132]:
from nltk import pos_tag
print(pos_tag(['better']))
print(pos_tag(['better'])[0][1])

[('better', 'RBR')]
RBR


In [133]:
def getCleanedDocument(words):
    result = []
    for word in words:
        if word.lower() not in stops:
            stemmed_word = ps.stem(word)
            tag = pos_tag([word])[0][1]
            lemmatized = lemmatizer.lemmatize(word, pos=getSimplePosTag(tag))
            result.append(lemmatized.lower())
    return result

In [134]:
documents[: 5]

[(['ladies', 'and', 'gentlemen', ',', '1997', "'", 's', ...], 'pos'),
 (['a', 'suave', ',', 'cool', ',', 'collected', ',', ...], 'neg'),
 (['bruce', 'willis', 'needs', 'to', 'stay', 'away', ...], 'neg'),
 (['plot', ':', 'a', 'little', 'boy', 'born', 'in', ...], 'neg'),
 (['susan', 'granger', "'", 's', 'review', 'of', '"', ...], 'pos')]

In [135]:
len(documents[0][0])

1809

In [136]:
len(getCleanedDocument(documents[0][0]))

836

In [None]:
new_documents = [(getCleanedDocument(doc), cat) for doc, cat in documents]

In [139]:
len(new_documents[0][0])

836

In [140]:
documents[: 5]

[(['ladies', 'and', 'gentlemen', ',', '1997', "'", 's', ...], 'pos'),
 (['a', 'suave', ',', 'cool', ',', 'collected', ',', ...], 'neg'),
 (['bruce', 'willis', 'needs', 'to', 'stay', 'away', ...], 'neg'),
 (['plot', ':', 'a', 'little', 'boy', 'born', 'in', ...], 'neg'),
 (['susan', 'granger', "'", 's', 'review', 'of', '"', ...], 'pos')]

### Count vectorization

In [142]:
data = ['the sky is blue', 'the sun is bright']

In [150]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3)
output = cv.fit_transform(data)
output

<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [151]:
output.toarray()

array([[1, 1, 1],
       [0, 1, 1]], dtype=int64)

In [152]:
cv.get_feature_names()

['blue', 'is', 'the']

In [153]:
' '.join(['hello', 'sir'])

'hello sir'

In [154]:
categories = [0 if cat=='neg' else 1 for doc, cat in new_documents]
text_docs = [' '.join(doc) for doc, cat in new_documents]

In [156]:
print(len(text_docs))
print(text_docs[0][: 500])
categories[0]

2000
lady gentleman 1997 independence day title starship trooper surprisingly entertain id4 realize give last year sci fi hit 4 star rating spell powerful first hour subsequent viewing feel independence day really great seem -- though still give positive review starship trooper hand mock sense humor know perfectly well ridiculous may seem result starship trooper excite energetic lively science fiction film fact probably could never get tire watch film starship trooper reminiscient star war another ki


1

In [157]:
from sklearn.model_selection import train_test_split as split
x_train, x_test, y_train, y_test = split(text_docs, categories, test_size=.3, random_state=0)
len(x_train), len(x_test)

(1400, 600)

In [158]:
cv = CountVectorizer(max_features=3000, ngram_range=(1, 3), max_df=.8, min_df=.1)
x_train_count = cv.fit_transform(x_train)
x_train_count.toarray()

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 2, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 2, ..., 0, 0, 0],
       [1, 0, 2, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [189]:
cv.get_feature_names()[: 10]

['able',
 'across',
 'act',
 'action',
 'actor',
 'actually',
 'add',
 'age',
 'almost',
 'alone']

In [159]:
len(cv.get_feature_names())

448

In [160]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

In [162]:
x_test_count = cv.transform(x_test)
x_test_count.toarray()

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 2, ..., 0, 0, 2],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 1]], dtype=int64)

In [163]:
x_test_count_array = x_test_count.toarray()
x_test_count_array.shape

(600, 448)

In [164]:
model.fit(x_train_count, y_train)
model.score(x_test_count_array, y_test)

0.8016666666666666

In [165]:
from sklearn.svm import SVC
model = SVC()

In [166]:
model.fit(x_train_count, y_train)
model.score(x_test_count_array, y_test)

0.7833333333333333

In [168]:
docs = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?',]

In [169]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [170]:
response = tfidf.fit_transform(docs)
response.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [172]:
tfidf.get_feature_names()

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [173]:
tfidf.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

In [174]:
print(response)

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483


In [175]:
feature_names = tfidf.get_feature_names()
for col in response.nonzero()[1]:
    print(feature_names[col],'-', response[0, col])

document - 0.46979138557992045
first - 0.5802858236844359
the - 0.38408524091481483
is - 0.38408524091481483
this - 0.38408524091481483
second - 0.0
document - 0.46979138557992045
the - 0.38408524091481483
is - 0.38408524091481483
this - 0.38408524091481483
one - 0.0
third - 0.0
and - 0.0
the - 0.38408524091481483
is - 0.38408524091481483
this - 0.38408524091481483
document - 0.46979138557992045
first - 0.5802858236844359
the - 0.38408524091481483
is - 0.38408524091481483
this - 0.38408524091481483


In [176]:
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), max_df=.8, min_df=.1)
x_train_count = tfidf.fit_transform(x_train)
x_train_count.toarray()

array([[0.        , 0.        , 0.05861156, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.13944536, ..., 0.        , 0.08506925,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.14565737, ..., 0.        , 0.        ,
        0.        ],
       [0.07301869, 0.        , 0.09950506, ..., 0.        , 0.        ,
        0.05594542],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.04362013]])

In [185]:
model = MultinomialNB()
model.fit(x_train_count, y_train)

MultinomialNB()

In [186]:
x_test_count = tfidf.transform(x_test)

In [187]:
x_test_count_array = x_test_count.toarray()
model.score(x_test_count_array, y_test)

0.7933333333333333