# Bag of words model

In [25]:
import time
import random
from math import *
import operator
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 10000)

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from matplotlib import style
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# For text processing
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


## Example 1

In [26]:
texts = [
    "It was the best of times it it  it it" ,
    "it was the worst of times",
    "it was the age of wisdom and lots of wisdom",
    "it was the age of foolishness"
]

In [27]:
# instantiate the count vectorizer
vect_cv = CountVectorizer()

In [28]:
# train (Bow) 
vect_cv.fit(texts)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [29]:
# get all the features/tokens
feature_names = vect_cv.get_feature_names()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

['age', 'and', 'best', 'foolishness', 'it', 'lots', 'of', 'the', 'times', 'was', 'wisdom', 'worst']
Vocabulary size: 12


In [30]:
# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

Position Token
       0 age
       1 and
       2 best
       3 foolishness
       4 it
       5 lots
       6 of
       7 the
       8 times
       9 was
      10 wisdom
      11 worst


In [31]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(texts)

In [32]:
X_train_cv_dtm

<4x12 sparse matrix of type '<class 'numpy.int64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [33]:
X_train_cv_dtm.toarray()

array([[0, 0, 1, 0, 5, 0, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
       [1, 1, 0, 0, 1, 1, 2, 1, 0, 1, 2, 0],
       [1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0]], dtype=int64)

In [34]:
# transform new test samples
test_texts = [
    "Pollution is very bad for health" ,
    "Govt not very keen on pollution control measures",
]

In [35]:
# prepare dtm
test_dtm = vect_cv.transform(test_texts)

In [36]:
test_dtm.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

## Example 2 (binary representation)

- note the default lowercasing of the tokens
- stop words are not removed

In [37]:
texts = [
    "I love apples. Apples are good for health. An apple a day keeps the doctor away",
    "Play football. It is very exciting. Football is played every where"
]

In [38]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(binary=True)

In [39]:
# train (Bow) 
vect_cv.fit(texts)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [40]:
# get all the features/tokens
feature_names = vect_cv.get_feature_names()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

['an', 'apple', 'apples', 'are', 'away', 'day', 'doctor', 'every', 'exciting', 'football', 'for', 'good', 'health', 'is', 'it', 'keeps', 'love', 'play', 'played', 'the', 'very', 'where']
Vocabulary size: 22


In [41]:
# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

Position Token
       0 an
       1 apple
       2 apples
       3 are
       4 away
       5 day
       6 doctor
       7 every
       8 exciting
       9 football
      10 for
      11 good
      12 health
      13 is
      14 it
      15 keeps
      16 love
      17 play
      18 played
      19 the
      20 very
      21 where


In [42]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(texts)

In [43]:
X_train_cv_dtm

<2x22 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [44]:
X_train_cv_dtm.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1]],
      dtype=int64)

## Example 3 - (max_df and min_df)

In [45]:
movie_list = ['3-idiots', 'Joker', 'Petta', 'Kaappaan', 'Kabir', 'Drishtikone']

In [47]:
movies = []
np.random.seed(100)

for i in range(10):
    movie_names_arrray = random.choices(movie_list, k=4)
    movie_names_str    = ' '.join(movie_names_arrray)
    
    movies.append(movie_names_str)
    
#movies = np.array(movies)
movies

['3-idiots Kaappaan Joker Petta',
 '3-idiots Joker Petta Drishtikone',
 'Petta Kaappaan Kaappaan Kabir',
 'Kabir Kaappaan 3-idiots Kaappaan',
 'Petta 3-idiots Kabir Kaappaan',
 'Kabir 3-idiots Drishtikone Kaappaan',
 'Joker Kabir Kabir Kabir',
 'Petta Drishtikone Joker Kabir',
 'Petta Joker Kaappaan 3-idiots',
 '3-idiots Drishtikone Kabir Kabir']

In [48]:
# instantiate the count vectorizer
vect_cv = CountVectorizer()

In [49]:
# train (Bow) 
vect_cv.fit(movies)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [50]:
# get all the features/tokens
feature_names = vect_cv.get_feature_names()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

['drishtikone', 'idiots', 'joker', 'kaappaan', 'kabir', 'petta']
Vocabulary size: 6


In [51]:
# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

Position Token
       0 drishtikone
       1 idiots
       2 joker
       3 kaappaan
       4 kabir
       5 petta


In [52]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(movies)

In [53]:
X_train_cv_dtm.toarray()

array([[0, 1, 1, 1, 0, 1],
       [1, 1, 1, 0, 0, 1],
       [0, 0, 0, 2, 1, 1],
       [0, 1, 0, 2, 1, 0],
       [0, 1, 0, 1, 1, 1],
       [1, 1, 0, 1, 1, 0],
       [0, 0, 1, 0, 3, 0],
       [1, 0, 1, 0, 1, 1],
       [0, 1, 1, 1, 0, 1],
       [1, 1, 0, 0, 2, 0]], dtype=int64)

### Document Frequency

In [54]:
import re

In [55]:
number_docs = X_train_cv_dtm.shape[0]

In [56]:
# count how many times a token appears in the corpus
for token in vect_cv.vocabulary_.keys():
    
    counter = 0
    
    # read each document
    for doc in movies:

        # check if the token appears in the document, if YES, increment the counter
        if re.search(token, str(doc), re.IGNORECASE):
            counter +=1
    
    print('{:15s} count = {:3d}, DF = {:7.2f}'.format(token, counter, (counter/number_docs)*100))

idiots          count =   7, DF =   70.00
kaappaan        count =   6, DF =   60.00
joker           count =   5, DF =   50.00
petta           count =   6, DF =   60.00
drishtikone     count =   4, DF =   40.00
kabir           count =   7, DF =   70.00


#### Max_df

In [57]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(max_df=.65)

# train (Bow) 
vect_cv.fit(movies)

# get all the features/tokens
feature_names = vect_cv.get_feature_names()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

['drishtikone', 'joker', 'kaappaan', 'petta']
Vocabulary size: 4
Position Token
       0 drishtikone
       1 joker
       2 kaappaan
       3 petta


In [173]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(movies)

X_train_cv_dtm.toarray()

array([[0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0],
       [1, 0, 0, 2, 0],
       [0, 1, 1, 0, 0],
       [0, 1, 0, 1, 1],
       [0, 0, 2, 0, 1],
       [0, 0, 1, 1, 1],
       [0, 0, 1, 1, 1],
       [0, 0, 0, 0, 1],
       [1, 0, 0, 2, 0]], dtype=int64)

#### min_df

In [59]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(max_df=.75, min_df=.30)

# train (Bow) 
vect_cv.fit(movies)

# get all the features/tokens
feature_names = vect_cv.get_feature_names()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

['drishtikone', 'idiots', 'joker', 'kaappaan', 'kabir', 'petta']
Vocabulary size: 6
Position Token
       0 drishtikone
       1 idiots
       2 joker
       3 kaappaan
       4 kabir
       5 petta


## Example 3 - ngram_range or n-gram

#### What is an n-gram?

An n-gram is a contiguous sequence of n __items__ from a given sequence of text. 

Given a sentence, s, we can construct a list of n-grams from s by finding pairs of words that occur next to each other. 

Here an __item__ can be a character, a word or a sentence and N can be any integer. 

- When N is 2, we call the sequence a bigram.
- Similarly, a sequence of 3 items is called a trigram, and so on.

For example, given the sentence “I am Rajat” you can construct bigrams (n-grams of length 2) by finding consecutive pairs of words.

### word grams

In [205]:
s = "I studied DS/ML/DL at IISc"

In [207]:
tokens = s.split(" ")
tokens

['I', 'studied', 'DS/ML/DL', 'at', 'IISc']

In [199]:
bigrams = [(tokens[i],tokens[i+1]) for i in range(0, len(tokens)-1)]
bigrams

[('I', 'studied'), ('studied', 'DS/ML/DL'), ('DS/ML/DL', 'at'), ('at', 'IISc')]

In [200]:
trigrams = [(tokens[i],tokens[i+1],tokens[i+2]) for i in range(0, len(tokens)-2)]
trigrams

[('I', 'studied', 'DS/ML/DL'),
 ('studied', 'DS/ML/DL', 'at'),
 ('DS/ML/DL', 'at', 'IISc')]

### character grams

In [201]:
tokens = [ch for ch in s]

In [202]:
bigrams = [(tokens[i],tokens[i+1]) for i in range(0, len(tokens)-1)]
bigrams

[('I', ' '),
 (' ', 's'),
 ('s', 't'),
 ('t', 'u'),
 ('u', 'd'),
 ('d', 'i'),
 ('i', 'e'),
 ('e', 'd'),
 ('d', ' '),
 (' ', 'D'),
 ('D', 'S'),
 ('S', '/'),
 ('/', 'M'),
 ('M', 'L'),
 ('L', '/'),
 ('/', 'D'),
 ('D', 'L'),
 ('L', ' '),
 (' ', 'a'),
 ('a', 't'),
 ('t', ' '),
 (' ', 'I'),
 ('I', 'I'),
 ('I', 'S'),
 ('S', 'c')]

In [203]:
trigrams = [(tokens[i],tokens[i+1],tokens[i+2]) for i in range(0, len(tokens)-2)]
trigrams

[('I', ' ', 's'),
 (' ', 's', 't'),
 ('s', 't', 'u'),
 ('t', 'u', 'd'),
 ('u', 'd', 'i'),
 ('d', 'i', 'e'),
 ('i', 'e', 'd'),
 ('e', 'd', ' '),
 ('d', ' ', 'D'),
 (' ', 'D', 'S'),
 ('D', 'S', '/'),
 ('S', '/', 'M'),
 ('/', 'M', 'L'),
 ('M', 'L', '/'),
 ('L', '/', 'D'),
 ('/', 'D', 'L'),
 ('D', 'L', ' '),
 ('L', ' ', 'a'),
 (' ', 'a', 't'),
 ('a', 't', ' '),
 ('t', ' ', 'I'),
 (' ', 'I', 'I'),
 ('I', 'I', 'S'),
 ('I', 'S', 'c')]

### sentence level n-gram

In [60]:
tokens = ['I studied DS/ML/DL at IISc',
          'IISc is a great place to study',
          'IISc courses are cheap too',
          'course duration is 5 months'
         ]

In [61]:
bigrams = [(tokens[i],tokens[i+1]) for i in range(0, len(tokens)-1)]
bigrams
    

[('I studied DS/ML/DL at IISc', 'IISc is a great place to study'),
 ('IISc is a great place to study', 'IISc courses are cheap too'),
 ('IISc courses are cheap too', 'course duration is 5 months')]

## Characters N-Grams Model - with a dataset

In [62]:
import nltk
import numpy as np
import random
import string

import bs4 as bs
import urllib.request
import re

We will be using the Beautifulsoup4 library to parse the data from Wikipedia. Furthermore, Python's regex library, re, will be used for some preprocessing tasks on the text.

In [63]:
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/Tennis')
raw_html = raw_html.read()

In [64]:
article_html       = bs.BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')

article_text = ''

for para in article_paragraphs:
    article_text += para.text

article_text = article_text.lower()

we remove everything from our dataset __except letters, periods, and spaces_

In [65]:
article_text = re.sub(r'[^A-Za-z. ]', '', article_text)

In [66]:
len(article_text)

57445

In [67]:
article_text[:500]

'tennis is a racket sport that can be played individually against a single opponent singles or between two teams of two players each doubles. each player uses a tennis racket that is strung with cord to strike a hollow rubber ball covered with felt over or around a net and into the opponents court. the object of the game is to maneuver the ball in such a way that the opponent is not able to play a valid return. the player who is unable to return the ball will not gain a point while the opposite p'

#### create trigrams

create a dictionary ngrams. The keys of this dictionary will be the character trigrams in our corpus and the values will be the characters that occur next to the trigrams.

In [68]:
# choose a small example text
article_text = 'I studied Data Science at IISc in 2010'

In [69]:
ngrams = {}
chars  = 3

for i in range(len(article_text) - chars):
    
    seq = article_text[i:i+chars]
    #print(seq)
    
    if seq not in ngrams.keys():
        ngrams[seq] = []
        
    ngrams[seq].append(article_text[i+chars])

In [70]:
i = 1
for k, v in ngrams.items():
    
    if i> 5: 
        break
        
    print(k, v)

    i +=1 
    

I s ['t']
 st ['u']
stu ['d']
tud ['i']
udi ['e']


You can see __trigrams as keys__, and the corresponding characters, which occur after the trigrams in the text, as __values__.

#### Predict 

In [311]:
#curr_sequence = article_text[0:chars]
curr_sequence = 'stu'
output        = curr_sequence

for i in range(12):
    
    if curr_sequence not in ngrams.keys():
        break
        
    possible_chars = ngrams[curr_sequence]
    next_char      = possible_chars[random.randrange(len(possible_chars))]
    
    output        += next_char
    
    curr_sequence = output[ len(output) - chars:len(output) ]

print(output)

studied Data Sc


## Words N-Grams Model

first create a dictionary that contains word trigrams as keys and the list of words that occur after the trigrams as values.

In [315]:
# choose a small example text
article_text = 'I studied Data Science at IISc in 2010'

In [316]:
ngrams = {}
words  = 3

words_tokens = nltk.word_tokenize(article_text)

for i in range(len(words_tokens) - words):
    
    seq = ' '.join(words_tokens[i:i+words])
    
    print(seq)
    
    if  seq not in ngrams.keys():
        ngrams[seq] = []
        
    ngrams[seq].append(words_tokens[i+words])

I studied Data
studied Data Science
Data Science at
Science at IISc
at IISc in


In [317]:
ngrams

{'I studied Data': ['Science'],
 'studied Data Science': ['at'],
 'Data Science at': ['IISc'],
 'Science at IISc': ['in'],
 'at IISc in': ['2010']}

#### predict

In [319]:
#curr_sequence = ' '.join(words_tokens[0:words])
curr_sequence = 'Science at IISc'

output        = curr_sequence

for i in range(50):
    if curr_sequence not in ngrams.keys():
        break
        
    possible_words = ngrams[curr_sequence]
    next_word      = possible_words[random.randrange(len(possible_words))]
    
    output        += ' ' + next_word
    
    seq_words = nltk.word_tokenize(output)
    curr_sequence = ' '.join(seq_words[len(seq_words)-words:len(seq_words)])

print(output)

Science at IISc in 2010


## n-grams Using NLTK

In [326]:
s = "Natural-language processing (NLP) is an area of computer science " \
    "and artificial intelligence concerned with the interactions " \
    "between computers and human (natural) languages. !!!"

In [327]:
s

'Natural-language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages. !!!'

In [328]:
from nltk.util import ngrams

In [329]:
s = s.lower()

In [330]:
s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
s

'natural language processing  nlp  is an area of computer science and artificial intelligence concerned with the interactions between computers and human  natural  languages     '

In [332]:
tokens = [token for token in s.split(" ") if token != ""]
tokens

['natural',
 'language',
 'processing',
 'nlp',
 'is',
 'an',
 'area',
 'of',
 'computer',
 'science',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'natural',
 'languages']

In [334]:
output = list(ngrams(tokens, 5))
output

[('natural', 'language', 'processing', 'nlp', 'is'),
 ('language', 'processing', 'nlp', 'is', 'an'),
 ('processing', 'nlp', 'is', 'an', 'area'),
 ('nlp', 'is', 'an', 'area', 'of'),
 ('is', 'an', 'area', 'of', 'computer'),
 ('an', 'area', 'of', 'computer', 'science'),
 ('area', 'of', 'computer', 'science', 'and'),
 ('of', 'computer', 'science', 'and', 'artificial'),
 ('computer', 'science', 'and', 'artificial', 'intelligence'),
 ('science', 'and', 'artificial', 'intelligence', 'concerned'),
 ('and', 'artificial', 'intelligence', 'concerned', 'with'),
 ('artificial', 'intelligence', 'concerned', 'with', 'the'),
 ('intelligence', 'concerned', 'with', 'the', 'interactions'),
 ('concerned', 'with', 'the', 'interactions', 'between'),
 ('with', 'the', 'interactions', 'between', 'computers'),
 ('the', 'interactions', 'between', 'computers', 'and'),
 ('interactions', 'between', 'computers', 'and', 'human'),
 ('between', 'computers', 'and', 'human', 'natural'),
 ('computers', 'and', 'human', 'na

## n-grams in vectors for supervised learning problems

In [345]:
texts = [
    "Penny bought bright blue fishes. !! ) %$&#&#**#*",
    "Penny bought bright blue and orange fish.",
    "The cat ate a fish at the store.",
    "Penny went to the store. Penny ate a bug. Penny saw a fish fish .",
    "It meowed once at the bug, it is still meowing at the bug and the fish",
    "The cat is at the fish store. The cat is orange. The cat is meowing at the fish.",
    "Penny is a fish",
    "lets take this sentence for example"
]

In [72]:
# N-grams (sets of consecutive words) N=2
# instantiate the count vectorizer
vect_cv = CountVectorizer(ngram_range=(1, 2))

In [347]:
# train (Bow) 
vect_cv.fit(texts)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [348]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

feature_names = vect_cv.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_cv.vocabulary_))

Vocabulary size: 78
['and', 'and orange', 'and the', 'at', 'at the', 'ate', 'ate bug', 'ate fish', 'blue', 'blue and', 'blue fishes', 'bought', 'bought bright', 'bright', 'bright blue', 'bug', 'bug and', 'bug it', 'bug penny', 'cat', 'cat ate', 'cat is', 'example', 'fish', 'fish at', 'fish fish', 'fish store', 'fishes', 'for', 'for example', 'is', 'is at', 'is fish', 'is meowing', 'is orange', 'is still', 'it', 'it is', 'it meowed', 'lets', 'lets take', 'meowed', 'meowed once', 'meowing', 'meowing at', 'once', 'once at', 'orange', 'orange fish', 'orange the', 'penny', 'penny ate', 'penny bought', 'penny is', 'penny saw', 'penny went', 'saw', 'saw fish', 'sentence', 'sentence for', 'still', 'still meowing', 'store', 'store penny', 'store the', 'take', 'take this', 'the', 'the bug', 'the cat', 'the fish', 'the store', 'this', 'this sentence', 'to', 'to the', 'went', 'went to']
Vocabulary content:
 {'penny': 50, 'bought': 11, 'bright': 13, 'blue': 8, 'fishes': 27, 'penny bought': 52, 'bou

In [349]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(texts)

In [350]:
# create a dataframe
pd.DataFrame(X_train_cv_dtm.toarray(), columns=feature_names)

Unnamed: 0,and,and orange,and the,at,at the,ate,ate bug,ate fish,blue,blue and,blue fishes,bought,bought bright,bright,bright blue,bug,bug and,bug it,bug penny,cat,cat ate,cat is,example,fish,fish at,fish fish,fish store,fishes,for,for example,is,is at,is fish,is meowing,is orange,is still,it,it is,it meowed,lets,lets take,meowed,meowed once,meowing,meowing at,once,once at,orange,orange fish,orange the,penny,penny ate,penny bought,penny is,penny saw,penny went,saw,saw fish,sentence,sentence for,still,still meowing,store,store penny,store the,take,take this,the,the bug,the cat,the fish,the store,this,this sentence,to,to the,went,went to
0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,1,0,1,0,0,0,0,0,0
3,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,0,0,1,1,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,1,1,1,1
4,1,0,1,2,2,0,0,0,0,0,0,0,0,0,0,2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,2,1,1,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3,2,0,1,0,0,0,0,0,0,0
5,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,3,0,2,0,0,1,0,0,0,3,1,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,5,0,3,2,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0


## Example - 4 (Stopwords)

In [73]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(stop_words='english', max_features=None)

In [74]:
# train (Bow) 
vect_cv.fit(texts)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [75]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

feature_names = vect_cv.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_cv.vocabulary_))

Vocabulary size: 13
['apple', 'apples', 'away', 'day', 'doctor', 'exciting', 'football', 'good', 'health', 'keeps', 'love', 'play', 'played']
Vocabulary content:
 {'love': 10, 'apples': 1, 'good': 7, 'health': 8, 'apple': 0, 'day': 3, 'keeps': 9, 'doctor': 4, 'away': 2, 'play': 11, 'football': 6, 'exciting': 5, 'played': 12}


In [354]:
# notice the lack of stemming .. fish and fishes, meowed	meowing

# CountVectorizer can 
# - lowercase letters, 
# - disregard punctuation and 
# - stopwords, 

# but it can't LEMMATIZE or STEM

In [355]:
# create the stemmer object
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem("fish"))
print(porter_stemmer.stem("fishes"))
print(porter_stemmer.stem("meowed"))
print(porter_stemmer.stem("meowing"))

fish
fish
meow
meow


In [356]:
# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [357]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(ngram_range=(1, 1), stop_words='english', tokenizer=stemming_tokenizer, max_features=None)

In [358]:
# train (Bow) 
vect_cv.fit(texts)

  'stop_words.' % sorted(inconsistent))


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function stemming_tokenizer at 0x0000017CC5AE09D8>,
        vocabulary=None)

In [359]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

feature_names = vect_cv.get_feature_names()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_cv.vocabulary_))

Vocabulary size: 18
['ate', 'blue', 'bought', 'bright', 'bug', 'cat', 'exampl', 'fish', 'let', 'meow', 'onc', 'orang', 'penni', 'saw', 'sentenc', 'store', 'thi', 'went']
Vocabulary content:
 {'penni': 12, 'bought': 2, 'bright': 3, 'blue': 1, 'fish': 7, 'orang': 11, 'cat': 5, 'ate': 0, 'store': 15, 'went': 17, 'bug': 4, 'saw': 13, 'meow': 9, 'onc': 10, 'let': 8, 'thi': 16, 'sentenc': 14, 'exampl': 6}


In [360]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(texts)

In [361]:
# create a dataframe
pd.DataFrame(X_train_cv_dtm.toarray(), columns=feature_names)

Unnamed: 0,ate,blue,bought,bright,bug,cat,exampl,fish,let,meow,onc,orang,penni,saw,sentenc,store,thi,went
0,0,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0
1,0,1,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0
2,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,2,0,0,0,0,3,1,0,1,0,1
4,0,0,0,0,2,0,0,1,0,2,1,0,0,0,0,0,0,0
5,0,0,0,0,0,3,0,2,0,1,0,1,0,0,0,1,0,0
6,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0
