-----------------
#### Bag of words model
--------------

In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 10000)

# import plotting libraries
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# For text processing
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer



#### Example 1

In [2]:
texts = [
    "It was the best of times it it  it it" ,
    "it was the worst of times",
    "it was the age of wisdom and lots of wisdom",
    "it was the age of foolishness"
]

In [5]:
# instantiate the count vectorizer
vect_cv = CountVectorizer()

In [6]:
# train (Bow) 
vect_cv.fit(texts)

CountVectorizer()

In [7]:
vect_cv.get_feature_names_out()

array(['age', 'and', 'best', 'foolishness', 'it', 'lots', 'of', 'the',
       'times', 'was', 'wisdom', 'worst'], dtype=object)

In [8]:
# get all the features/tokens
feature_names = vect_cv.get_feature_names_out()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

['age' 'and' 'best' 'foolishness' 'it' 'lots' 'of' 'the' 'times' 'was'
 'wisdom' 'worst']
Vocabulary size: 12


In [9]:
# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

Position Token
       0 age
       1 and
       2 best
       3 foolishness
       4 it
       5 lots
       6 of
       7 the
       8 times
       9 was
      10 wisdom
      11 worst


In [14]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(texts)

In [12]:
X_train_cv_dtm

<4x12 sparse matrix of type '<class 'numpy.int64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [13]:
X_train_cv_dtm.toarray()

array([[0, 0, 1, 0, 5, 0, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
       [1, 1, 0, 0, 1, 1, 2, 1, 0, 1, 2, 0],
       [1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0]], dtype=int64)

In [15]:
# transform new test samples
test_texts = [
    "Pollution is very bad for health" ,
    "Govt not very keen on pollution control measures",
]

In [16]:
# prepare dtm
test_dtm = vect_cv.transform(test_texts)

In [17]:
test_dtm.shape

(2, 12)

In [18]:
test_dtm.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

#### Example 2 (binary representation)

- note the default lowercasing of the tokens
- stop words are not removed

In [19]:
texts = [
    "I love apples. Apples are good for health. An apple a day keeps the doctor away",
    "Play football. It is very exciting. Football is played every where"
]

In [20]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(binary=True)

In [22]:
# train (Bow) 
vect_cv.fit(texts)

CountVectorizer(binary=True)

In [23]:
# get all the features/tokens
feature_names = vect_cv.get_feature_names_out()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

['an' 'apple' 'apples' 'are' 'away' 'day' 'doctor' 'every' 'exciting'
 'football' 'for' 'good' 'health' 'is' 'it' 'keeps' 'love' 'play' 'played'
 'the' 'very' 'where']
Vocabulary size: 22


In [24]:
# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

Position Token
       0 an
       1 apple
       2 apples
       3 are
       4 away
       5 day
       6 doctor
       7 every
       8 exciting
       9 football
      10 for
      11 good
      12 health
      13 is
      14 it
      15 keeps
      16 love
      17 play
      18 played
      19 the
      20 very
      21 where


In [25]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(texts)

In [26]:
X_train_cv_dtm

<2x22 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [27]:
X_train_cv_dtm.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1]],
      dtype=int64)

#### Example 3 - (max_df and min_df)

In [26]:
import random

In [27]:
movie_list = ['3-idiots', 'Joker', 'Petta', 'Kaappaan', 'Kabir', 'Drishtikone']

In [28]:
random.choices(movie_list, k=4)

['Kabir', '3-idiots', 'Joker', 'Petta']

In [29]:
movie_names_arrray = random.choices(movie_list, k=4)
movie_names_arrray

['Kabir', 'Kaappaan', 'Drishtikone', 'Joker']

In [30]:
' '.join(movie_names_arrray)

'Kabir Kaappaan Drishtikone Joker'

In [31]:
movies = []
np.random.seed(100)

for i in range(10):
    movie_names_arrray = random.choices(movie_list, k=4)
    movie_names_str    = ' '.join(movie_names_arrray)
    
    movies.append(movie_names_str)
    
#movies = np.array(movies)
movies

['Petta Kaappaan 3-idiots 3-idiots',
 'Drishtikone Kaappaan Drishtikone Joker',
 'Kaappaan Joker Kabir Petta',
 '3-idiots Joker Kaappaan 3-idiots',
 '3-idiots Drishtikone Kaappaan Kabir',
 'Joker Kaappaan Kabir Kabir',
 '3-idiots Kabir Kaappaan Kabir',
 'Kabir Joker 3-idiots Drishtikone',
 'Joker Kabir 3-idiots 3-idiots',
 '3-idiots Drishtikone Drishtikone Drishtikone']

In [32]:
# instantiate the count vectorizer
vect_cv = CountVectorizer()

In [33]:
# train (Bow) 
vect_cv.fit(movies)

In [35]:
# get all the features/tokens
feature_names = vect_cv.get_feature_names_out()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

['drishtikone' 'idiots' 'joker' 'kaappaan' 'kabir' 'petta']
Vocabulary size: 6


In [36]:
# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

Position Token
       0 drishtikone
       1 idiots
       2 joker
       3 kaappaan
       4 kabir
       5 petta


In [37]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(movies)

In [38]:
X_train_cv_dtm.toarray()

array([[0, 2, 0, 1, 0, 1],
       [2, 0, 1, 1, 0, 0],
       [0, 0, 1, 1, 1, 1],
       [0, 2, 1, 1, 0, 0],
       [1, 1, 0, 1, 1, 0],
       [0, 0, 1, 1, 2, 0],
       [0, 1, 0, 1, 2, 0],
       [1, 1, 1, 0, 1, 0],
       [0, 2, 1, 0, 1, 0],
       [3, 1, 0, 0, 0, 0]], dtype=int64)

#### Document Frequency

In [39]:
import re

In [40]:
number_docs = X_train_cv_dtm.shape[0]

In [41]:
# count how many times a token appears in the corpus
for token in vect_cv.vocabulary_.keys():
    
    counter = 0
    
    # read each document
    for doc in movies:

        # check if the token appears in the document, if YES, increment the counter
        if re.search(token, str(doc), re.IGNORECASE):
            counter +=1
    
    print('{:15s} count = {:3d}, DF = {:7.2f}'.format(token, counter, (counter/number_docs)*100))

petta           count =   2, DF =   20.00
kaappaan        count =   7, DF =   70.00
idiots          count =   7, DF =   70.00
drishtikone     count =   4, DF =   40.00
joker           count =   6, DF =   60.00
kabir           count =   6, DF =   60.00


#### Max_df

In [43]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(max_df=.65)

# train (Bow) 
vect_cv.fit(movies)

# get all the features/tokens
feature_names = vect_cv.get_feature_names_out()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

['drishtikone' 'joker' 'kabir' 'petta']
Vocabulary size: 4
Position Token
       0 drishtikone
       1 joker
       2 kabir
       3 petta


In [44]:
# prepare dtm
X_train_cv_dtm = vect_cv.transform(movies)

X_train_cv_dtm.toarray()

array([[0, 0, 0, 1],
       [2, 1, 0, 0],
       [0, 1, 1, 1],
       [0, 1, 0, 0],
       [1, 0, 1, 0],
       [0, 1, 2, 0],
       [0, 0, 2, 0],
       [1, 1, 1, 0],
       [0, 1, 1, 0],
       [3, 0, 0, 0]], dtype=int64)

#### min_df

In [45]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(max_df=.75, min_df=.30)

# train (Bow) 
vect_cv.fit(movies)

# get all the features/tokens
feature_names = vect_cv.get_feature_names_out()
print(feature_names)

# get count of tokens
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

# print vocab in sorted manner
def get_key(val): 
    for key, value in vect_cv.vocabulary_.items(): 
         if val == value: 
            return key 

print('Position', 'Token')
for v in sorted(vect_cv.vocabulary_.values()) :  
     print('{:8d} {}'.format(v, get_key(v) )) 

['drishtikone' 'idiots' 'joker' 'kaappaan' 'kabir']
Vocabulary size: 5
Position Token
       0 drishtikone
       1 idiots
       2 joker
       3 kaappaan
       4 kabir


#### Example 3 - ngram_range or n-gram

#### What is an n-gram?

An n-gram is a `contiguous sequence` of n __items__ from a given sequence of text. 

Given a sentence, s, we can construct a list of n-grams from s by finding pairs of words that occur next to each other. 

Here an __item__ can be a character, a word or a sentence and N can be any integer. 

- When N is 2, we call the sequence a bigram.
- Similarly, a sequence of 3 items is called a trigram, and so on.

For example, given the sentence “I am Rajat” you can construct bigrams (n-grams of length 2) by finding consecutive pairs of words.

### word grams

In [24]:
s = "I studied DS/ML/DL at IISc"

In [25]:
tokens = s.split(" ")
tokens

['I', 'studied', 'DS/ML/DL', 'at', 'IISc']

In [26]:
bigrams = [(tokens[i],tokens[i+1]) for i in range(0, len(tokens)-1)]
bigrams

[('I', 'studied'), ('studied', 'DS/ML/DL'), ('DS/ML/DL', 'at'), ('at', 'IISc')]

In [27]:
trigrams = [(tokens[i],tokens[i+1],tokens[i+2]) for i in range(0, len(tokens)-2)]
trigrams

[('I', 'studied', 'DS/ML/DL'),
 ('studied', 'DS/ML/DL', 'at'),
 ('DS/ML/DL', 'at', 'IISc')]

#### n-grams Using NLTK

In [28]:
s = "Natural-language processing (NLP) is an area of computer science " \
    "and artificial intelligence concerned with the interactions " \
    "between computers and human (natural) languages. !!!"

In [29]:
s

'Natural-language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages. !!!'

In [30]:
from nltk.util import ngrams

In [32]:
s = s.lower()

In [33]:
s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
s

'natural language processing  nlp  is an area of computer science and artificial intelligence concerned with the interactions between computers and human  natural  languages     '

In [34]:
tokens = [token for token in s.split(" ") if token != ""]
tokens

['natural',
 'language',
 'processing',
 'nlp',
 'is',
 'an',
 'area',
 'of',
 'computer',
 'science',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'natural',
 'languages']

In [35]:
output = list(ngrams(tokens, 2))
output

[('natural', 'language'),
 ('language', 'processing'),
 ('processing', 'nlp'),
 ('nlp', 'is'),
 ('is', 'an'),
 ('an', 'area'),
 ('area', 'of'),
 ('of', 'computer'),
 ('computer', 'science'),
 ('science', 'and'),
 ('and', 'artificial'),
 ('artificial', 'intelligence'),
 ('intelligence', 'concerned'),
 ('concerned', 'with'),
 ('with', 'the'),
 ('the', 'interactions'),
 ('interactions', 'between'),
 ('between', 'computers'),
 ('computers', 'and'),
 ('and', 'human'),
 ('human', 'natural'),
 ('natural', 'languages')]

#### n-grams in vectors for supervised learning problems

In [28]:
texts = [
    "Penny bought bright blue fishes. !! ) %$&#&#**#*",
    "Penny bought bright blue and orange fish.",
    "The cat ate a fish at the store.",
    "Penny went to the store. Penny ate a bug. Penny saw a fish fish .",
    "It meowed once at the bug, it is still meowing at the bug and the fish",
    "The cat is at the fish store. The cat is orange. The cat is meowing at the fish.",
    "Penny is a fish",
    "lets take this sentence for example"
]

In [29]:
# N-grams (sets of consecutive words) N=2
# instantiate the count vectorizer
vect_cv = CountVectorizer(ngram_range=(1, 2))

In [31]:
# train (Bow) 
vect_cv.fit(texts)

CountVectorizer(ngram_range=(1, 2))

In [32]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

feature_names = vect_cv.get_feature_names_out()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_cv.vocabulary_))

Vocabulary size: 78
['and' 'and orange' 'and the' 'at' 'at the' 'ate' 'ate bug' 'ate fish'
 'blue' 'blue and' 'blue fishes' 'bought' 'bought bright' 'bright'
 'bright blue' 'bug' 'bug and' 'bug it' 'bug penny' 'cat' 'cat ate'
 'cat is' 'example' 'fish' 'fish at' 'fish fish' 'fish store' 'fishes'
 'for' 'for example' 'is' 'is at' 'is fish' 'is meowing' 'is orange'
 'is still' 'it' 'it is' 'it meowed' 'lets' 'lets take' 'meowed'
 'meowed once' 'meowing' 'meowing at' 'once' 'once at' 'orange'
 'orange fish' 'orange the' 'penny' 'penny ate' 'penny bought' 'penny is'
 'penny saw' 'penny went' 'saw' 'saw fish' 'sentence' 'sentence for'
 'still' 'still meowing' 'store' 'store penny' 'store the' 'take'
 'take this' 'the' 'the bug' 'the cat' 'the fish' 'the store' 'this'
 'this sentence' 'to' 'to the' 'went' 'went to']
Vocabulary content:
 {'penny': 50, 'bought': 11, 'bright': 13, 'blue': 8, 'fishes': 27, 'penny bought': 52, 'bought bright': 12, 'bright blue': 14, 'blue fishes': 10, 'and': 0, '

#### Example - 4 (Stopwords)

In [33]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(stop_words='english', max_features=None)

In [34]:
# train (Bow) 
vect_cv.fit(texts)

CountVectorizer(stop_words='english')

In [35]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

feature_names = vect_cv.get_feature_names_out()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_cv.vocabulary_))

Vocabulary size: 18
['ate' 'blue' 'bought' 'bright' 'bug' 'cat' 'example' 'fish' 'fishes'
 'lets' 'meowed' 'meowing' 'orange' 'penny' 'saw' 'sentence' 'store'
 'went']
Vocabulary content:
 {'penny': 13, 'bought': 2, 'bright': 3, 'blue': 1, 'fishes': 8, 'orange': 12, 'fish': 7, 'cat': 5, 'ate': 0, 'store': 16, 'went': 17, 'bug': 4, 'saw': 14, 'meowed': 10, 'meowing': 11, 'lets': 9, 'sentence': 15, 'example': 6}


In [67]:
# notice the lack of stemming .. fish and fishes, meowed	meowing

# CountVectorizer can 
# - lowercase letters, 
# - disregard punctuation and 
# - stopwords, 

# but it can't LEMMATIZE or STEM

In [68]:
# create the stemmer object
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem("fish"))
print(porter_stemmer.stem("fishes"))
print(porter_stemmer.stem("meowed"))
print(porter_stemmer.stem("meowing"))

fish
fish
meow
meow


In [69]:
# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [70]:
import re    # regular expression

In [71]:
text_string = 'NLPnlp884848^$^$^$&$$&& : -'

In [72]:
re.sub(r"[^A-Za-z0-9]", "-PUNCT-", text_string)

'NLPnlp884848-PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT--PUNCT-'

In [73]:
# instantiate the count vectorizer
vect_cv = CountVectorizer(ngram_range=(1, 1), stop_words='english', tokenizer=stemming_tokenizer, max_features=None)

In [74]:
# train (Bow) 
vect_cv.fit(texts)



In [75]:
# get all the feature/token names
print("Vocabulary size: {}".format(len(vect_cv.vocabulary_)))

feature_names = vect_cv.get_feature_names_out()
print(feature_names)

print("Vocabulary content:\n {}".format(vect_cv.vocabulary_))

Vocabulary size: 18
['ate' 'blue' 'bought' 'bright' 'bug' 'cat' 'exampl' 'fish' 'let' 'meow'
 'onc' 'orang' 'penni' 'saw' 'sentenc' 'store' 'thi' 'went']
Vocabulary content:
 {'penni': 12, 'bought': 2, 'bright': 3, 'blue': 1, 'fish': 7, 'orang': 11, 'cat': 5, 'ate': 0, 'store': 15, 'went': 17, 'bug': 4, 'saw': 13, 'meow': 9, 'onc': 10, 'let': 8, 'thi': 16, 'sentenc': 14, 'exampl': 6}
