In [0]:
import numpy as np
import re

# Bag-of-Words

In [0]:
# Toy corpus for the bag-of-words walkthrough. Adjacent string literals are
# joined by the parser, so no explicit `+` is needed between the pieces.
# NOTE(review): looks like the opening of Turing's 1950 paper - confirm and cite.
text = (
    'I propose to consider the question, "Can machines think?" This should begin with '
    'definitions of the meaning of the terms "machine" and "think." The definitions might be '
    'framed so as to reflect so far as possible the normal use of the words, but this attitude is '
    'dangerous, If the meaning of the words "machine" and "think" are to be found by '
    'examining how they are commonly used it is difficult to escape the conclusion that the '
    'meaning and the answer to the question, "Can machines think?" is to be sought in a '
    'statistical survey such as a Gallup poll. But this is absurd. Instead of attempting such a '
    'definition I shall replace the question by another, which is closely related to it and is '
    'expressed in relatively unambiguous words. '
)

In [3]:
def tokenize(sentence):
    """Split a string into lower-cased word and punctuation tokens.

    Each run of word characters becomes one token; every other
    non-whitespace character becomes its own single-character token.
    Whitespace is dropped.

    Args:
        sentence: The text to split.

    Returns:
        A list of lower-cased tokens in their order of appearance.
    """
    return [piece.lower() for piece in re.findall(r'\w+|\S', sentence)]

# Tokenize the whole passage once; the resulting flat token list is the corpus.
corpus = tokenize(text)
corpus[:12]  # preview the first 12 tokens

['i',
 'propose',
 'to',
 'consider',
 'the',
 'question',
 ',',
 '"',
 'can',
 'machines',
 'think',
 '?']

## Creating the vocabulary of the corpus

In [0]:
# Count how often each token occurs in the corpus.
vocabulary = {}

for token in corpus:
    # A token seen for the first time starts from 0.
    vocabulary[token] = vocabulary.get(token, 0) + 1

In [5]:
# Display the full token -> count mapping (punctuation tokens are counted too).
vocabulary

{'"': 12,
 ',': 5,
 '.': 4,
 '?': 2,
 'a': 3,
 'absurd': 1,
 'and': 4,
 'another': 1,
 'answer': 1,
 'are': 2,
 'as': 3,
 'attempting': 1,
 'attitude': 1,
 'be': 3,
 'begin': 1,
 'but': 2,
 'by': 2,
 'can': 2,
 'closely': 1,
 'commonly': 1,
 'conclusion': 1,
 'consider': 1,
 'dangerous': 1,
 'definition': 1,
 'definitions': 2,
 'difficult': 1,
 'escape': 1,
 'examining': 1,
 'expressed': 1,
 'far': 1,
 'found': 1,
 'framed': 1,
 'gallup': 1,
 'how': 1,
 'i': 2,
 'if': 1,
 'in': 2,
 'instead': 1,
 'is': 6,
 'it': 2,
 'machine': 2,
 'machines': 2,
 'meaning': 3,
 'might': 1,
 'normal': 1,
 'of': 5,
 'poll': 1,
 'possible': 1,
 'propose': 1,
 'question': 3,
 'reflect': 1,
 'related': 1,
 'relatively': 1,
 'replace': 1,
 'shall': 1,
 'should': 1,
 'so': 2,
 'sought': 1,
 'statistical': 1,
 'such': 2,
 'survey': 1,
 'terms': 1,
 'that': 1,
 'the': 13,
 'they': 1,
 'think': 4,
 'this': 3,
 'to': 7,
 'unambiguous': 1,
 'use': 1,
 'used': 1,
 'which': 1,
 'with': 1,
 'words': 3}

In [0]:
# Order the (token, count) pairs from the most to the least frequent token.
# sorted() already yields a list of tuples and is stable, so tokens with equal
# counts keep the order they have in `vocabulary`.
bag_of_words = sorted(vocabulary.items(), key=lambda pair: pair[1], reverse=True)

In [7]:
bag_of_words[:10]  # the 10 most frequent tokens

[('the', 13),
 ('"', 12),
 ('to', 7),
 ('is', 6),
 (',', 5),
 ('of', 5),
 ('think', 4),
 ('and', 4),
 ('.', 4),
 ('question', 3)]

In [0]:
# None keeps every token; an integer N keeps only the N most frequent ones.
max_length_vocabulary = None
# Rank in the frequency-sorted list becomes the token's index.
word_to_index = {
    token: rank
    for rank, (token, _count) in enumerate(bag_of_words[:max_length_vocabulary])
}
# Tokens are unique, so inverting the forward map gives the reverse lookup.
index_to_word = {rank: token for token, rank in word_to_index.items()}

In [9]:
# Display the token -> index mapping; index 0 belongs to the most frequent token.
word_to_index

{'"': 1,
 ',': 4,
 '.': 8,
 '?': 19,
 'a': 15,
 'absurd': 61,
 'and': 7,
 'another': 67,
 'answer': 55,
 'are': 24,
 'as': 13,
 'attempting': 63,
 'attitude': 42,
 'be': 12,
 'begin': 32,
 'but': 23,
 'by': 25,
 'can': 17,
 'closely': 69,
 'commonly': 49,
 'conclusion': 53,
 'consider': 30,
 'dangerous': 43,
 'definition': 64,
 'definitions': 20,
 'difficult': 51,
 'escape': 52,
 'examining': 46,
 'expressed': 71,
 'far': 38,
 'found': 45,
 'framed': 36,
 'gallup': 59,
 'how': 47,
 'i': 16,
 'if': 44,
 'in': 27,
 'instead': 62,
 'is': 3,
 'it': 26,
 'machine': 21,
 'machines': 18,
 'meaning': 11,
 'might': 35,
 'normal': 40,
 'of': 5,
 'poll': 60,
 'possible': 39,
 'propose': 29,
 'question': 9,
 'reflect': 37,
 'related': 70,
 'relatively': 72,
 'replace': 66,
 'shall': 65,
 'should': 31,
 'so': 22,
 'sought': 56,
 'statistical': 57,
 'such': 28,
 'survey': 58,
 'terms': 34,
 'that': 54,
 'the': 0,
 'they': 48,
 'think': 6,
 'this': 10,
 'to': 2,
 'unambiguous': 73,
 'use': 41,
 'used

In [10]:
# Short sample sentence to encode with the hand-built vocabulary.
example = 'Can machine think?'
t_example = tokenize(example)
t_example

['can', 'machine', 'think', '?']

## Vectorizing (One-hot encoding)

In [11]:
# One row per token of the example, one column per vocabulary entry, all zeros to start.
vector = np.zeros((len(t_example), len(word_to_index)))
vector.shape  # == (sentence_length, vocabulary_size)

(4, 74)

In [12]:
# Put a single 1 in each row, at the column given by that token's vocabulary
# index. Rows for tokens missing from the vocabulary stay all-zero.
for row, token in enumerate(t_example):
    column = word_to_index.get(token)
    if column is not None:
        vector[row, column] = 1.

vector

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.

In [13]:
# Map every non-zero one-hot row back to its token; all-zero rows (tokens that
# were not in the vocabulary) are skipped. Each token is followed by a space,
# so the result ends with a trailing space.
decoded_text = ''.join(
    index_to_word[one_hot.argmax()] + ' '
    for one_hot in vector
    if one_hot.sum() != 0
)

decoded_text

'can machine think ? '

## sklearn - CountVectorizer 

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# `corpus` is a flat list of tokens, so every token is handed to the
# vectorizer as a separate one-token document.
bow = CountVectorizer()
bow.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [15]:
# Learned token -> column index mapping of the fitted CountVectorizer.
bow.vocabulary_

{'absurd': 0,
 'and': 1,
 'another': 2,
 'answer': 3,
 'are': 4,
 'as': 5,
 'attempting': 6,
 'attitude': 7,
 'be': 8,
 'begin': 9,
 'but': 10,
 'by': 11,
 'can': 12,
 'closely': 13,
 'commonly': 14,
 'conclusion': 15,
 'consider': 16,
 'dangerous': 17,
 'definition': 18,
 'definitions': 19,
 'difficult': 20,
 'escape': 21,
 'examining': 22,
 'expressed': 23,
 'far': 24,
 'found': 25,
 'framed': 26,
 'gallup': 27,
 'how': 28,
 'if': 29,
 'in': 30,
 'instead': 31,
 'is': 32,
 'it': 33,
 'machine': 34,
 'machines': 35,
 'meaning': 36,
 'might': 37,
 'normal': 38,
 'of': 39,
 'poll': 40,
 'possible': 41,
 'propose': 42,
 'question': 43,
 'reflect': 44,
 'related': 45,
 'relatively': 46,
 'replace': 47,
 'shall': 48,
 'should': 49,
 'so': 50,
 'sought': 51,
 'statistical': 52,
 'such': 53,
 'survey': 54,
 'terms': 55,
 'that': 56,
 'the': 57,
 'they': 58,
 'think': 59,
 'this': 60,
 'to': 61,
 'unambiguous': 62,
 'use': 63,
 'used': 64,
 'which': 65,
 'with': 66,
 'words': 67}

In [16]:
# Smaller than the 74-entry hand-built vocabulary: the default token_pattern
# only keeps tokens made of two or more word characters.
print(f'Length of vocabulary: {len(bow.vocabulary_)}')

Length of vocabulary: 68


In [0]:
# Same sample sentence as in the hand-built example, now encoded with the
# fitted CountVectorizer. Passing the token list makes each token its own
# document, which gives one row per token.
example = 'Can machine think?'
t_example = tokenize(example)
v_example = bow.transform(t_example)

In [18]:
# Sparse result: one row per token, one column per vocabulary entry.
v_example

<4x68 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [19]:
# Convert to a dense array so the individual rows can be inspected.
v_example.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]])

# TF-IDF

In [20]:
import nltk
# Fetch the Gutenberg corpus data; the log reports "already up-to-date" when it is present.
nltk.download('gutenberg')

# Hamlet as a flat sequence of tokens, lower-cased.
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
hamlet = [w.lower() for w in hamlet]

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [21]:
# Corpus size: total number of tokens vs. number of distinct lower-cased tokens.
print(f'# of tokens: {len(hamlet)}')
print(f'# of unique words: {len(set(hamlet))}')

# of tokens: 37360
# of unique words: 4716


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `hamlet` is a list of single tokens, so each token is fitted as its own document.
tfidf = TfidfVectorizer()
tfidf.fit(hamlet)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [23]:
print(f'Length of vocabulary: {len(tfidf.vocabulary_)}')
# `hamlet` is a flat list of lower-cased tokens and the vectorizer treats each
# element as one document, so this is a document (token) count, not a
# sentence count.
print(f'How many documents (one token each) in the corpus: {len(hamlet)}')

Length of vocabulary: 4688
How many sentences in the corpus: 37360


In [24]:
# One TF-IDF row per element of `hamlet`, i.e. per token.
vector_hamlet = tfidf.transform(hamlet)
vector_hamlet.shape  # == (token_count, vocabulary_length)

(37360, 4688)

In [0]:
# Encode a new sentence token by token with the fitted TF-IDF model.
example = 'Blessed are you whose worthiness gives scope, Being had, to triumph; being lacked, to hope.'
t_example = tokenize(example)
# NOTE: this rebinds `vector`, which earlier held the dense one-hot NumPy array.
vector = tfidf.transform(t_example)

In [26]:
# Sparse TF-IDF matrix: one row per token of the example sentence.
vector

<20x4688 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>