In [1]:
import numpy as np

In [2]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."

sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [3]:
str.split(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [4]:

token_sequence = str.split(sentence)
vocab = sorted(set(token_sequence))
", ".join(vocab)


'26., Jefferson, Monticello, Thomas, age, at, began, building, of, the'

In [5]:
num_tokens=len(token_sequence)   # Get the number of tokens
vocab_size = len(vocab)     # Get the vocabulary size
onehot_vectors = np.zeros((num_tokens, vocab_size), int)   # Create a matrix of zeros with the same shape as the one-hot vectors will be    
for i, word in enumerate(token_sequence):       # Enumerate() method adds a counter to an iterable and returns it in a form of enumerate object. This enumerate object can then be used directly in for loops or be converted into a list of tuples using list() method.
    onehot_vectors[i, vocab.index(word)] = 1
' '.join(vocab)     # Display vocab

'26. Jefferson Monticello Thomas age at began building of the'

In [6]:
onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [7]:
import pandas as pd
pd.DataFrame(onehot_vectors, columns=vocab)

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


In [9]:
df = pd.DataFrame(onehot_vectors, columns=vocab)
df[df == 0] = ''
df

  df[df == 0] = ''


Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,,,,1.0,,,,,,
1,,1.0,,,,,,,,
2,,,,,,,1.0,,,
3,,,,,,,,1.0,,
4,,,1.0,,,,,,,
5,,,,,,1.0,,,,
6,,,,,,,,,,1.0
7,,,,,1.0,,,,,
8,,,,,,,,,1.0,
9,1.0,,,,,,,,,


In [10]:
sentence_bow = {}
for token in sentence.split():
    sentence_bow[token] = 1
sorted(sentence_bow.items())

[('26.', 1),
 ('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1)]

In [11]:
df = pd.DataFrame(pd.Series(dict([(token, 1) for token in sentence.split()])), columns=['sent']).T
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent,1,1,1,1,1,1,1,1,1,1


In [17]:
sentences = "Thomas Jefferson began building Monticello at the age of 26.\n"
sentences += "Construction was done mostly by local masons and carpenters.\n"
sentences += "He moved into the South Pavilion in 1770.\n"
sentences += "Turning Monticello into a neoclassical masterpiece was Jefferson's obsession."
corpus = {}
for i, sent in enumerate(sentences.split('\n')):
    corpus['sent{}'.format(i)] = dict((tok, 1) for tok in sent.split())
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T
df[df.columns[:10]]

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent0,1,1,1,1,1,1,1,1,1,1
sent1,0,0,0,0,0,0,0,0,0,0
sent2,0,0,0,0,0,0,1,0,0,0
sent3,0,0,0,0,1,0,0,0,0,0


### 2.2 Measuring bag-of-words overlap

In [19]:
df = df.T
df.sent0.dot(df.sent1)

0

In [20]:
df.sent0.dot(df.sent2)

1

In [21]:
df.sent0.dot(df.sent3)

1

In [23]:
[(k, v) for (k, v) in (df.sent0 & df.sent3).items()]

[('Thomas', 0),
 ('Jefferson', 0),
 ('began', 0),
 ('building', 0),
 ('Monticello', 1),
 ('at', 0),
 ('the', 0),
 ('age', 0),
 ('of', 0),
 ('26.', 0),
 ('Construction', 0),
 ('was', 0),
 ('done', 0),
 ('mostly', 0),
 ('by', 0),
 ('local', 0),
 ('masons', 0),
 ('and', 0),
 ('carpenters.', 0),
 ('He', 0),
 ('moved', 0),
 ('into', 0),
 ('South', 0),
 ('Pavilion', 0),
 ('in', 0),
 ('1770.', 0),
 ('Turning', 0),
 ('a', 0),
 ('neoclassical', 0),
 ('masterpiece', 0),
 ("Jefferson's", 0),
 ('obsession.', 0)]

### 2.3 A token improvement

In [24]:
import re
sentence = """Thomas Jefferson began building Monticello at the age of 26."""
tokens = re.split(r'[-\s.,;!?]+', sentence)
    # the '+' makes sure that any number of consecutive delimiters are treated as one.
    # r'[\s] is equivalent to r' \t\n\r\x0b\x0c'
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [25]:
pattern = re.compile(r'([-\s.,;!?])+')
tokens = pattern.split(sentence)
tokens[-10:] # just the last 10 tokens

[' ', 'the', ' ', 'age', ' ', 'of', ' ', '26', '.', '']

In [26]:
tokens = pattern.split(sentence)
[x for x in tokens if x and x not in '- \t\n.,;!?']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

You can use the NLTK function  `RegexpTokenizer` to replicate your simple tokenizer example like this:

In [28]:
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

An even better tokenizer is the Treebank Word Tokenizer from the NLTK package. It incorporates a variety of common rules for English word tokenization. For example, it separates phrase-terminating punctuation (?!.;,) from adjacent tokens and retains decimal numbers containing a period as a single token.

In [29]:
from nltk.tokenize import TreebankWordDetokenizer
sentence = "Monticello wasn't designated as UNESCO World Heritage Site until 1987."
tokenizer = TreebankWordDetokenizer()
tokenizer.tokenize(sentence)

"M o n t i c e l l o   w a s n' t   d e s i g n a t e d   a s   U N E S C O   W o r l d   H e r i t a g e   S i t e   u n t i l   1 9 8 7."