In [1]:
import numpy as np
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

### Corpus: list of documents
    ['This roller coaster is too, too rickety.', 
    'Roller coasters give me hives.', 
    'That bee hive is approaching at an alarming speed.']
### Term frequency
$$TF_{word,document} = \frac{\#\_of\_times\_word\_appears\_in\_document}{total\_\#\_of\_words\_in\_document}$$
### Inverse document frequency
$$ IDF_{word} = \log\left(\frac{total\_\#\_of\_documents}{\#\_of\_documents\_containing\_word}\right) $$

In [5]:
corpus = ['This roller coaster is too, too rickety.', 
    'Roller coasters give me hives.', 
    'That bee hive is approaching at an alarming speed.']

# tokenized_doc = [doc.lower().split() for doc in corpus]
# tokenized_doc = [word_tokenize(doc.lower()) for doc in corpus]

def our_tokenizer(doc):
    '''
    INPUT: string (one document)
    OUTPUT: list of strings (lowercased word tokens, punctuation removed)
    '''
    punctuation = set(string.punctuation)
    # Drop every token made up entirely of punctuation characters.
    # `tok not in string.punctuation` would be a substring test, which lets
    # multi-character tokens such as '``', "''" or '...' through as "words".
    return [tok for tok in word_tokenize(doc.lower())
            if not punctuation.issuperset(tok)]

tokenized_docs = [our_tokenizer(doc) for doc in corpus]
tokenized_docs

[['this', 'roller', 'coaster', 'is', 'too', 'too', 'rickety'],
 ['roller', 'coasters', 'give', 'me', 'hives'],
 ['that', 'bee', 'hive', 'is', 'approaching', 'at', 'an', 'alarming', 'speed']]

In [6]:
# Vocabulary: every distinct token seen anywhere in the corpus, alphabetised
vocab = sorted({token for tokens in tokenized_docs for token in tokens})
vocab

['alarming',
 'an',
 'approaching',
 'at',
 'bee',
 'coaster',
 'coasters',
 'give',
 'hive',
 'hives',
 'is',
 'me',
 'rickety',
 'roller',
 'speed',
 'that',
 'this',
 'too']

In [7]:
Counter(tokenized_docs[0])

Counter({'coaster': 1,
         'is': 1,
         'rickety': 1,
         'roller': 1,
         'this': 1,
         'too': 2})

In [9]:
# Count vector for the first document: one slot per vocabulary term,
# holding how many times that term occurs in the document.
first_doc_counts = Counter(tokenized_docs[0])
doc_vec = np.zeros(len(vocab))
for ind, term in enumerate(vocab):
    doc_vec[ind] = first_doc_counts[term]  # a Counter yields 0 for absent terms
doc_vec

array([ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,
        1.,  0.,  0.,  1.,  2.])

## sklearn

In [12]:
vect = TfidfVectorizer(stop_words='english')
# fit() returns the vectorizer itself; fit_transform() also returns the
# document-term matrix, which is what the name `vector_matrix` promises.
vector_matrix = vect.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert it to keep the plain-list display.
vect.get_feature_names_out().tolist()

['alarming',
 'approaching',
 'bee',
 'coaster',
 'coasters',
 'hive',
 'hives',
 'rickety',
 'roller',
 'speed']

In [13]:
[word_tokenize(doc) for doc in corpus]

[['This', 'roller', 'coaster', 'is', 'too', ',', 'too', 'rickety', '.'],
 ['Roller', 'coasters', 'give', 'me', 'hives', '.'],
 ['That',
  'bee',
  'hive',
  'is',
  'approaching',
  'at',
  'an',
  'alarming',
  'speed',
  '.']]

In [15]:
def my_tokenizer(doc, lemmatizer=WordNetLemmatizer(), stopwords=None):
    '''
    INPUT: doc: string (one document)
           lemmatizer: object with a .lemmatize(token) method;
                       pass None to skip lemmatization
           stopwords: collection of lowercase tokens to drop;
                      None keeps every token
    OUTPUT: list of strings (lowercased tokens, punctuation removed)
    '''
    punctuation = set(string.punctuation)
    # Drop tokens made up entirely of punctuation characters.
    # `t not in string.punctuation` would be a substring test, which lets
    # multi-character tokens such as '``', "''" or '...' through as "words".
    tokens = [t.lower() for t in word_tokenize(doc)
              if not punctuation.issuperset(t)]
    if stopwords:
        # Filter on the surface form first: lemmatizing can turn a stopword
        # into a string that is no longer in the stopword collection.
        tokens = [t for t in tokens if t not in stopwords]
    if lemmatizer:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
        if stopwords:
            # A lemma can itself be a stopword, so filter once more.
            tokens = [t for t in tokens if t not in stopwords]
    return tokens

print([my_tokenizer(doc, lemmatizer=None) for doc in corpus])
print()
print([my_tokenizer(doc) for doc in corpus])


[['this', 'roller', 'coaster', 'is', 'too', 'too', 'rickety'], ['roller', 'coasters', 'give', 'me', 'hives'], ['that', 'bee', 'hive', 'is', 'approaching', 'at', 'an', 'alarming', 'speed']]

[['this', 'roller', 'coaster', 'is', 'too', 'too', 'rickety'], ['roller', 'coaster', 'give', 'me', 'hive'], ['that', 'bee', 'hive', 'is', 'approaching', 'at', 'an', 'alarming', 'speed']]


In [17]:
vect = TfidfVectorizer(stop_words='english', tokenizer=my_tokenizer)
vector_matrix = vect.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert it to keep the plain-list printout.
print(vect.get_feature_names_out().tolist())
print('---')
print(vect.vocabulary_)

['alarming', 'approaching', 'bee', 'coaster', 'hive', 'rickety', 'roller', 'speed']
---
{'roller': 6, 'coaster': 3, 'rickety': 5, 'hive': 4, 'bee': 2, 'approaching': 1, 'alarming': 0, 'speed': 7}


In [21]:
print(vector_matrix)
vector_matrix

  (0, 6)	0.517856116168
  (0, 3)	0.517856116168
  (0, 5)	0.680918560399
  (1, 6)	0.57735026919
  (1, 3)	0.57735026919
  (1, 4)	0.57735026919
  (2, 4)	0.35543246785
  (2, 2)	0.467350981811
  (2, 1)	0.467350981811
  (2, 0)	0.467350981811
  (2, 7)	0.467350981811


<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

## TF-IDF matrix:


In [22]:
print (vector_matrix.toarray())

[[ 0.          0.          0.          0.51785612  0.          0.68091856
   0.51785612  0.        ]
 [ 0.          0.          0.          0.57735027  0.57735027  0.
   0.57735027  0.        ]
 [ 0.46735098  0.46735098  0.46735098  0.          0.35543247  0.          0.
   0.46735098]]


In [24]:
print(vector_matrix.toarray()[1])

[ 0.          0.          0.          0.57735027  0.57735027  0.
  0.57735027  0.        ]


In [25]:
squareform(pdist(vector_matrix.toarray(), metric='cosine'))

array([[ 0.        ,  0.40203126,  1.        ],
       [ 0.40203126,  0.        ,  0.79479097],
       [ 1.        ,  0.79479097,  0.        ]])

## TF-IDF example: a villanelle

"The highly structured villanelle is a nineteen-line poem with two repeating rhymes and two refrains. The form is made up of five tercets followed by a quatrain. The first and third lines of the opening tercet are repeated alternately in the last lines of the succeeding stanzas; then in the final stanza, the refrain serves as the poem’s two concluding lines. Using capitals for the refrains and lowercase letters for the rhymes, the form could be expressed as: A1 b A2 / a b A1 / a b A2 / a b A1 / a b A2 / a b A1 A2."

In [26]:
raw_poem = '''Do not go gentle into that good night,
Old age should burn and rave at close of day;
Rage, rage against the dying of the light.

Though wise men at their end know dark is right,
Because their words had forked no lightning they
Do not go gentle into that good night.

Good men, the last wave by, crying how bright
Their frail deeds might have danced in a green bay,
Rage, rage against the dying of the light.

Wild men who caught and sang the sun in flight,
And learn, too late, they grieved it on its way,
Do not go gentle into that good night.

Grave men, near death, who see with blinding sight
Blind eyes could blaze like meteors and be gay,
Rage, rage against the dying of the light.

And you, my father, there on the sad height,
Curse, bless, me now with your fierce tears, I pray.
Do not go gentle into that good night.
Rage, rage against the dying of the light.'''

Let's treat each line as a document and the poem as our corpus, then vectorize this baby into a matrix that would make Dylan Thomas proud (sorry).


In [29]:
# One document per verse line; the empty strings left by the blank
# separators between stanzas are filtered out.
lines = list(filter(None, raw_poem.split('\n')))
lines

['Do not go gentle into that good night,',
 'Old age should burn and rave at close of day;',
 'Rage, rage against the dying of the light.',
 'Though wise men at their end know dark is right,',
 'Because their words had forked no lightning they',
 'Do not go gentle into that good night.',
 'Good men, the last wave by, crying how bright',
 'Their frail deeds might have danced in a green bay,',
 'Rage, rage against the dying of the light.',
 'Wild men who caught and sang the sun in flight,',
 'And learn, too late, they grieved it on its way,',
 'Do not go gentle into that good night.',
 'Grave men, near death, who see with blinding sight',
 'Blind eyes could blaze like meteors and be gay,',
 'Rage, rage against the dying of the light.',
 'And you, my father, there on the sad height,',
 'Curse, bless, me now with your fierce tears, I pray.',
 'Do not go gentle into that good night.',
 'Rage, rage against the dying of the light.']

In [31]:
stops = nltk.corpus.stopwords.words('english')
stops[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [35]:
print (my_tokenizer(lines[7], lemmatizer=None))
print (my_tokenizer(lines[7]))
print (my_tokenizer(lines[7], lemmatizer=None, stopwords=set(stops)))
print (my_tokenizer(lines[7], stopwords=set(stops)))

['their', 'frail', 'deeds', 'might', 'have', 'danced', 'in', 'a', 'green', 'bay']
['their', 'frail', 'deed', 'might', 'have', 'danced', 'in', 'a', 'green', 'bay']
['frail', 'deeds', 'might', 'danced', 'green', 'bay']
['frail', 'deed', 'might', 'danced', 'green', 'bay']


In [59]:
def my_vectorizer(corpus, stopwords=None, lemmatizer=None, kind='count'):
    '''
    Vectorize a corpus as raw term counts or as TF-IDF weights.

    INPUT: corpus: list of strings (documents)
           stopwords: collection of tokens to drop, passed to my_tokenizer
                      (None keeps every token)
           lemmatizer: object with a .lemmatize(token) method, passed to
                       my_tokenizer (None skips lemmatization)
           kind: 'count' -> number of times each token occurs in a document
                 'tfidf' -> (count / tokens in document) * log(N / df), where
                            N is the number of documents and df the number of
                            documents containing the token
                 any other value gives the same result as 'count'
    OUTPUT: 2D numpy array (vector matrix, one row per document),
            1D numpy array (sorted vocabulary),
            dictionary (keys: vocab words, values: indices in sorted vocab)

    NOTE: the 'tfidf' values are not expected to match sklearn's
    TfidfVectorizer, whose defaults use a smoothed idf and L2-normalize rows.
    '''
    # Tokenize documents
    tokenized_docs = [my_tokenizer(doc, stopwords=stopwords, lemmatizer=lemmatizer)
                      for doc in corpus]

    # Sorted unique words in the corpus, and each word's column index.
    # The dictionary is built from the Python list so its keys are plain str
    # rather than numpy string scalars.
    sorted_vocab = sorted(set().union(*tokenized_docs))
    vocab_list = np.array(sorted_vocab)
    vocab_dict = {token: i for i, token in enumerate(sorted_vocab)}

    # Count matrix: entry (i, j) is how often vocab word j occurs in document i
    vector_matrix = np.zeros((len(tokenized_docs), len(sorted_vocab)))
    for i, doc in enumerate(tokenized_docs):
        for token, count in Counter(doc).items():
            vector_matrix[i, vocab_dict[token]] = count

    if kind == 'tfidf':
        # number of documents
        n_docs = len(tokenized_docs)

        # document frequency: in how many documents each vocab word occurs
        doc_freq = (vector_matrix > 0).sum(axis=0)

        # idf
        idf = np.log(n_docs / doc_freq)

        # Each row sums to the number of tokens in that document. A document
        # with no tokens has an all-zero row; dividing by 1 keeps it at zero
        # instead of producing 0/0.
        doc_lengths = vector_matrix.sum(axis=1, keepdims=True)
        doc_lengths[doc_lengths == 0] = 1.0

        vector_matrix = (vector_matrix / doc_lengths) * idf

    return vector_matrix, vocab_list, vocab_dict

In [43]:
vec_mat, voc_list, voc_dict = my_vectorizer(lines)
print("voc_list \n ", voc_list)
print('voc_dict \n ', voc_dict)

voc_list 
  ['a' 'against' 'age' 'and' 'at' 'bay' 'be' 'because' 'blaze' 'bless'
 'blind' 'blinding' 'bright' 'burn' 'by' 'caught' 'close' 'could' 'crying'
 'curse' 'danced' 'dark' 'day' 'death' 'deeds' 'do' 'dying' 'end' 'eyes'
 'father' 'fierce' 'flight' 'forked' 'frail' 'gay' 'gentle' 'go' 'good'
 'grave' 'green' 'grieved' 'had' 'have' 'height' 'how' 'i' 'in' 'into' 'is'
 'it' 'its' 'know' 'last' 'late' 'learn' 'light' 'lightning' 'like' 'me'
 'men' 'meteors' 'might' 'my' 'near' 'night' 'no' 'not' 'now' 'of' 'old'
 'on' 'pray' 'rage' 'rave' 'right' 'sad' 'sang' 'see' 'should' 'sight'
 'sun' 'tears' 'that' 'the' 'their' 'there' 'they' 'though' 'too' 'wave'
 'way' 'who' 'wild' 'wise' 'with' 'words' 'you' 'your']
voc_dict 
  {'a': 0, 'against': 1, 'age': 2, 'and': 3, 'at': 4, 'bay': 5, 'be': 6, 'because': 7, 'blaze': 8, 'bless': 9, 'blind': 10, 'blinding': 11, 'bright': 12, 'burn': 13, 'by': 14, 'caught': 15, 'close': 16, 'could': 17, 'crying': 18, 'curse': 19, 'danced': 20, 'dark': 21

In [45]:
print (vec_mat.shape)
print (vec_mat)

(19, 98)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]]


In [46]:
print (lines[2])
print (vec_mat[2])

Rage, rage against the dying of the light.
[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


In [54]:
sk_vectorizer = CountVectorizer()
sk_vec_mat = sk_vectorizer.fit_transform(lines).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert the slice to keep the plain-list printout.
print(sk_vectorizer.get_feature_names_out()[:10].tolist(), '\n')
print(sk_vectorizer.vocabulary_)

['against', 'age', 'and', 'at', 'bay', 'be', 'because', 'blaze', 'bless', 'blind'] 

{'do': 24, 'not': 64, 'go': 35, 'gentle': 34, 'into': 45, 'that': 80, 'good': 36, 'night': 62, 'old': 67, 'age': 1, 'should': 76, 'burn': 12, 'and': 2, 'rave': 71, 'at': 3, 'close': 15, 'of': 66, 'day': 21, 'rage': 70, 'against': 0, 'the': 81, 'dying': 25, 'light': 53, 'though': 85, 'wise': 91, 'men': 57, 'their': 82, 'end': 26, 'know': 49, 'dark': 20, 'is': 46, 'right': 72, 'because': 6, 'words': 93, 'had': 40, 'forked': 31, 'no': 63, 'lightning': 54, 'they': 84, 'last': 50, 'wave': 87, 'by': 13, 'crying': 17, 'how': 43, 'bright': 11, 'frail': 32, 'deeds': 23, 'might': 59, 'have': 41, 'danced': 19, 'in': 44, 'green': 38, 'bay': 4, 'wild': 90, 'who': 89, 'caught': 14, 'sang': 74, 'sun': 78, 'flight': 30, 'learn': 52, 'too': 86, 'late': 51, 'grieved': 39, 'it': 47, 'on': 68, 'its': 48, 'way': 88, 'grave': 37, 'near': 61, 'death': 22, 'see': 75, 'with': 92, 'blinding': 10, 'sight': 77, 'blind': 9, 'eyes'

In [55]:
sk_vec_mat[2]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0], dtype=int64)

In [56]:
pairwise_dist = squareform(pdist(vec_mat, metric='cosine'))
pairwise_dist.shape

(19, 19)

In [57]:
# For each of the first five lines, print the five lines whose count vectors
# are closest to it in cosine distance (smallest distance first).
for dist_row in pairwise_dist[:5]:
    for ind in dist_row.argsort()[:5]:
        print(lines[ind])
    print('-----')

Do not go gentle into that good night,
Do not go gentle into that good night.
Do not go gentle into that good night.
Do not go gentle into that good night.
Good men, the last wave by, crying how bright
-----
Old age should burn and rave at close of day;
And you, my father, there on the sad height,
Blind eyes could blaze like meteors and be gay,
Wild men who caught and sang the sun in flight,
Though wise men at their end know dark is right,
-----
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
And you, my father, there on the sad height,
-----
Though wise men at their end know dark is right,
Because their words had forked no lightning they
Good men, the last wave by, crying how bright
Grave men, near death, who see with blinding sight
Wild men who caught and sang the sun in flight,
-----
Because their words had forked no lightning they
Though wise men at their end 

In [60]:
tf_vec_mat, tf_voc_list, tf_voc_dict = my_vectorizer(lines, kind='tfidf')
tf_vec_mat[2]

array([ 0.        ,  0.19476808,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.19476808,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.19476808,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.16687513,  0.  

In [61]:
sk_tf_vec = TfidfVectorizer()
sk_tf_mat = sk_tf_vec.fit_transform(lines).toarray()
sk_tf_mat[2]

array([ 0.30960308,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.30960308,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.30960308,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.28594828,  0.        ,  0.        ,  0.  

In [62]:
sum(sk_tf_mat[2]**2)

0.99999999999999978

In [63]:
pairwise_dist_tf = squareform(pdist(tf_vec_mat, metric='cosine'))
pairwise_dist_tf.shape

(19, 19)

In [65]:
# tfidf vector similarity: for each of the first five lines, print the five
# lines whose tfidf vectors are closest in cosine distance
for dist_row in pairwise_dist_tf[:5]:
    for ind in dist_row.argsort()[:5]:
        print(lines[ind])
    print('-----')

Do not go gentle into that good night,
Do not go gentle into that good night.
Do not go gentle into that good night.
Do not go gentle into that good night.
Good men, the last wave by, crying how bright
-----
Old age should burn and rave at close of day;
Though wise men at their end know dark is right,
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
-----
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
Good men, the last wave by, crying how bright
-----
Though wise men at their end know dark is right,
Old age should burn and rave at close of day;
Because their words had forked no lightning they
Their frail deeds might have danced in a green bay,
Good men, the last wave by, crying how bright
-----
Because their words had forked no lightning they
And learn, too late, they grieved it on i

In [66]:
# count vector similarity: the same listing as for the tfidf vectors, shown
# again so the two rankings can be compared side by side
for dist_row in pairwise_dist[:5]:
    for ind in dist_row.argsort()[:5]:
        print(lines[ind])
    print('-----')

Do not go gentle into that good night,
Do not go gentle into that good night.
Do not go gentle into that good night.
Do not go gentle into that good night.
Good men, the last wave by, crying how bright
-----
Old age should burn and rave at close of day;
And you, my father, there on the sad height,
Blind eyes could blaze like meteors and be gay,
Wild men who caught and sang the sun in flight,
Though wise men at their end know dark is right,
-----
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
Rage, rage against the dying of the light.
And you, my father, there on the sad height,
-----
Though wise men at their end know dark is right,
Because their words had forked no lightning they
Good men, the last wave by, crying how bright
Grave men, near death, who see with blinding sight
Wild men who caught and sang the sun in flight,
-----
Because their words had forked no lightning they
Though wise men at their end 