## Count Vectorizer

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
# Three short documents used to learn the vocabulary.
corpus = [
    "Bill Nye went to DC",
    "Science is great",
    "Bill Nye met George Washington through a time machine",
]
# binary=True records only whether a term occurs (0/1), not how many times.
vect = CountVectorizer(binary=True)

In [44]:
# Learn the vocabulary from the corpus. fit() returns the vectorizer itself,
# which is why this cell displays its repr.
vect.fit(corpus)

CountVectorizer(binary=True)

In [45]:
# vocabulary_ maps each learned (lowercased) term to its column index in the
# vectors produced by transform(). Note that "a" from the third document has
# no entry: the default token pattern only keeps tokens of 2+ characters.
vocab = vect.vocabulary_
vocab

{'bill': 0,
 'nye': 7,
 'went': 13,
 'to': 11,
 'dc': 1,
 'science': 8,
 'is': 4,
 'great': 3,
 'met': 6,
 'george': 2,
 'washington': 12,
 'through': 9,
 'time': 10,
 'machine': 5}

In [46]:
# List the vocabulary alphabetically, one "term index" pair per line.
# Terms are unique, so sorting the (term, index) pairs orders by term alone.
for term, column in sorted(vocab.items()):
    print(term, column)

bill 0
dc 1
george 2
great 3
is 4
machine 5
met 6
nye 7
science 8
through 9
time 10
to 11
washington 12
went 13


In [47]:
# Words that are not in the fitted vocabulary are ignored: only the columns
# for "bill" (0) and "nye" (7) are set in the resulting row.
vect.transform(["Was Bill Nye the Queen's rival?"]).toarray()

array([[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

## Similarity (Count Vectorizer)

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

In [49]:
# Similarity is dependent upon the corpus we use: "Physics" is not in the
# fitted vocabulary, so it is ignored, while "Science" is counted. The two
# sentences therefore differ by exactly one feature.
# cosine_similarity accepts sparse matrices directly and still returns a dense
# array, so the transformed documents are passed as-is instead of being
# densified with .toarray() first.
sim = cosine_similarity(
    vect.transform(["Bill Nye the Physics Guy"]),
    vect.transform(["Bill Nye the Science Guy"]),
)
sim

array([[0.81649658]])

## TF-IDF Vectorizer

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
# Same three documents as in the count-vectorizer section.
corpus = [
    "Bill Nye went to DC",
    "Science is great",
    "Bill Nye met George Washington through a time machine",
]
# binary=True makes the term-frequency part 0/1; IDF weighting is still
# applied, so the resulting values are not limited to 0 and 1.
vect = TfidfVectorizer(binary=True)

In [37]:
# Learn the vocabulary and IDF weights from the corpus. fit() returns the
# vectorizer itself, which is why this cell displays its repr.
vect.fit(corpus)

TfidfVectorizer(binary=True)

In [38]:
# Term -> column-index mapping learned by the TF-IDF vectorizer. The displayed
# mapping matches the one produced by the count vectorizer on the same corpus.
vocab = vect.vocabulary_
vocab

{'bill': 0,
 'nye': 7,
 'went': 13,
 'to': 11,
 'dc': 1,
 'science': 8,
 'is': 4,
 'great': 3,
 'met': 6,
 'george': 2,
 'washington': 12,
 'through': 9,
 'time': 10,
 'machine': 5}

In [39]:
# List the vocabulary alphabetically, one "term index" pair per line.
# Terms are unique, so sorting the (term, index) pairs orders by term alone.
for term, column in sorted(vocab.items()):
    print(term, column)

bill 0
dc 1
george 2
great 3
is 4
machine 5
met 6
nye 7
science 8
through 9
time 10
to 11
washington 12
went 13


In [40]:
# Words outside the fitted vocabulary are ignored: only the "bill" (0) and
# "nye" (7) columns are non-zero. The values are TF-IDF weights
# (0.70710678 each) rather than 0/1 presence flags.
vect.transform(["Was Bill Nye the Queen's rival?"]).toarray()

array([[0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

## Similarity (TF-IDF Vectorizer)

In [41]:
# "Physics" is not in the fitted vocabulary and is ignored, while "Science" is
# in the vocabulary and contributes its TF-IDF weight to the second vector.
# cosine_similarity accepts sparse matrices directly and still returns a dense
# array, so the transformed documents are passed as-is instead of being
# densified with .toarray() first.
sim = cosine_similarity(
    vect.transform(["Bill Nye the Physics Guy"]),
    vect.transform(["Bill Nye the Science Guy"]),
)
sim

array([[0.73235914]])