# One-hot encoding of categorical variables

In [3]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer

# Each dict is one sample with a single string-valued 'city' feature;
# the encoder produces one indicator column per distinct city.
instances = [{'city': label} for label in ('A', 'B', 'C')]
one_hot_encoder = DictVectorizer()
one_hot_encoder.fit_transform(instances).toarray()

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents: two about basketball, one unrelated.
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich',
]

# Bag-of-words: one row per document, one column per vocabulary token.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(corpus).todense()
counts

matrix([[0, 1, 1, 0, 1, 0, 1, 0, 0, 1],
        [0, 1, 1, 1, 0, 1, 0, 0, 1, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 0, 0]], dtype=int64)

In [9]:
# Show the learned token -> column-index mapping for the fitted vectorizer.
vectorizer.vocabulary_

{'ate': 0,
 'basketball': 1,
 'duke': 2,
 'game': 3,
 'in': 4,
 'lost': 5,
 'played': 6,
 'sandwich': 7,
 'the': 8,
 'unc': 9}

In [15]:
from sklearn.metrics.pairwise import euclidean_distances

# `counts` comes from `.todense()` and is therefore an np.matrix, which
# scikit-learn's input validation rejects in current releases. Convert to
# a plain ndarray first; indexing with a list (`[[i]]`) keeps each document
# as a 2-D, single-row array, which euclidean_distances requires.
count_rows = np.asarray(counts)

# Pairwise distances between documents 0-1, 0-2 and 1-2.
print(euclidean_distances(count_rows[[0]], count_rows[[1]]))
print(euclidean_distances(count_rows[[0]], count_rows[[2]]))
print(euclidean_distances(count_rows[[1]], count_rows[[2]]))

[[ 2.44948974]]
[[ 2.64575131]]
[[ 2.64575131]]


In [17]:
# Refit on the same corpus with English stop words filtered out, so that
# tokens such as 'in' and 'the' no longer get their own columns.
vectorizer2 = CountVectorizer(stop_words='english')
counts2 = vectorizer2.fit_transform(corpus).todense()

# Only the final expression of a cell is rendered: show the reduced vocabulary.
vectorizer2.vocabulary_

{'ate': 0,
 'basketball': 1,
 'duke': 2,
 'game': 3,
 'lost': 4,
 'played': 5,
 'sandwich': 6,
 'unc': 7}

In [18]:
# `counts2` comes from `.todense()` and is therefore an np.matrix, which
# scikit-learn's input validation rejects in current releases. Convert to
# a plain ndarray first; indexing with a list (`[[i]]`) keeps each document
# as a 2-D, single-row array, which euclidean_distances requires.
count_rows2 = np.asarray(counts2)

# Same three document pairs as before, now with stop words removed.
print(euclidean_distances(count_rows2[[0]], count_rows2[[1]]))
print(euclidean_distances(count_rows2[[0]], count_rows2[[2]]))
print(euclidean_distances(count_rows2[[1]], count_rows2[[2]]))

[[ 2.]]
[[ 2.44948974]]
[[ 2.44948974]]


In [20]:
# Two sentences with the same meaning but different inflections
# ('ate'/'eaten', 'sandwiches'/'sandwich').
corpus2 = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him',
]

# binary=True records presence/absence of a token rather than its count.
vectorizer3 = CountVectorizer(stop_words='english', binary=True)
print(vectorizer3.fit_transform(corpus2).todense())
print(vectorizer3.vocabulary_)

[[1 0 0 1]
 [0 1 1 0]]
{'ate': 0, 'sandwiches': 3, 'sandwich': 2, 'eaten': 1}


In [25]:
from sklearn.feature_extraction.text import CountVectorizer 
# A single document in which 'sandwich' occurs three times and 'ate' twice.
corpus3 = [
    'The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich'
]

# NOTE: this rebinds `vectorizer`, replacing the instance fitted on `corpus`
# in the earlier cell.
vectorizer = CountVectorizer(stop_words='english')
print(vectorizer.fit_transform(corpus3).todense())
vectorizer.vocabulary_

[[2 1 3 1 1]]


{'ate': 0, 'dog': 1, 'sandwich': 2, 'transfigured': 3, 'wizard': 4}