In [1]:
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Corpus for this demo: a single one-sentence document.
text = ["Hi my name is sahil and i like to read about data sciences."]
# Build the bag-of-words transform and learn its vocabulary in one step;
# fit() hands back the fitted vectorizer, so the call can be chained.
vectorizer = CountVectorizer().fit(text)
# Show the learned mapping from each token to its column index.
print(vectorizer.vocabulary_)

{'hi': 3, 'my': 6, 'name': 7, 'is': 4, 'sahil': 9, 'and': 1, 'like': 5, 'to': 11, 'read': 8, 'about': 0, 'data': 2, 'sciences': 10}


In [3]:
# Keep a handle on the fitted token -> column-index mapping.
dictvocab = vectorizer.vocabulary_

# One line per vocabulary entry. The spacing inside the template (including
# the double space after "Key") matches the output already recorded below.
for token, column in dictvocab.items():
    print(f"For Key  {token} , Value is: {column}")

For Key  hi , Value is: 3
For Key  my , Value is: 6
For Key  name , Value is: 7
For Key  is , Value is: 4
For Key  sahil , Value is: 9
For Key  and , Value is: 1
For Key  like , Value is: 5
For Key  to , Value is: 11
For Key  read , Value is: 8
For Key  about , Value is: 0
For Key  data , Value is: 2
For Key  sciences , Value is: 10


In [4]:
# Encode the training document against the fitted vocabulary.
vector = vectorizer.transform(text)
# Summarize the encoding: matrix shape, container type, then the dense
# counts -- one item per output line.
print(vector.shape, type(vector), vector.toarray(), sep="\n")

(1, 12)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 1 1 1 1 1]]


In [5]:
# Encode a new sentence with the already-fitted vocabulary. Only tokens the
# vocabulary knows are counted: here that is "like" and "data".
# NOTE(review): "sceinces" looks like a misspelling of "sciences"; as written
# it is not in the vocabulary, so its column stays 0. Left unchanged because
# correcting it would change the encoded output shown below.
text2 = ["I do not like data sceinces"]
vector = vectorizer.transform(text2)
print(vector.toarray())

[[0 0 1 0 0 1 0 0 0 0 0 0]]


In [6]:
# Repeated tokens accumulate: "read" occurs four times and "and" once, so
# their columns hold 4 and 1 in the encoded row.
text3 = ["read and read read read "]
vector = vectorizer.transform(text3)
print(vector.toarray())


[[0 1 0 0 0 0 0 0 4 0 0 0]]


# TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Three short documents; "the" appears in every one of them.
text = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog.",
    "The fox",
]
# Build the TF-IDF transform and learn vocabulary + IDF weights in one step;
# fit() hands back the fitted vectorizer, so the call can be chained.
vectorizer = TfidfVectorizer().fit(text)
# Show the token -> column-index mapping, then the per-column IDF weights.
print(vectorizer.vocabulary_, vectorizer.idf_, sep="\n")

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]


In [8]:
# Encode only the first document, wrapped in a list because transform()
# is given a collection of documents rather than a bare string.
vector = vectorizer.transform([text[0]])
# Summarize the encoding: matrix shape, then the dense TF-IDF row.
print(vector.shape, vector.toarray(), sep="\n")


(1, 8)
[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]


# HashingVectorizer

In [9]:

from sklearn.feature_extraction.text import HashingVectorizer
# Single-document corpus for the hashing demo.
text = ["The quick brown fox jumped over the lazy dog."]
# Hash tokens into a fixed-width vector of 20 columns.
vectorizer = HashingVectorizer(n_features=20)
# No fit() call is made here: the document is encoded directly.
vector = vectorizer.transform(text)
# Summarize the encoding: matrix shape, then the dense row.
print(vector.shape, vector.toarray(), sep="\n")


(1, 20)
[[ 0.          0.          0.          0.          0.          0.33333333
   0.         -0.33333333  0.33333333  0.          0.          0.33333333
   0.          0.          0.         -0.33333333  0.          0.
  -0.66666667  0.        ]]


# Put the Vectorizer Output in a DataFrame

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Three tiny documents to turn into a labelled term-count table.
docs = ['why hello there', 'omg hello pony', 'she went there? omg']
vec = CountVectorizer()
# Learn the vocabulary and encode the documents in one pass (sparse counts).
X = vec.fit_transform(docs)
# Column labels are the vocabulary terms ordered by their column index.
# They are derived from vocabulary_ rather than vec.get_feature_names(),
# which was deprecated in scikit-learn 1.0 and removed in 1.2; its
# replacement, get_feature_names_out(), does not exist before 1.0. The
# vocabulary_ mapping is available on every version.
feature_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
# Densify the small matrix so each row is a document and each column a term.
df = pd.DataFrame(X.toarray(), columns=feature_names)
df

Unnamed: 0,hello,omg,pony,she,there,went,why
0,1,0,0,0,1,0,1
1,1,1,1,0,0,0,0
2,0,1,0,1,1,1,0
