In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Toy corpus: three short documents that share some of their words.
text = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog.",
    "The fox",
]

In [3]:
# Create the TF-IDF vectorizer with its default settings (no arguments passed).
vectorizer = TfidfVectorizer()

In [4]:
# Tokenize the corpus and learn the vocabulary (and the per-word IDF weights
# inspected in later cells). As the last expression of the cell, the return
# value of fit() is also echoed as the cell output.
vectorizer.fit(text)

TfidfVectorizer()

In [5]:
# Show the learned vocabulary: a mapping from each lower-cased token to its
# column index in the encoded matrix.
print(vectorizer.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [6]:
# An inverse document frequency (IDF) is computed for every word in the
# vocabulary. The word that appears in the most documents gets the lowest
# score, 1.0: here that is "the" (index 7), which occurs in all three documents.

In [7]:
# Print the learned IDF weights, one per vocabulary word, ordered by column index.
print(vectorizer.idf_)

[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]


In [8]:
# Encode only the first document. It is wrapped in a list so that transform()
# receives a collection of documents, as in the fit() call.
vector = vectorizer.transform([text[0]])

In [9]:
# Echo the result's repr to see what kind of object transform() returned.
vector

<1x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [10]:
# Shape of the encoded result: one row per document, one column per vocabulary word.
print(vector.shape)


(1, 8)


In [11]:
# Convert the sparse result to a dense array to view the actual TF-IDF weights.
print(vector.toarray())

[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]


In [12]:
# Now encode all three documents at once.

In [13]:
# Encode the whole corpus with the already-fitted vectorizer.
vector1 = vectorizer.transform(text)

In [14]:
# Three documents in, so three rows out; the column count is the vocabulary size.
print(vector1.shape)

(3, 8)


In [15]:
# Dense view of the TF-IDF weights for every document (one row each).
print(vector1.toarray())

[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]
 [0.         0.78980693 0.         0.         0.         0.
  0.         0.61335554]
 [0.         0.         0.78980693 0.         0.         0.
  0.         0.61335554]]


In [16]:
import pandas as pd

In [17]:
# Wrap the dense weights in a DataFrame; rows and columns carry default
# integer labels at this point.
df = pd.DataFrame(vector1.toarray())

In [18]:
# Display the frame with its default integer row and column labels.
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.363886,0.276745,0.276745,0.363886,0.363886,0.363886,0.363886,0.429834
1,0.0,0.789807,0.0,0.0,0.0,0.0,0.0,0.613356
2,0.0,0.0,0.789807,0.0,0.0,0.0,0.0,0.613356


In [19]:
# Recover the column labels: the vocabulary words ordered by their column index.
# vocabulary_ maps word -> index, so sorting the words by their mapped index
# yields the same list as searching the mapping for each index in turn, but
# without a nested loop over the whole vocabulary and without leaving loop
# variables behind in the notebook namespace.
columns = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

columns

['brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the']

In [20]:
# Label each column with its vocabulary word (assigned in place on df).
df.columns = columns

In [21]:
# Display the frame again to check the word column labels.
df

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the
0,0.363886,0.276745,0.276745,0.363886,0.363886,0.363886,0.363886,0.429834
1,0.0,0.789807,0.0,0.0,0.0,0.0,0.0,0.613356
2,0.0,0.0,0.789807,0.0,0.0,0.0,0.0,0.613356


In [22]:
# Label the rows "doc 0", "doc 1", ... based on the frame's own length, so the
# labels stay valid if documents are added to or removed from the corpus
# (a fixed three-item list would raise a length-mismatch error in that case).
df.index = [f"doc {i}" for i in range(len(df))]

In [23]:
# Display the final frame: one row per document, one column per vocabulary word.
df

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the
doc 0,0.363886,0.276745,0.276745,0.363886,0.363886,0.363886,0.363886,0.429834
doc 1,0.0,0.789807,0.0,0.0,0.0,0.0,0.0,0.613356
doc 2,0.0,0.0,0.789807,0.0,0.0,0.0,0.0,0.613356
