Building Feature Vectors using TF-IDF 

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: five short sentences, one document per list entry.
# The strings are kept verbatim (casing, spelling, punctuation, trailing
# spaces) because the outputs shown below were produced from exactly this text.
train_text = [
    "A bird in the hand is worth two in the bush. ",
    "GOOD things comes to those to wait. ",
    "These watches cost $255. ",
    "Mr.Smith gies to Washington. ",
    "Doogle Broswer M.D.",
]

In [3]:
# Learn the vocabulary from train_text and build the document-term count
# matrix in one step (default CountVectorizer settings).
count_vectorizer = CountVectorizer()
freq_term_matrix = count_vectorizer.fit_transform(train_text)
# Displayed value: number of distinct terms in the learned vocabulary.
len(count_vectorizer.vocabulary_)

24

In [4]:
# (number of documents, number of vocabulary terms)
freq_term_matrix.shape

(5, 24)

In [5]:
# Dense array view of the count matrix: one row per document, one column per
# vocabulary term, each entry is how often the term occurs in that document.
freq_term_matrix.toarray()

array([[0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0,
        0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 1, 0,
        0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]], dtype=int64)

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

In [15]:
# Transformer that re-weights a term-count matrix into TF-IDF values
# (constructed with its default settings).
tfidf_trans = TfidfTransformer()

In [17]:
# Fit the IDF weights on the count matrix and apply them in a single call.
tf_idf_vec1 = tfidf_trans.fit_transform(freq_term_matrix)

In [18]:
# Same shape as the count matrix: one row per document, one column per term.
tf_idf_vec1.shape

(5, 24)

The actual numeric TF-IDF features can be inspected by converting the 
result to a dense array with toarray():

In [19]:
# Dense array view of the TF-IDF weights. The rows shown in the output have
# unit Euclidean (L2) length, so values are comparable across documents.
tf_idf_vec1.toarray()

array([[0.        , 0.26726124, 0.        , 0.26726124, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.26726124,
        0.53452248, 0.26726124, 0.        , 0.        , 0.53452248,
        0.        , 0.        , 0.        , 0.        , 0.26726124,
        0.        , 0.        , 0.        , 0.26726124],
       [0.        , 0.        , 0.        , 0.        , 0.36265071,
        0.        , 0.        , 0.        , 0.36265071, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.36265071, 0.36265071, 0.58516862, 0.        ,
        0.36265071, 0.        , 0.        , 0.        ],
       [0.5       , 0.        , 0.        , 0.        , 0.        ,
        0.5       , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5       , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.5       , 0.        ],
       [0.   

scikit-learn lets you easily convert text to TF-IDF vectors in one step using 
TfidfVectorizer, which is equivalent to CountVectorizer followed by TfidfTransformer:

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Single estimator that goes from raw text straight to TF-IDF vectors
# (constructed with its default settings).
tfidf_vectorizer = TfidfVectorizer()

In [33]:
# Learn vocabulary and IDF weights from the raw sentences and return their
# TF-IDF matrix.
tfidf_vector2 = tfidf_vectorizer.fit_transform(train_text)
# Displayed value: mapping from each (lower-cased) term to its column index.
tfidf_vectorizer.vocabulary_

{'bird': 1,
 'in': 10,
 'the': 14,
 'hand': 9,
 'is': 11,
 'worth': 23,
 'two': 19,
 'bush': 3,
 'good': 8,
 'things': 16,
 'comes': 4,
 'to': 18,
 'those': 17,
 'wait': 20,
 'these': 15,
 'watches': 22,
 'cost': 5,
 '255': 0,
 'mr': 12,
 'smith': 13,
 'gies': 7,
 'washington': 21,
 'doogle': 6,
 'broswer': 2}

In [34]:
# (number of documents, number of vocabulary terms)
tfidf_vector2.shape

(5, 24)

In [35]:
# Learned inverse-document-frequency weight for each vocabulary column.
# Only 'to' (column 18) occurs in more than one sentence, so it gets the
# lowest weight.
tfidf_vectorizer.idf_

array([2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 1.69314718, 2.09861229,
       2.09861229, 2.09861229, 2.09861229, 2.09861229])

In [36]:
# Pair every vocabulary term with its IDF weight.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Both arrays are converted with
# .tolist() so the dict holds plain str/float values and displays the same
# way regardless of the installed NumPy version's scalar repr.
dict(zip(tfidf_vectorizer.get_feature_names_out().tolist(),
         tfidf_vectorizer.idf_.tolist()))

{'255': 2.09861228866811,
 'bird': 2.09861228866811,
 'broswer': 2.09861228866811,
 'bush': 2.09861228866811,
 'comes': 2.09861228866811,
 'cost': 2.09861228866811,
 'doogle': 2.09861228866811,
 'gies': 2.09861228866811,
 'good': 2.09861228866811,
 'hand': 2.09861228866811,
 'in': 2.09861228866811,
 'is': 2.09861228866811,
 'mr': 2.09861228866811,
 'smith': 2.09861228866811,
 'the': 2.09861228866811,
 'these': 2.09861228866811,
 'things': 2.09861228866811,
 'those': 2.09861228866811,
 'to': 1.6931471805599454,
 'two': 2.09861228866811,
 'wait': 2.09861228866811,
 'washington': 2.09861228866811,
 'watches': 2.09861228866811,
 'worth': 2.09861228866811}