In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Toy corpus: three short sentences with partially overlapping words.
sents = [
    "coronavirus is a highly infectious disease",
    "coronavirus affects older people the most",
    "older people are at high risk due to this disease",
]

In [2]:
# Bag-of-Words vectorizer created with no arguments, i.e. library defaults.
cv = CountVectorizer()

In [5]:
# Learn the vocabulary from `sents` and build the document-term count matrix.
# The sparse result is densified in the same statement, so `X` is never rebound
# from its own value and the cell can be re-run safely (a separate
# `X = X.toarray()` cell fails on its second run because an ndarray has no
# .toarray()).
X = cv.fit_transform(sents).toarray()
print(X)

[[0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0]
 [0 1 1 0 1 1 1 0 0 0 0 1 1 1 0 1 1]]


In [8]:
# List (term, column index) pairs in alphabetical order of the term.
[(term, cv.vocabulary_[term]) for term in sorted(cv.vocabulary_)]

[('affects', 0),
 ('are', 1),
 ('at', 2),
 ('coronavirus', 3),
 ('disease', 4),
 ('due', 5),
 ('high', 6),
 ('highly', 7),
 ('infectious', 8),
 ('is', 9),
 ('most', 10),
 ('older', 11),
 ('people', 12),
 ('risk', 13),
 ('the', 14),
 ('this', 15),
 ('to', 16)]

In [9]:
# Rebind `cv` to a vectorizer that extracts 1- to 3-word n-grams
# (unigrams, bigrams and trigrams), as the vocabulary listing below shows.
cv = CountVectorizer(ngram_range=(1,3))

In [10]:
# Fit the n-gram vectorizer on `sents` and build the document-term count matrix.
# Densifying in the same statement keeps the cell idempotent: a separate
# `X = X.toarray()` cell raises on re-run because an ndarray has no .toarray().
X = cv.fit_transform(sents).toarray()
print(X)

[[0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0
  0 1 1 0 0 0 1 1 0 0 0 0 0]
 [0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1
  1 0 0 1 1 1 0 0 1 1 1 1 1]]


In [None]:
# List (n-gram, column index) pairs in alphabetical order of the n-gram.
[(ngram, cv.vocabulary_[ngram]) for ngram in sorted(cv.vocabulary_)]

[('affects', 0),
 ('affects older', 1),
 ('affects older people', 2),
 ('are', 3),
 ('are at', 4),
 ('are at high', 5),
 ('at', 6),
 ('at high', 7),
 ('at high risk', 8),
 ('coronavirus', 9),
 ('coronavirus affects', 10),
 ('coronavirus affects older', 11),
 ('coronavirus is', 12),
 ('coronavirus is highly', 13),
 ('disease', 14),
 ('due', 15),
 ('due to', 16),
 ('due to this', 17),
 ('high', 18),
 ('high risk', 19),
 ('high risk due', 20),
 ('highly', 21),
 ('highly infectious', 22),
 ('highly infectious disease', 23),
 ('infectious', 24),
 ('infectious disease', 25),
 ('is', 26),
 ('is highly', 27),
 ('is highly infectious', 28),
 ('most', 29),
 ('older', 30),
 ('older people', 31),
 ('older people are', 32),
 ('older people the', 33),
 ('people', 34),
 ('people are', 35),
 ('people are at', 36),
 ('people the', 37),
 ('people the most', 38),
 ('risk', 39),
 ('risk due', 40),
 ('risk due to', 41),
 ('the', 42),
 ('the most', 43),
 ('this', 44),
 ('this disease', 45),
 ('to', 46),
 ('to this', 47),
 ('to this disease', 48)]

**TF-IDF**

In [13]:
# Same three-sentence corpus as at the top of the notebook, restated for the TF-IDF section.
sents = [
    "coronavirus is a highly infectious disease",
    "coronavirus affects older people the most",
    "older people are at high risk due to this disease",
]

In Bag of Words, we witnessed how vectorization was just concerned with the frequency of vocabulary words in a given document. As a result, articles, prepositions, and conjunctions which don’t contribute a lot to the meaning get as much importance as, say, adjectives.

TF-IDF helps us to overcome this issue. Words that get repeated too often don’t overpower less frequent but important words.

TF
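TF stands for Term Frequency. It can be understood as a normalized frequency score. It is calculated via the following formula:

$$\mathrm{TF}(t, d) = \frac{\text{number of times term } t \text{ occurs in document } d}{\text{total number of terms in document } d}$$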





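IDF stands for Inverse Document Frequency, but before we go into IDF, we must make sense of DF – Document Frequency. It’s given by the following formula:

$$\mathrm{DF}(t) = \frac{\text{number of documents containing term } t}{\text{total number of documents}}$$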



In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Corpus fed to the TF-IDF vectorizer: the same three sentences used for Bag of Words.
sents = [
    "coronavirus is a highly infectious disease",
    "coronavirus affects older people the most",
    "older people are at high risk due to this disease",
]

In [16]:
# TF-IDF vectorizer created with no arguments; the effective settings are
# printed by the get_params() cell further down.
tfidf = TfidfVectorizer()

In [17]:
# Fit on the corpus and keep the resulting TF-IDF matrix in sparse form
# (one row per sentence); later cells transpose and densify slices of it.
transformed = tfidf.fit_transform(sents)

In [None]:
# Exploratory: list every attribute and method name on the fitted vectorizer.
# The output is long and mostly private/dunder names.
dir(tfidf)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_params',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_tfidf',
 '_validate_data',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 

Why inverse the DF?

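Just as we discussed above, the intuition is that the more common a word is across all documents, the less important it is for the current document. Taking the log-scaled inverse of the document frequency turns a high document frequency into a low weight: $\mathrm{IDF}(t) = \log\frac{N}{n_t}$, where $N$ is the total number of documents and $n_t$ is the number of documents containing term $t$. Note that scikit-learn's `TfidfVectorizer`, with its default `smooth_idf=True`, uses the smoothed variant $\ln\frac{1 + N}{1 + n_t} + 1$ and then L2-normalizes each document vector, so the weights shown below differ from a hand calculation with the textbook formula.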



In [20]:
# List (term, column index) pairs learned by the TF-IDF vectorizer, alphabetically by term.
[(term, tfidf.vocabulary_[term]) for term in sorted(tfidf.vocabulary_)]

[('affects', 0),
 ('are', 1),
 ('at', 2),
 ('coronavirus', 3),
 ('disease', 4),
 ('due', 5),
 ('high', 6),
 ('highly', 7),
 ('infectious', 8),
 ('is', 9),
 ('most', 10),
 ('older', 11),
 ('people', 12),
 ('risk', 13),
 ('the', 14),
 ('this', 15),
 ('to', 16)]

In [None]:
# build_tokenizer is a factory: it must be called to obtain the tokenizer
# function, which is then applied to a sentence. With the default
# token_pattern only tokens of two or more word characters are kept, which is
# why the one-letter word "a" never reaches the vocabulary.
tfidf.build_tokenizer()(sents[0])

['coronavirus', 'is', 'highly', 'infectious', 'disease']

In [None]:
# Term-to-column mapping of the TF-IDF vectorizer, alphabetically by term
# (the same listing as shown a few cells above).
[(term, tfidf.vocabulary_[term]) for term in sorted(tfidf.vocabulary_)]

[('affects', 0),
 ('are', 1),
 ('at', 2),
 ('coronavirus', 3),
 ('disease', 4),
 ('due', 5),
 ('high', 6),
 ('highly', 7),
 ('infectious', 8),
 ('is', 9),
 ('most', 10),
 ('older', 11),
 ('people', 12),
 ('risk', 13),
 ('the', 14),
 ('this', 15),
 ('to', 16)]

In [None]:
# Number of vocabulary terms (columns of the TF-IDF matrix).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
len(tfidf.get_feature_names_out())



17

In [24]:
# Print the transposed sparse matrix: each stored entry reads
# (term index, sentence index) followed by its TF-IDF weight.
print(transformed.T)

  (4, 0)	0.3730219858594306
  (8, 0)	0.49047908420610337
  (7, 0)	0.49047908420610337
  (9, 0)	0.49047908420610337
  (3, 0)	0.3730219858594306
  (10, 1)	0.45954803293870056
  (14, 1)	0.45954803293870056
  (12, 1)	0.3494981241087058
  (11, 1)	0.3494981241087058
  (0, 1)	0.45954803293870056
  (3, 1)	0.3494981241087058
  (15, 2)	0.33834800036072993
  (16, 2)	0.33834800036072993
  (5, 2)	0.33834800036072993
  (13, 2)	0.33834800036072993
  (6, 2)	0.33834800036072993
  (2, 2)	0.33834800036072993
  (1, 2)	0.33834800036072993
  (12, 2)	0.25732237534738955
  (11, 2)	0.25732237534738955
  (4, 2)	0.25732237534738955


In [None]:
# NOTE(review): decode() is documented as decoding raw input documents
# (bytes/file/filename) into text; given a sparse matrix row it just hands the
# row back, as the output below shows. If the goal was to recover the terms of
# the first sentence, tfidf.inverse_transform(transformed[0]) is presumably
# what was intended — confirm before changing.
tfidf.decode(transformed[0])

<1x17 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [25]:
# Show the vectorizer's full configuration, including the defaults that were
# never set explicitly (e.g. norm, smooth_idf, token_pattern).
print(tfidf.get_params(deep=True))

{'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}


In [26]:
import pandas as pd

In [27]:
# TF-IDF weights of the third sentence (row 2) as a one-column table indexed
# by vocabulary term.
# - get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
# - toarray() yields a plain ndarray instead of the legacy np.matrix from todense().
df = pd.DataFrame(
    transformed[2].T.toarray(),
    index=tfidf.get_feature_names_out(),
    columns=["TF-IDF"],
)



In [28]:
# Sparse listing for the third sentence only (row 2), transposed into a
# column: entries read (term index, 0) followed by the weight.
print(transformed[2].T)

  (15, 0)	0.33834800036072993
  (16, 0)	0.33834800036072993
  (5, 0)	0.33834800036072993
  (13, 0)	0.33834800036072993
  (6, 0)	0.33834800036072993
  (2, 0)	0.33834800036072993
  (1, 0)	0.33834800036072993
  (12, 0)	0.25732237534738955
  (11, 0)	0.25732237534738955
  (4, 0)	0.25732237534738955


In [29]:
# Shape of the transposed sparse row: one row per vocabulary term, one column.
print(transformed[2].T.shape)

(17, 1)


In [30]:
# Densifying changes only the storage format; the shape stays the same.
print(transformed[2].T.todense().shape)

(17, 1)


In [31]:
# Same sparse column for the third sentence as printed earlier, shown again
# for comparison with the dense form in the next cell.
print(transformed[2].T)

  (15, 0)	0.33834800036072993
  (16, 0)	0.33834800036072993
  (5, 0)	0.33834800036072993
  (13, 0)	0.33834800036072993
  (6, 0)	0.33834800036072993
  (2, 0)	0.33834800036072993
  (1, 0)	0.33834800036072993
  (12, 0)	0.25732237534738955
  (11, 0)	0.25732237534738955
  (4, 0)	0.25732237534738955


In [32]:
# Dense form of the same column: terms absent from the third sentence now
# appear explicitly as 0. todense() returns a numpy matrix (see the repr below).
transformed[2].T.todense()

matrix([[0.        ],
        [0.338348  ],
        [0.338348  ],
        [0.        ],
        [0.25732238],
        [0.338348  ],
        [0.338348  ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.25732238],
        [0.25732238],
        [0.338348  ],
        [0.        ],
        [0.338348  ],
        [0.338348  ]])

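So, according to TF-IDF, the most important features of the third sentence (the one shown above) are the words that occur only in that sentence – ‘are’, ‘at’, ‘due’, ‘high’, ‘risk’, ‘this’ and ‘to’ – each with a weight of about 0.338, while ‘disease’, ‘older’ and ‘people’, which also appear in another sentence, are down-weighted to about 0.257. Vocabulary words that do not occur in this sentence, such as ‘infectious’, simply amount to 0 here; in the first sentence, by contrast, ‘infectious’ (together with ‘highly’ and ‘is’) carries the top weight of about 0.490. Unlike a naive approach like Bag of Words, where every occurrence counts the same, words shared across documents now contribute less. This is what we wanted all along.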



A few pointers about TF-IDF:

- The concept of n-grams is applicable here as well: we can combine words in groups of 2, 3, 4, and so on to build our final set of features.
- Along with n-grams, there are also a number of parameters such as `min_df`, `max_df`, `max_features`, `sublinear_tf`, etc. to play around with. Carefully tuning these parameters can do wonders for your model’s capabilities.
- Despite being so simple, TF-IDF is used extensively in tasks like Information Retrieval, to judge which response is the best for a query (especially useful in a chatbot), and Keyword Extraction, to determine which word is the most relevant in a document. You’ll therefore often find yourself banking on the intuitive wisdom of TF-IDF.

So far, we’ve seen frequency-based methods for encoding text. Now it’s time to take a look at more sophisticated methods which changed the world of word embeddings as we know it and opened new research opportunities in NLP.