# Feature Extraction from Text

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Create sample documents

In [3]:
# Toy corpus: three short sentences with partly overlapping vocabulary
# ("lazy" and "the" occur in the first two, "dog" in the last two).
docs = [
    "This quick brown fox jumps over the lazy dogs.",
    "The dog is not lazy, just resting",
    "My dog is a good dog and it likes to play.",
]

docs

['This quick brown fox jumps over the lazy dogs.',
 'The dog is not lazy, just resting',
 'My dog is a good dog and it likes to play.']

# Bag of Words

In [5]:
# Bag-of-words vectorizer with default settings. The vocabulary printed
# further down shows the effect of those defaults: terms are lowercased and
# punctuation is not kept.
bow_vectorizer = CountVectorizer()
bow_vectorizer

In [6]:
# Learn the vocabulary from `docs` and build the document-term count matrix
# in a single step.
bow_matrix = bow_vectorizer.fit_transform(docs)
bow_matrix.shape  # (number of documents, number of vocabulary terms)

(3, 21)

# Vocabulary

In [7]:
# Vocabulary size; matches the number of columns of bow_matrix.
len(bow_vectorizer.get_feature_names_out())

21

In [8]:
# Learned terms in column order: entry i names column i of bow_matrix.
# The one-letter word "a" from the third document is absent -- the default
# token pattern only keeps tokens of two or more characters.
bow_vectorizer.get_feature_names_out()

array(['and', 'brown', 'dog', 'dogs', 'fox', 'good', 'is', 'it', 'jumps',
       'just', 'lazy', 'likes', 'my', 'not', 'over', 'play', 'quick',
       'resting', 'the', 'this', 'to'], dtype=object)

# Print the BoW Matrix

In [9]:
# The counts are held in a SciPy sparse matrix, so the repr reports only the
# shape, dtype and number of stored (non-zero) entries, not the values.
bow_matrix

<3x21 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [11]:
# Densify once and reuse the result: every .toarray() call allocates a fresh
# dense copy, so calling it twice repeats the conversion for no benefit.
bow_dense = bow_matrix.toarray()
print(bow_dense.shape)  # (number of documents, number of vocabulary terms)
# Row r, column c holds how often vocabulary term c occurs in document r.
bow_dense

(3, 21)


array([[0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [1, 0, 2, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1]])

# TF-IDF

In [12]:
# TF-IDF vectorizer with default settings: same tokenization as
# CountVectorizer, followed by IDF weighting and L2 normalization of each row.
tfidf_vectorizer = TfidfVectorizer()

In [13]:
# Learn the vocabulary and IDF weights from `docs`, then return the weighted
# document-term matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

In [14]:
# Same 21 terms as the bag-of-words vocabulary; only the weighting differs.
tfidf_vectorizer.get_feature_names_out()

array(['and', 'brown', 'dog', 'dogs', 'fox', 'good', 'is', 'it', 'jumps',
       'just', 'lazy', 'likes', 'my', 'not', 'over', 'play', 'quick',
       'resting', 'the', 'this', 'to'], dtype=object)

In [15]:
# Sparse matrix of float weights; same shape and number of stored entries as
# bow_matrix, since the non-zero positions are the same.
tfidf_matrix

<3x21 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [18]:
# Shape of the dense TF-IDF array: (documents, vocabulary terms).
tfidf_matrix.toarray().shape

(3, 21)

In [17]:
# Dense view of the TF-IDF weights. Terms found in fewer documents get a
# larger weight than more common ones (in row 0, "brown" outweighs "the").
tfidf_matrix.toarray()

array([[0.        , 0.35013871, 0.        , 0.35013871, 0.35013871,
        0.        , 0.        , 0.        , 0.35013871, 0.        ,
        0.26628951, 0.        , 0.        , 0.        , 0.35013871,
        0.        , 0.35013871, 0.        , 0.26628951, 0.35013871,
        0.        ],
       [0.        , 0.        , 0.32992832, 0.        , 0.        ,
        0.        , 0.32992832, 0.        , 0.        , 0.43381609,
        0.32992832, 0.        , 0.        , 0.43381609, 0.        ,
        0.        , 0.        , 0.43381609, 0.32992832, 0.        ,
        0.        ],
       [0.3179494 , 0.        , 0.48361742, 0.        , 0.        ,
        0.3179494 , 0.24180871, 0.3179494 , 0.        , 0.        ,
        0.        , 0.3179494 , 0.3179494 , 0.        , 0.        ,
        0.3179494 , 0.        , 0.        , 0.        , 0.        ,
        0.3179494 ]])

# N-grams

In [25]:
# ngram_range=(min_n, max_n) selects which n-gram sizes become features:
#   (1, 2) -> unigrams and bigrams (used here)
#   (2, 2) -> bigrams only
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))


In [26]:
# Fit on the same documents; the columns now cover unigrams and bigrams.
ngram_matrix = ngram_vectorizer.fit_transform(docs)

In [27]:
# Number of features: the 21 unigrams plus the bigrams found in `docs`.
ngram_vectorizer.get_feature_names_out().shape

(43,)

In [28]:
# Bigrams are formed from the token sequence left after tokenization, so they
# can bridge removed text: "is good" (the "a" was dropped) and "lazy just"
# (spanning the comma).
ngram_vectorizer.get_feature_names_out()

array(['and', 'and it', 'brown', 'brown fox', 'dog', 'dog and', 'dog is',
       'dogs', 'fox', 'fox jumps', 'good', 'good dog', 'is', 'is good',
       'is not', 'it', 'it likes', 'jumps', 'jumps over', 'just',
       'just resting', 'lazy', 'lazy dogs', 'lazy just', 'likes',
       'likes to', 'my', 'my dog', 'not', 'not lazy', 'over', 'over the',
       'play', 'quick', 'quick brown', 'resting', 'the', 'the dog',
       'the lazy', 'this', 'this quick', 'to', 'to play'], dtype=object)

In [29]:
# Dense count matrix: one row per document, one column per n-gram feature.
ngram_matrix.toarray()

array([[0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
        0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 2, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]])