# Feature Extraction - scikit-learn

## Import Libraries

In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

## Creating Data

In [3]:
# Toy corpus: each string is treated as one document.
text = [
    'This is the first line',
    'This is the second line',
    'This is another line',
]

## Count Vectorizer

In [4]:
# Default settings are used here, so common words such as 'this', 'is' and 'the'
# stay in the vocabulary. Alternative: CountVectorizer(stop_words='english').
count_vectorizer = CountVectorizer()

In [5]:
# Learn the vocabulary from `text` and encode each line as a row of token counts.
sparse_matrix = count_vectorizer.fit_transform(text)

In [6]:
# Dense view for inspection only; acceptable for this tiny 3x7 matrix.
sparse_matrix.todense()

matrix([[0, 1, 1, 1, 0, 1, 1],
        [0, 0, 1, 1, 1, 1, 1],
        [1, 0, 1, 1, 0, 0, 1]], dtype=int64)

In [7]:
# Token -> column index mapping learned during fitting.
count_vectorizer.vocabulary_

{'this': 6,
 'is': 2,
 'the': 5,
 'first': 1,
 'line': 3,
 'second': 4,
 'another': 0}

## TF-IDF Transformer

In [8]:
# Works on an existing count matrix (it is applied to `sparse_matrix` below), not on raw text.
tfidf_transformer = TfidfTransformer()

In [9]:
# Inspect the count matrix without densifying it: the repr reports shape, dtype and storage format.
sparse_matrix

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [10]:
# Fit the transformer on the counts and convert them to TF-IDF weights in one call.
tfidf_results = tfidf_transformer.fit_transform(sparse_matrix)

In [11]:
# Dense view for inspection only (small matrix).
tfidf_results.todense()

matrix([[0.        , 0.61722732, 0.3645444 , 0.3645444 , 0.        ,
         0.46941728, 0.3645444 ],
        [0.        , 0.        , 0.3645444 , 0.3645444 , 0.61722732,
         0.46941728, 0.3645444 ],
        [0.69903033, 0.        , 0.41285857, 0.41285857, 0.        ,
         0.        , 0.41285857]])

## TF-IDF Vectorizer

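`TfidfVectorizer` combines `CountVectorizer` and `TfidfTransformer` in a single step: it is fit directly on the raw text and produces the same TF-IDF matrix as the two-step approach above.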

In [12]:
# Unlike TfidfTransformer above, this is fit directly on the raw strings in `text`.
tfidf_vectorizer = TfidfVectorizer()

In [13]:
# Go from raw text to a TF-IDF weighted matrix in a single call.
tfidf_vectorizer_results = tfidf_vectorizer.fit_transform(text)

In [14]:
# The result is still sparse; its dtype is float64 rather than the int64 of the raw counts.
tfidf_vectorizer_results

<3x7 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [15]:
# Should match the CountVectorizer + TfidfTransformer output shown earlier.
tfidf_vectorizer_results.todense()

matrix([[0.        , 0.61722732, 0.3645444 , 0.3645444 , 0.        ,
         0.46941728, 0.3645444 ],
        [0.        , 0.        , 0.3645444 , 0.3645444 , 0.61722732,
         0.46941728, 0.3645444 ],
        [0.69903033, 0.        , 0.41285857, 0.41285857, 0.        ,
         0.        , 0.41285857]])

## Note

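Do not use `.todense()` on a large corpus: it materializes every zero entry and can exhaust memory. Keep the result as a sparse matrix and only densify small slices for inspection.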