# Part Two:  Feature Extraction with Scikit-Learn

Let's explore the more realistic process of using sklearn to complete the tasks mentioned above!

In [1]:
# Toy corpus of three short documents used by every example in this notebook.
# The misspellings in the third document are part of the data: they appear
# verbatim as vocabulary entries in the outputs below.
text = [
    'this is a line',
    'this is another line',
    'its compltely diffrent line',
]

## CountVectorizer

In [2]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer

In [11]:
# Bag-of-words vectorizer with default settings (no stop-word filtering).
cv = CountVectorizer()

In [12]:
# Learn the vocabulary from `text` and build the document-term count matrix
# in a single call.
sparse_txt = cv.fit_transform(text)

In [13]:
# Dense view for display: one row per document, one column per vocabulary term.
sparse_txt.todense()

matrix([[0, 0, 0, 1, 0, 1, 1],
        [1, 0, 0, 1, 0, 1, 1],
        [0, 1, 1, 0, 1, 1, 0]], dtype=int64)

In [15]:
# Mapping from each learned token to its column index in the count matrix.
cv.vocabulary_

{'this': 6,
 'is': 3,
 'line': 5,
 'another': 0,
 'its': 4,
 'compltely': 1,
 'diffrent': 2}

In [16]:
# Rebind `cv` to a vectorizer that drops scikit-learn's built-in English
# stop words before counting.
cv = CountVectorizer(stop_words='english')

In [19]:
# Refit on the same corpus; with stop words removed only three terms remain.
cv.fit_transform(text).todense()

matrix([[0, 0, 1],
        [0, 0, 1],
        [1, 1, 1]], dtype=int64)

In [20]:
# Inspect the reduced vocabulary left after stop-word filtering.
cv.vocabulary_

{'line': 2, 'compltely': 0, 'diffrent': 1}

## TfidfTransformer

TfidfVectorizer works directly on raw text documents, while TfidfTransformer works on an existing count matrix, such as the one returned by CountVectorizer.

In [30]:
# TF-IDF transformer with default settings; it takes a count matrix as input,
# not raw text.
tfid_transform = TfidfTransformer()

In [31]:
# Rebind `cv` to a default CountVectorizer, i.e. without stop-word filtering.
cv = CountVectorizer()

In [32]:
# Raw term counts for the corpus; this is the input to the TF-IDF transformer.
count = cv.fit_transform(text)

In [33]:
# Show the sparse-matrix summary (shape, dtype, number of stored elements).
count

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [35]:
# Fit the IDF weights on the counts and convert the counts to TF-IDF values.
tdid = tfid_transform.fit_transform(count)

In [36]:
# Dense view of the TF-IDF matrix for display.
tdid.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.        ,
         0.48133417, 0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.        ,
         0.37311881, 0.4804584 ],
        [0.        , 0.54645401, 0.54645401, 0.        , 0.54645401,
         0.32274454, 0.        ]])

## Pipeline

In [38]:
from sklearn.pipeline import Pipeline

In [43]:
# Chain the two stages so raw text goes in and TF-IDF features come out:
# 'cv' produces term counts, 'tfidf' reweights them.
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])

In [45]:
# Fit both pipeline steps on the raw text and return the transformed features.
results = pipe.fit_transform(text)

In [46]:
# Show the sparse-matrix summary of the pipeline output.
results

<3x7 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [47]:
# Dense view for display; the values match the manual
# CountVectorizer + TfidfTransformer result shown earlier.
results.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.        ,
         0.48133417, 0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.        ,
         0.37311881, 0.4804584 ],
        [0.        , 0.54645401, 0.54645401, 0.        , 0.54645401,
         0.32274454, 0.        ]])

## TfidfVectorizer
TfidfVectorizer performs both steps (counting and TF-IDF weighting) in a single estimator.

In [48]:
# TfidfVectorizer with default settings; it accepts raw text directly.
tfid_v  = TfidfVectorizer() 

In [50]:
# Learn the vocabulary and IDF weights from `text` and return the TF-IDF matrix.
result=tfid_v.fit_transform(text)

In [51]:
# Dense view for display; the values match the two-step and pipeline results.
result.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.        ,
         0.48133417, 0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.        ,
         0.37311881, 0.4804584 ],
        [0.        , 0.54645401, 0.54645401, 0.        , 0.54645401,
         0.32274454, 0.        ]])