# Encode Text Data

## 0. Introduction

This notebook shows how to:
  1. Convert text to word count vectors with CountVectorizer
  2. Convert text to TF-IDF-weighted word frequency vectors with TfidfVectorizer
  3. Convert text to fixed-length vectors by hashing words to integer indices with HashingVectorizer

## 1. Word counts with CountVectorizer

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Learn a vocabulary from a one-document corpus. fit() returns the fitted
# vectorizer, so construction and fitting are chained into one statement.
text = ['The quick brown fox jumped over a lazy dog']
vectorizer = CountVectorizer().fit(text)

# vocabulary_ maps each learned token to its column index.
print(vectorizer.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [4]:
# Encode the document with the vectorizer fitted above. Printing the result
# lists its stored (row, column) entries together with their values.
vector = vectorizer.transform(text)
print(vector)

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1


In [6]:
# Show the encoded result's dimensions, its container type, and its dense
# array form, one item per line.
print(vector.shape, type(vector), vector.toarray(), sep="\n")

(1, 8)
<class 'scipy.sparse._csr.csr_matrix'>
[[1 1 1 1 1 1 1 1]]


## 2. Word frequency with TfidfVectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Three-document corpus: 'the' occurs in every document, while 'dog' and
# 'fox' each occur in two of them.
text = [
    'The quick brown fox jumped over a lazy dog.',
    'The dog.',
    'The fox.',
]

# fit() returns the fitted vectorizer, so it is chained onto the constructor.
vectorizer = TfidfVectorizer().fit(text)

# Print the token -> column index mapping, then the per-column IDF weights.
print(vectorizer.vocabulary_, vectorizer.idf_, sep="\n")

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]


In [17]:
# Encode only the first document. transform() is handed a list here, so the
# single string is wrapped in one.
vector = vectorizer.transform([text[0]])
print(vector)

  (0, 7)	0.23155850231624595
  (0, 6)	0.3920626253314354
  (0, 5)	0.3920626253314354
  (0, 4)	0.3920626253314354
  (0, 3)	0.3920626253314354
  (0, 2)	0.298173732156414
  (0, 1)	0.298173732156414
  (0, 0)	0.3920626253314354


In [18]:
# Show the encoded result's dimensions, its container type, and its dense
# array form, one item per line.
print(vector.shape, type(vector), vector.toarray(), sep="\n")

(1, 8)
<class 'scipy.sparse._csr.csr_matrix'>
[[0.39206263 0.29817373 0.29817373 0.39206263 0.39206263 0.39206263
  0.39206263 0.2315585 ]]


## 3. Hashing with HashingVectorizer

In [19]:
from sklearn.feature_extraction.text import HashingVectorizer

In [20]:
# Single-document corpus for the hashing example.
text = ['The quick brown fox jumped over a lazy dog.']

# n_features fixes the output width at 20 columns. No fit() call is made:
# transform() is applied directly to the raw text.
vectorizer = HashingVectorizer(n_features=20)
vector = vectorizer.transform(text)

# Show the result's dimensions, its container type, and its dense array form,
# one item per line.
print(vector.shape, type(vector), vector.toarray(), sep="\n")

(1, 20)
<class 'scipy.sparse._csr.csr_matrix'>
[[ 0.          0.          0.          0.          0.          0.40824829
   0.         -0.40824829  0.40824829  0.          0.          0.40824829
   0.          0.          0.         -0.40824829  0.          0.
  -0.40824829  0.        ]]
