# Tutorial - vectorize text documents in sklearn

This tutorial demonstrates how to use the scikit-learn (sklearn) package to vectorize text documents.

# Step 1: Read in data

In [1]:
# Load the tab-separated movie-review file into a DataFrame.
# NOTE(review): the name `input` shadows Python's builtin input(); it is kept
# unchanged here in case other cells refer to it.
import pandas as p

input = p.read_csv("../A-data/moviereview.tsv", delimiter='\t')

# The 'text' column is what gets vectorized in the following steps.
docs = input['text']

# Preview the first two documents as a raw array.
preview = docs[0:2]
print(preview.values)

['\'plot : two teen couples go to a church party , drink and then drive . \\nthey get into an accident . \\none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \\nwhat\\\'s the deal ? \\nwatch the movie and \\" sorta \\" find out . . . \\ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \\nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\\\'t snag this one correctly . \\nthey seem to have taken this pretty neat concept , but executed it terribly . \\nso what are the problems with the movie ? \\nwell , its main problem is that it\\\'s simply too jumbled . \\nit starts off \\" normal \\" but then downshifts into this \\" fantasy \\" world in w

# Step 2: Prepare Vectorizer

In [2]:
# scikit-learn provides two text vectorizers:
#
#   CountVectorizer -> Boolean or term-frequency (TF) vectors
#   http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
#
#   TfidfVectorizer -> TF or TF-IDF vectors
#   http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#
# Read the scikit-learn documentation to understand all vectorization options.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Several commonly used vectorizer settings.
# Options shared by every vectorizer below: latin-1 encoding, a minimum
# document frequency of 5, and the built-in English stop-word list.
shared_settings = dict(encoding='latin-1', min_df=5, stop_words='english')

# Unigram Boolean vectorizer
unigram_bool_vectorizer = CountVectorizer(binary=True, **shared_settings)

# Unigram term-frequency vectorizer
unigram_count_vectorizer = CountVectorizer(binary=False, **shared_settings)

# Unigram + bigram term-frequency vectorizer
gram12_count_vectorizer = CountVectorizer(ngram_range=(1, 2), **shared_settings)

# Unigram TF-IDF vectorizer
unigram_tfidf_vectorizer = TfidfVectorizer(use_idf=True, **shared_settings)


# Step 3: Vectorize the text documents

In [4]:
# A vectorizer exposes two steps:
#   fit       - collect the unique tokens of the documents into a vocabulary
#   transform - convert each document into a vector based on that vocabulary
# fit_transform() runs both together; fit() and transform() can also be used individually.

# Learn the vocabulary from docs and vectorize them in a single call
vecs = unigram_count_vectorizer.fit_transform(docs)

# Dimensions of the resulting document-term matrix
print(vecs.shape)

# Size of the constructed vocabulary
vocab = unigram_count_vectorizer.vocabulary_
print(len(vocab))

# First ten words of the vocabulary
words = unigram_count_vectorizer.get_feature_names_out()
print(words[:10])

# Note: the value stored in vocabulary_ is the word's index, not its frequency
print(vocab.get('year'))

(2000, 13724)
13724
['00' '000' '007' '10' '100' '1000' '101' '102' '105' '11']
13670


In [8]:
# Retrieve word frequencies with a scikit-learn vectorizer.
# By default CountVectorizer converts uppercase to lowercase and outputs
# word frequencies.
#
# Found by searching: "how to use sklearn vectorizer to get word frequency"
# https://www.google.com/search?client=safari&rls=en&q=how+to+use+sklearn+vectorizer+to+get+word+frequency&ie=UTF-8&oe=UTF-8
#
# This method converts the sparse matrix to a dense array and then sums it,
# which can be slow for a large data set.

cv = CountVectorizer(encoding='latin-1')
vecs = cv.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. .tolist() keeps word_list a
# plain Python list, as it was with the old method.
word_list = cv.get_feature_names_out().tolist()

# Summing over axis 0 adds up each word's count across all documents.
count_list = vecs.toarray().sum(axis=0)

In [9]:
# Pair every word with its summed count
freq = {term: total for term, total in zip(word_list, count_list)}

print(freq.get('couples'))            # word frequency
print(cv.vocabulary_.get('couples'))  # word index, not frequency

27
7962


In [25]:
# Reference: counting words with scikit-learn's CountVectorizer
# https://investigate.ai/text-analysis/counting-words-with-scikit-learns-countvectorizer/



In [26]:
import pandas as pd

# Build a dense table of counts: one row per document, one column per word.
# NOTE: toarray() materializes the whole matrix in memory.
term_columns = cv.get_feature_names_out()
counts = pd.DataFrame(vecs.toarray(), columns=term_columns)
counts

Unnamed: 0,00,000,0009f,007,00s,03,04,05,05425,10,...,zukovsky,zulu,zundel,zurg,zus,zweibel,zwick,zwigoff,zycie,zzzzzzz
0,0,0,0,0,0,0,0,0,0,10,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1998,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Number of words in cv's vocabulary
print(len(cv.get_feature_names_out()))

# Per-document counts for a handful of sentiment-related words
sentiment_words = ['excellent', 'terrible', 'great', 'poor', 'sad', 'angry', 'funny', 'hilarious']
counts[sentiment_words]

44502


Unnamed: 0,excellent,terrible,great,poor,sad,angry,funny,hilarious
0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
1995,0,0,0,0,1,0,5,1
1996,0,0,1,0,0,0,0,0
1997,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0


In [33]:
# Show the 10 words with the highest counts in the document at index 1.
# Transposing puts words on the rows; by=1 then sorts on that single
# document's column, so this is NOT a corpus-wide ranking.
(
    counts
    .transpose()
    .sort_values(by=1, ascending=False)
    .head(10)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
the,36,11,36,19,45,27,16,29,28,49,...,59,36,70,42,30,39,18,68,40,69
of,16,8,12,11,13,29,22,8,16,15,...,18,23,31,13,15,20,8,30,31,37
and,19,8,11,10,17,12,11,8,20,17,...,18,15,34,17,16,31,12,37,12,21
movie,6,5,2,0,2,2,0,7,0,7,...,0,2,2,4,4,14,0,5,3,8
in,8,4,2,3,14,15,14,14,14,12,...,14,8,12,12,10,8,4,17,7,8
know,1,4,1,0,1,0,0,1,1,2,...,0,1,3,1,0,2,0,0,0,1
that,13,4,13,5,8,13,5,8,7,12,...,17,14,16,7,7,9,4,5,9,10
here,1,3,0,1,1,0,0,0,0,0,...,0,0,1,2,1,1,1,0,1,1
is,12,3,17,6,18,14,9,11,6,13,...,18,3,39,0,13,18,14,26,19,20
it,21,3,8,6,13,7,9,3,5,8,...,11,7,37,6,3,10,3,4,4,13


In [34]:
# Count of the word 'love' in each document
counts.loc[:, 'love']

0       0
1       0
2       0
3       0
4       2
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Name: love, Length: 2000, dtype: int64