In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Global matplotlib default: do not draw minor tick marks on the y-axis.
plt.rcParams.update({'ytick.minor.visible': False})

# CountVectorizer

In [31]:
# CountVectorizer counts the number of times each word appears in each document

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

# A corpus is a collection of documents; here every document is a single string.
corpus = [
    'The woods are lovely, dark and deep',
    'But I have promises to keep',
    'And miles to go before I sleep',
    'And miles to go before I sleep',
]

# Create a CountVectorizer with its default settings
vectorizer = CountVectorizer()

# Step 1: learn the vocabulary from the corpus.
# Step 2: turn the corpus into a (sparse) matrix of word counts.
# The two steps together are equivalent to the one-liner
# X = vectorizer.fit_transform(corpus)
vectorizer.fit(corpus)
X = vectorizer.transform(corpus)

# Wrap the counts in a DataFrame (one row per document, one column per word)
# so the columns are labelled; X.toarray() alone would show the bare numbers.
df = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
display(df.head())

# Note that the word "I" has no column in the DataFrame: the
# "default configuration tokenizes the string by extracting words of at least 2 letters."
# https://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage

Unnamed: 0,and,are,before,but,dark,deep,go,have,keep,lovely,miles,promises,sleep,the,to,woods
0,1,1,0,0,1,1,0,0,0,1,0,0,0,1,0,1
1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0
2,1,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0
3,1,0,1,0,0,0,1,0,0,0,1,0,1,0,1,0


In [33]:
df.sum(axis=0)  # axis=0 is the default: each column is added up over all rows, giving every word's total count across the documents

and         3
are         1
before      2
but         1
dark        1
deep        1
go          2
have        1
keep        1
lovely      1
miles       2
promises    1
sleep       2
the         1
to          3
woods       1
dtype: int64

In [34]:
# A prettier way to display the results: a one-column table of word totals,
# most frequent first, with the index labelled 'word'.
(
    df.sum(axis=0)
    .sort_values(ascending=False)
    .rename_axis('word')
    .to_frame('count')
)

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
and,3
to,3
before,2
go,2
miles,2
sleep,2
are,1
but,1
dark,1
deep,1


# TfidfVectorizer

In [35]:
# https://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting

In [36]:
# TfidfVectorizer is similar to CountVectorizer, but it re-weights each word count by an inverse-document-frequency factor, so words that appear in many documents of the corpus count for less

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The same four-document corpus used for the CountVectorizer example
corpus = [
    'The woods are lovely, dark and deep',
    'But I have promises to keep',
    'And miles to go before I sleep',
    'And miles to go before I sleep',
]

# norm=None switches off normalization, so the rows of the output are NOT
# rescaled to unit length and the raw tf-idf weights stay visible.
vectorizer = TfidfVectorizer(norm=None)

# Learn the vocabulary and idf weights, then build the matrix of TF-IDF
# features in one call (same result as fit(corpus) followed by transform(corpus)).
X = vectorizer.fit_transform(corpus)

# Put the dense matrix into a DataFrame so every column is labelled with its word
df = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Display the resulting DataFrame
df

Unnamed: 0,and,are,before,but,dark,deep,go,have,keep,lovely,miles,promises,sleep,the,to,woods
0,1.223144,1.916291,0.0,0.0,1.916291,1.916291,0.0,0.0,0.0,1.916291,0.0,0.0,0.0,1.916291,0.0,1.916291
1,0.0,0.0,0.0,1.916291,0.0,0.0,0.0,1.916291,1.916291,0.0,0.0,1.916291,0.0,0.0,1.223144,0.0
2,1.223144,0.0,1.510826,0.0,0.0,0.0,1.510826,0.0,0.0,0.0,1.510826,0.0,1.510826,0.0,1.223144,0.0
3,1.223144,0.0,1.510826,0.0,0.0,0.0,1.510826,0.0,0.0,0.0,1.510826,0.0,1.510826,0.0,1.223144,0.0


## Let's look at a much simpler example to understand where these numbers come from.

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A tiny corpus: two documents that share the word "the"
corpus = [
    'The first',
    'The second',
]

# norm=None switches off normalization, so the rows of the output are NOT
# rescaled to unit length and the raw tf-idf weights stay visible.
vectorizer = TfidfVectorizer(norm=None)

# Learn the vocabulary and idf weights, then build the matrix of TF-IDF
# features in one call (same result as fit(corpus) followed by transform(corpus)).
X = vectorizer.fit_transform(corpus)

# Put the dense matrix into a DataFrame so every column is labelled with its word
df = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Display the resulting DataFrame
df

Unnamed: 0,first,second,the
0,1.405465,0.0,1.0
1,0.0,1.405465,1.0


## Even simpler...

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a corpus of text documents: two one-word documents with no words in common
corpus = [
'first',
'second',
]
# Initialize a TfidfVectorizer object # Equivalent to CountVectorizer followed by TfidfTransformer
# norm=None disables normalization, which means that the output won't be normalized to unit length.
# The documented values for norm are 'l1', 'l2' or None; norm=False is not one of them and is
# rejected by the parameter validation in newer scikit-learn releases, so None is used here.
vectorizer = TfidfVectorizer(norm=None)

# Fit the vectorizer on the corpus of documents
vectorizer.fit(corpus)

# Transform the corpus into a matrix of TF-IDF features
X = vectorizer.transform(corpus)

# Print the resulting matrix of TF-IDF features
#print(X.toarray())

# Convert the matrix to a pandas DataFrame (one row per document, one column per word)
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Display the resulting DataFrame
df

Unnamed: 0,first,second
0,1.405465,0.0
1,0.0,1.405465


## So where do these numbers come from?

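tf-idf stands for term frequency times inverse document frequency: $\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$, where $\text{tf}(t, d)$ is the number of times term $t$ occurs in document $d$. In the examples above every word occurs at most once per document, so each non-zero entry in the tables is simply $\text{idf}(t)$.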

From the sklearn [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer):

The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less informative than features that occur in a small fraction of the training corpus.

If smooth_idf=True (the default), the constant “1” is added to the numerator and denominator of the idf as if an extra document was seen containing every term in the collection exactly once, which prevents zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.

In [40]:
# Work through the idf equation from the documentation by hand to reproduce
# the 1.405465 entries shown above

n = 2         # total number of documents in the corpus
df_first = 1  # document frequency of "first": the number of documents that contain it

# With smooth_idf=False the formula would instead be: np.log(n / df_first) + 1

# smooth_idf=True (the default): add 1 to both the numerator and the denominator
idf2 = 1 + np.log((n + 1) / (df_first + 1))
print(idf2)

1.4054651081081644
