In [1]:
import numpy as np
import pandas as pd
from sklearn import set_config
# Ask scikit-learn transformers to return pandas output where they support it.
# NOTE(review): the text vectorizers used below still return SciPy sparse
# matrices (see the csr_matrix type check further down), which is why every
# DataFrame in this notebook is built by hand from `.toarray()`.
set_config(transform_output='pandas')

In [2]:
# Toy corpus: four short strings, each one treated as a separate document
X = np.array((
    "I enjoy learning new programming languages. The best is Python. Programming is so fun!",
    "I love programming, I would give it an A+!",
    "Programming is amazing. Programming is love. Programming is life.",
    "Python is my favorite programming language.",
))

**Count Vectorization**

In [3]:
# Sample sentences for the count-vectorization section.
# NOTE(review): this re-declares the same four sentences already assigned to X
# at the top of the notebook; the values are identical.
X = np.asarray([
    "I enjoy learning new programming languages. The best is Python. Programming is so fun!",
    "I love programming, I would give it an A+!",
    "Programming is amazing. Programming is love. Programming is life.",
    "Python is my favorite programming language.",
])

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Instantiate a bag-of-words vectorizer with default settings
vectorizer = CountVectorizer()
# Learn the vocabulary from the sentences; counts are produced later by transform()
vectorizer.fit(X)

In [5]:
# The fitted vocabulary is stored on the vectorizer as a mapping of
# term -> column index in the transformed matrix
vocab_dict = vectorizer.vocabulary_
type(vocab_dict)

dict

In [6]:
# How many distinct terms are in the learned vocabulary?
len(vocab_dict)

21

In [7]:
# Inspect the term -> column index mapping
vocab_dict

{'enjoy': 3,
 'learning': 11,
 'new': 15,
 'programming': 16,
 'languages': 10,
 'the': 19,
 'best': 2,
 'is': 7,
 'python': 17,
 'so': 18,
 'fun': 5,
 'love': 13,
 'would': 20,
 'give': 6,
 'it': 8,
 'an': 1,
 'amazing': 0,
 'life': 12,
 'my': 14,
 'favorite': 4,
 'language': 9}

In [8]:
# To obtain the counts, transform X with the fitted vectorizer.
# The result is a SciPy sparse matrix, not a dense array.
X_count = vectorizer.transform(X)
type(X_count)

scipy.sparse._csr.csr_matrix

In [9]:
# Convert the sparse matrix to a dense array purely for display
# (one row per sentence, one column per vocabulary term)
X_count.toarray()

array([[0, 0, 1, 1, 0, 1, 0, 2, 0, 0, 1, 1, 0, 0, 0, 1, 2, 1, 1, 1, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0]])

In [10]:
# Check the shape of the count matrix: (number of sentences, vocabulary size)
X_count.shape

(4, 21)

In [11]:
# Wrap the dense counts in a DataFrame whose columns are the vocabulary terms
X_count_df = pd.DataFrame(
    data=X_count.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
X_count_df

Unnamed: 0,amazing,an,best,enjoy,favorite,fun,give,is,it,language,...,learning,life,love,my,new,programming,python,so,the,would
0,0,0,1,1,0,1,0,2,0,0,...,1,0,0,0,1,2,1,1,1,0
1,0,1,0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,0,1
2,1,0,0,0,0,0,0,3,0,0,...,0,1,1,0,0,3,0,0,0,0
3,0,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,1,1,0,0,0


**Using Sklearn's TfidfVectorizer**

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighting with default settings: learn the vocabulary and transform in one call
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)
# Densify and label the columns with the learned terms
X_tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
# Rounded only for display; X_tfidf_df itself keeps full precision
X_tfidf_df.round(decimals=4)

Unnamed: 0,amazing,an,best,enjoy,favorite,fun,give,is,it,language,...,learning,life,love,my,new,programming,python,so,the,would
0,0.0,0.0,0.297,0.297,0.0,0.297,0.0,0.3791,0.0,0.0,...,0.297,0.0,0.0,0.0,0.297,0.3099,0.2341,0.297,0.297,0.0
1,0.0,0.452,0.0,0.0,0.0,0.0,0.452,0.0,0.452,0.0,...,0.0,0.0,0.3564,0.0,0.0,0.2359,0.0,0.0,0.0,0.452
2,0.3383,0.0,0.0,0.0,0.0,0.0,0.0,0.6477,0.0,0.0,...,0.0,0.3383,0.2667,0.0,0.0,0.5296,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.4822,0.0,0.0,0.3078,0.0,0.4822,...,0.0,0.0,0.0,0.4822,0.0,0.2516,0.3801,0.0,0.0,0.0


**Preprocessing using sklearn's vectorizers**

In [13]:
# Count vectorizer that drops scikit-learn's built-in English stopwords
vectorizer_stopped = CountVectorizer(stop_words='english')
# Learn the vocabulary and produce the count matrix in a single step
X_vec = vectorizer_stopped.fit_transform(X)
# Dense, labelled view of the stopword-filtered counts
X_stopped = pd.DataFrame(
    data=X_vec.toarray(),
    columns=vectorizer_stopped.get_feature_names_out(),
)
X_stopped

Unnamed: 0,amazing,best,enjoy,favorite,fun,language,languages,learning,life,love,new,programming,python
0,0,1,1,0,1,0,1,1,0,0,1,2,1
1,0,0,0,0,0,0,0,0,0,1,0,1,0
2,1,0,0,0,0,0,0,0,1,1,0,3,0
3,0,0,0,1,0,1,0,0,0,0,0,1,1


In [14]:
# Compare vocabulary sizes: default vectorizer vs. the one with English stopwords removed
print(f"# of terms in original vocabulary: {len(vectorizer.vocabulary_)}")
print(f"# of terms in stopwords-removed vocabulary: {len(vectorizer_stopped.vocabulary_)}")

# of terms in original vocabulary: 21
# of terms in stopwords-removed vocabulary: 13


In [15]:
# TF-IDF vectorizer that drops scikit-learn's built-in English stopwords
vectorizer_tfidf_stopped = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and compute the TF-IDF matrix in a single step
X_vec_tfidf_stopped = vectorizer_tfidf_stopped.fit_transform(X)
# Dense, labelled view of the stopword-filtered TF-IDF weights
X_stopped_tfidf = pd.DataFrame(
    data=X_vec_tfidf_stopped.toarray(),
    columns=vectorizer_tfidf_stopped.get_feature_names_out(),
)
X_stopped_tfidf

Unnamed: 0,amazing,best,enjoy,favorite,fun,language,languages,learning,life,love,new,programming,python
0,0.0,0.360121,0.360121,0.0,0.360121,0.0,0.360121,0.360121,0.0,0.0,0.360121,0.375852,0.283924
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.833884,0.0,0.551939,0.0
2,0.444008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444008,0.350061,0.0,0.695105,0.0
3,0.0,0.0,0.0,0.587838,0.0,0.587838,0.0,0.0,0.0,0.0,0.0,0.306758,0.463458


In [16]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Extend scikit-learn's English stopword collection with a corpus-specific term.
# A list is built because stop_words is passed as a list below.
custom_stopwords = list(ENGLISH_STOP_WORDS) + ['programming']
# Count vectorizer that filters out the extended stopword list
vectorizer_stopped_custom = CountVectorizer(stop_words=custom_stopwords)
# Learn the vocabulary and produce the count matrix in a single step
X_vec = vectorizer_stopped_custom.fit_transform(X)
# Dense, labelled view of the counts
X_stopped_custom = pd.DataFrame(
    data=X_vec.toarray(),
    columns=vectorizer_stopped_custom.get_feature_names_out(),
)
X_stopped_custom

Unnamed: 0,amazing,best,enjoy,favorite,fun,language,languages,learning,life,love,new,python
0,0,1,1,0,1,0,1,1,0,0,1,1
1,0,0,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,1,1,0,0
3,0,0,0,1,0,1,0,0,0,0,0,1


**Change Tokenization Method**

In [17]:
from nltk import wordpunct_tokenize
# Count vectorizer that removes English stopwords and tokenizes with NLTK's
# wordpunct_tokenize instead of the default regex (hence token_pattern=None).
# With this tokenizer, punctuation tokens show up as features in the output.
vectorizer_nltk = CountVectorizer(
    stop_words='english',
    tokenizer=wordpunct_tokenize,
    token_pattern=None,
)
# Learn the vocabulary and produce the count matrix in a single step
X_count_nltk = vectorizer_nltk.fit_transform(X)
# Label the columns with the learned vocabulary
X_count_nltk_df = pd.DataFrame(
    data=X_count_nltk.toarray(),
    columns=vectorizer_nltk.get_feature_names_out(),
)
X_count_nltk_df

Unnamed: 0,!,+!,",",.,amazing,best,enjoy,favorite,fun,language,languages,learning,life,love,new,programming,python
0,1,0,0,2,0,1,1,0,1,0,1,1,0,0,1,2,1
1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2,0,0,0,3,1,0,0,0,0,0,0,0,1,1,0,3,0
3,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1


In [18]:
from nltk import wordpunct_tokenize
# TF-IDF vectorizer that removes English stopwords and tokenizes with NLTK's
# wordpunct_tokenize instead of the default regex (hence token_pattern=None).
# The results get TF-IDF-specific names so they neither overwrite nor get
# mistaken for the count-based `vectorizer_nltk` / `X_count_nltk` objects.
vectorizer_tfidf_nltk = TfidfVectorizer(stop_words='english',
                                        tokenizer=wordpunct_tokenize, token_pattern=None)
# Learn the vocabulary and compute the TF-IDF matrix in a single step
X_tfidf_nltk = vectorizer_tfidf_nltk.fit_transform(X)
# Label the columns with the learned vocabulary
X_tfidf_nltk_df = pd.DataFrame(X_tfidf_nltk.toarray(), columns=vectorizer_tfidf_nltk.get_feature_names_out())
X_tfidf_nltk_df

Unnamed: 0,!,+!,",",.,amazing,best,enjoy,favorite,fun,language,languages,learning,life,love,new,programming,python
0,0.310978,0.0,0.0,0.396986,0.0,0.310978,0.310978,0.0,0.310978,0.0,0.310978,0.310978,0.0,0.0,0.310978,0.324562,0.245178
1,0.0,0.587838,0.587838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.463458,0.0,0.306758,0.0
2,0.0,0.0,0.0,0.647743,0.338271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338271,0.266697,0.0,0.529572,0.0
3,0.0,0.0,0.0,0.351295,0.0,0.0,0.0,0.550372,0.0,0.550372,0.0,0.0,0.0,0.0,0.0,0.287207,0.433919


In [19]:
# NGRAMS

# Count vectorizer over unigrams and bigrams, with English stopwords removed
vectorizer_ngrams = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)
# Learn the vocabulary and produce the count matrix in a single step
X_vec = vectorizer_ngrams.fit_transform(X)
# Dense, labelled view of the unigram + bigram counts
X_ngrams = pd.DataFrame(
    data=X_vec.toarray(),
    columns=vectorizer_ngrams.get_feature_names_out(),
)
X_ngrams

Unnamed: 0,amazing,amazing programming,best,best python,enjoy,enjoy learning,favorite,favorite programming,fun,language,...,programming,programming amazing,programming fun,programming language,programming languages,programming life,programming love,python,python favorite,python programming
0,0,0,1,1,1,1,0,0,1,0,...,2,0,1,0,1,0,0,1,0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,3,1,0,0,0,1,1,0,0,0
3,0,0,0,0,0,0,1,1,0,1,...,1,0,0,1,0,0,0,1,1,0


In [20]:
# Count vectorizer over unigrams, bigrams and trigrams, with English stopwords removed.
# NOTE: this rebinds `vectorizer_ngrams`, `X_vec` and `X_ngrams` from the bigram cell.
vectorizer_ngrams = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)
# Learn the vocabulary and produce the count matrix in a single step
X_vec = vectorizer_ngrams.fit_transform(X)
# Dense, labelled view of the 1- to 3-gram counts
X_ngrams = pd.DataFrame(
    data=X_vec.toarray(),
    columns=vectorizer_ngrams.get_feature_names_out(),
)
X_ngrams

Unnamed: 0,amazing,amazing programming,amazing programming love,best,best python,best python programming,enjoy,enjoy learning,enjoy learning new,favorite,...,programming languages,programming languages best,programming life,programming love,programming love programming,python,python favorite,python favorite programming,python programming,python programming fun
0,0,0,0,1,1,1,1,1,1,0,...,1,1,0,0,0,1,0,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,1,0,0


In [21]:
# Vocabulary size of the most recently fit `vectorizer_ngrams` (ngram_range=(1,3))
len(vectorizer_ngrams.vocabulary_)

42

**Manually Controlling Vocabulary Size**

In [22]:
# Cap the vocabulary at 10 terms (max_features), after removing English stopwords
vectorizer_max10 = CountVectorizer(
    stop_words='english',
    max_features=10,
)
# Learn the vocabulary and produce the count matrix in a single step
X_vec = vectorizer_max10.fit_transform(X)
# Dense, labelled view of the capped vocabulary counts
X_max10 = pd.DataFrame(
    data=X_vec.toarray(),
    columns=vectorizer_max10.get_feature_names_out(),
)
X_max10

Unnamed: 0,amazing,best,enjoy,favorite,fun,language,languages,love,programming,python
0,0,1,1,0,1,0,1,0,2,1
1,0,0,0,0,0,0,0,1,1,0
2,1,0,0,0,0,0,0,1,3,0
3,0,0,0,1,0,1,0,0,1,1


In [23]:
# Drop terms that appear in more than half of the sentences (max_df=.5),
# after removing English stopwords
vectorizer_maxdf = CountVectorizer(
    stop_words='english',
    max_df=.5,
)
# Learn the vocabulary and produce the count matrix in a single step
X_vec = vectorizer_maxdf.fit_transform(X)
# Dense, labelled view of the remaining terms
X_maxdf = pd.DataFrame(
    data=X_vec.toarray(),
    columns=vectorizer_maxdf.get_feature_names_out(),
)
X_maxdf

Unnamed: 0,amazing,best,enjoy,favorite,fun,language,languages,learning,life,love,new,python
0,0,1,1,0,1,0,1,1,0,0,1,1
1,0,0,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,1,1,0,0
3,0,0,0,1,0,1,0,0,0,0,0,1


In [24]:
# Keep only terms that appear in at least half of the sentences (min_df=.5),
# after removing English stopwords
vectorizer_mindf = CountVectorizer(
    stop_words='english',
    min_df=.5,
)
# Learn the vocabulary and produce the count matrix in a single step
X_vec = vectorizer_mindf.fit_transform(X)
# Dense, labelled view of the remaining terms
X_mindf = pd.DataFrame(
    data=X_vec.toarray(),
    columns=vectorizer_mindf.get_feature_names_out(),
)
X_mindf

Unnamed: 0,love,programming,python
0,0,2,1
1,1,1,0
2,1,3,0
3,0,1,1
