In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pprint

In [2]:
# Toy corpus of four short sentences; every vectorizer cell below is fitted on this list.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [3]:
# Unigram bag-of-words: ngram_range=(1, 1) keeps only single-token features.
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
# fit_transform learns the vocabulary from `documents` and returns the
# document-term count matrix in a single call.
unigram_counts = unigram_vectorizer.fit_transform(documents)
unigram_features = unigram_vectorizer.get_feature_names_out()

# .toarray() converts the count matrix to a dense array for display.
print("Unigrams:")
print(f"Features: {unigram_features}")
print(f"Counts:\n {unigram_counts.toarray()}")

Unigrams:
Features: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Counts:
 [[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [10]:
# Bigram bag-of-words: ngram_range=(2, 2) keeps only two-token features.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
# fit_transform learns the vocabulary from `documents` and returns the
# document-term count matrix in a single call.
bigram_counts = bigram_vectorizer.fit_transform(documents)
bigram_features = bigram_vectorizer.get_feature_names_out()

# .toarray() converts the count matrix to a dense array for display.
print("\nBigrams:")
print(f"Features: {bigram_features}")
print(f"Counts:\n {bigram_counts.toarray()}")


Bigrams:
Features: ['and this' 'document is' 'first document' 'is the' 'is this'
 'second document' 'the first' 'the second' 'the third' 'third one'
 'this document' 'this is' 'this the']
Counts:
 [[0 0 1 1 0 0 1 0 0 0 0 1 0]
 [0 1 0 1 0 1 0 1 0 0 1 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 1 0]
 [0 0 1 0 1 0 1 0 0 0 0 0 1]]


In [11]:
# Trigram bag-of-words: ngram_range=(3, 3) keeps only three-token features.
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
# fit_transform learns the vocabulary from `documents` and returns the
# document-term count matrix in a single call.
trigram_counts = trigram_vectorizer.fit_transform(documents)
trigram_features = trigram_vectorizer.get_feature_names_out()

# .toarray() converts the count matrix to a dense array for display.
print("\nTrigrams:")
print(f"Features: {trigram_features}")
print(f"Counts:\n {trigram_counts.toarray()}")


Trigrams:
Features: ['and this is' 'document is the' 'is the first' 'is the second'
 'is the third' 'is this the' 'the first document' 'the second document'
 'the third one' 'this document is' 'this is the' 'this the first']
Counts:
 [[0 0 1 0 0 0 1 0 0 0 1 0]
 [0 1 0 1 0 0 0 1 0 1 0 0]
 [1 0 0 0 1 0 0 0 1 0 1 0]
 [0 0 0 0 0 1 1 0 0 0 0 1]]


In [12]:
# TF-IDF weighting, shown here on unigrams. The same vectorizer handles longer
# n-grams by changing ngram_range, e.g. (2, 2) for bigrams.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
# fit_transform learns the vocabulary from `documents` and returns the
# TF-IDF weighted document-term matrix in a single call.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_features = tfidf_vectorizer.get_feature_names_out()

# .toarray() converts the matrix to a dense array for display.
print("\nTF-IDF (Unigrams):")
print(f"Features: {tfidf_features}")
print(f"TF-IDF Matrix:\n {tfidf_matrix.toarray()}")



TF-IDF (Unigrams):
Features: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
TF-IDF Matrix:
 [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [18]:
# TF-IDF weighting over bigrams: ngram_range=(2, 2) keeps only two-token features.
tfidf_bigram_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
# fit_transform learns the vocabulary from `documents` and returns the
# TF-IDF weighted document-term matrix in a single call.
tfidf_bigram_matrix = tfidf_bigram_vectorizer.fit_transform(documents)
tfidf_bigram_features = tfidf_bigram_vectorizer.get_feature_names_out()

print("\nTF-IDF (Bigrams):")
print(f"Features: {tfidf_bigram_features}")
# pprint emits the dense matrix in its `array([...])` repr form, so this cell
# prints no "TF-IDF Matrix:" label in front of the values.
pprint.pprint(tfidf_bigram_matrix.toarray())


TF-IDF (Bigrams):
Features: ['and this' 'document is' 'first document' 'is the' 'is this'
 'second document' 'the first' 'the second' 'the third' 'third one'
 'this document' 'this is' 'this the']
array([[0.        , 0.        , 0.52303503, 0.42344193, 0.        ,
        0.        , 0.52303503, 0.        , 0.        , 0.        ,
        0.        , 0.52303503, 0.        ],
       [0.        , 0.47633035, 0.        , 0.30403549, 0.        ,
        0.47633035, 0.        , 0.47633035, 0.        , 0.        ,
        0.47633035, 0.        , 0.        ],
       [0.49819711, 0.        , 0.        , 0.31799276, 0.        ,
        0.        , 0.        , 0.        , 0.49819711, 0.49819711,
        0.        , 0.39278432, 0.        ],
       [0.        , 0.        , 0.43779123, 0.        , 0.55528266,
        0.        , 0.43779123, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.55528266]])
