In [1]:
import math
from collections import defaultdict

In [2]:
# Sample text data (documents): three short sentences with overlapping
# vocabulary, used as the toy corpus for the TF-IDF computation
docs = [
    "this is a sample document",
    "this document is a sample",
    "sample document is simple and short"
]

In [3]:
# Preprocess: lowercase the text, then tokenize on whitespace
def preprocess(text):
    """Return the lowercase, whitespace-delimited tokens of ``text``.

    Punctuation is not stripped, so it stays attached to its neighbouring
    token. An empty or all-whitespace string yields an empty list.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

In [4]:
# Generate n-grams (n=1 unigram, n=2 bigram, n=3 trigram, ...)
def generate_ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens, joined by single spaces.

    Args:
        tokens: Sequence of string tokens.
        n: Number of tokens per n-gram; must be at least 1.

    Returns:
        List of n-gram strings in order of appearance. The list is empty
        when ``tokens`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Without this guard, n == 0 silently produces len(tokens) + 1 empty
        # strings and a negative n produces meaningless slices.
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

In [5]:
# Build the corpus with unigrams, bigrams, trigrams (all n-grams up to max_n)
def build_corpus(docs, max_n=3):
    """Turn raw documents into lists of n-gram terms.

    Each document is tokenized with ``preprocess`` and expanded into all of
    its n-grams for n = 1 .. ``max_n`` via ``generate_ngrams``.

    Args:
        docs: Iterable of document strings.
        max_n: Largest n-gram size to generate. The default of 3 yields
            unigrams, bigrams and trigrams.

    Returns:
        List with one entry per document; each entry is a list of term
        strings ordered by n-gram size (all unigrams first, then bigrams,
        and so on), and by position within each size.
    """
    corpus = []
    for doc in docs:
        tokens = preprocess(doc)
        terms = []
        # Ascending n keeps the unigram, bigram, trigram ordering.
        for n in range(1, max_n + 1):
            terms.extend(generate_ngrams(tokens, n))
        corpus.append(terms)
    return corpus

In [6]:
# Term Frequency (TF)
def compute_tf(corpus):
    """Compute relative term frequencies for every document.

    Args:
        corpus: List of documents, each a list of term strings.

    Returns:
        List with one ``defaultdict(int)`` per document, mapping each term to
        its occurrence count divided by the document's total number of terms.
        An empty document produces an empty mapping.
    """
    tf_scores = []
    for terms in corpus:
        # Raw occurrence count of each term in this document
        counts = defaultdict(int)
        for term in terms:
            counts[term] += 1
        n_terms = len(terms)
        # Normalize counts by document length; nothing is divided when the
        # document (and therefore ``counts``) is empty.
        frequencies = defaultdict(
            int,
            {term: count / n_terms for term, count in counts.items()},
        )
        tf_scores.append(frequencies)
    return tf_scores

In [7]:
# Inverse Document Frequency (IDF)
def compute_idf(corpus):
    """Compute ``ln(N / df) + 1`` for every term in the corpus.

    ``N`` is the number of documents and ``df`` is the number of documents
    that contain the term at least once.

    Args:
        corpus: List of documents, each a list of term strings.

    Returns:
        ``defaultdict(int)`` mapping each term to its IDF score.
    """
    n_docs = len(corpus)
    # Document frequency: how many documents contain each term
    doc_freq = defaultdict(int)
    for doc in corpus:
        for term in set(doc):  # set() so repeats inside one document count once
            doc_freq[term] += 1
    # df is always >= 1 here, so the division is safe. The +1 keeps terms that
    # occur in every document (where ln(N / df) == 0) from getting a zero score.
    return defaultdict(
        int,
        {term: math.log(n_docs / df) + 1 for term, df in doc_freq.items()},
    )

In [8]:
# Compute TF-IDF
def compute_tfidf(tf_scores, idf_scores):
    """Multiply each document's term frequencies by the corpus-wide IDF scores.

    Args:
        tf_scores: List of per-document mappings from term to term frequency.
        idf_scores: Mapping from term to IDF score, indexed with
            ``idf_scores[term]`` for every term found in ``tf_scores``.

    Returns:
        List of plain dicts, one per document, mapping term to ``tf * idf``.
    """
    return [
        {term: weight * idf_scores[term] for term, weight in doc_tf.items()}
        for doc_tf in tf_scores
    ]

In [9]:
# Run the pipeline: raw documents -> n-gram terms -> TF, IDF -> TF-IDF
corpus = build_corpus(docs)  # one list of 1- to 3-gram terms per document
tf_scores = compute_tf(corpus)  # per-document relative term frequencies
idf_scores = compute_idf(corpus)  # corpus-wide inverse document frequencies
tfidf_scores = compute_tfidf(tf_scores, idf_scores)  # per-document tf * idf

In [10]:
# Display results: a header per document, then one "term: score" line per
# term with the score rounded to four decimals
for i, weights in enumerate(tfidf_scores):
    header = f"\nDocument {i+1} TF-IDF:"
    print(header)
    for term in weights:
        score = weights[term]
        print(f"{term}: {score:.4f}")


Document 1 TF-IDF:
this: 0.1171
is: 0.0833
a: 0.1171
sample: 0.0833
document: 0.0833
this is: 0.1749
is a: 0.1171
a sample: 0.1171
sample document: 0.1171
this is a: 0.1749
is a sample: 0.1171
a sample document: 0.1749

Document 2 TF-IDF:
this: 0.1171
document: 0.0833
is: 0.0833
a: 0.1171
sample: 0.0833
this document: 0.1749
document is: 0.1171
is a: 0.1171
a sample: 0.1171
this document is: 0.1749
document is a: 0.1749
is a sample: 0.1171

Document 3 TF-IDF:
sample: 0.0667
document: 0.0667
is: 0.0667
simple: 0.1399
and: 0.1399
short: 0.1399
sample document: 0.0937
document is: 0.0937
is simple: 0.1399
simple and: 0.1399
and short: 0.1399
sample document is: 0.1399
document is simple: 0.1399
is simple and: 0.1399
simple and short: 0.1399


In [10]:
# Pretty-print the TF-IDF dictionaries at full float precision.
# The alias `p` is also used by the later cell that prints `corpus`.
import pprint as p
p.pprint(tfidf_scores)

[{'a': 0.11712209234234702,
  'a sample': 0.11712209234234702,
  'a sample document': 0.17488435738900915,
  'document': 0.08333333333333333,
  'is': 0.08333333333333333,
  'is a': 0.11712209234234702,
  'is a sample': 0.11712209234234702,
  'sample': 0.08333333333333333,
  'sample document': 0.11712209234234702,
  'this': 0.11712209234234702,
  'this is': 0.17488435738900915,
  'this is a': 0.17488435738900915},
 {'a': 0.11712209234234702,
  'a sample': 0.11712209234234702,
  'document': 0.08333333333333333,
  'document is': 0.11712209234234702,
  'document is a': 0.17488435738900915,
  'is': 0.08333333333333333,
  'is a': 0.11712209234234702,
  'is a sample': 0.11712209234234702,
  'sample': 0.08333333333333333,
  'this': 0.11712209234234702,
  'this document': 0.17488435738900915,
  'this document is': 0.17488435738900915},
 {'and': 0.13990748591120733,
  'and short': 0.13990748591120733,
  'document': 0.06666666666666667,
  'document is': 0.09369767387387762,
  'document is simple'

In [11]:
# Inspect the n-gram terms generated for each document
p.pprint(corpus)

[['this',
  'is',
  'a',
  'sample',
  'document',
  'this is',
  'is a',
  'a sample',
  'sample document',
  'this is a',
  'is a sample',
  'a sample document'],
 ['this',
  'document',
  'is',
  'a',
  'sample',
  'this document',
  'document is',
  'is a',
  'a sample',
  'this document is',
  'document is a',
  'is a sample'],
 ['sample',
  'document',
  'is',
  'simple',
  'and',
  'short',
  'sample document',
  'document is',
  'is simple',
  'simple and',
  'and short',
  'sample document is',
  'document is simple',
  'is simple and',
  'simple and short']]
