In [3]:
import math
from collections import Counter

# Example corpus (list of documents, each doc = string)
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one"
]

# Step 1: Tokenize documents
tokenized_corpus = [doc.lower().split() for doc in corpus]

# Step 2: Build vocabulary
vocab = sorted(set([word for doc in tokenized_corpus for word in doc]))
vocab_index = {word: i for i, word in enumerate(vocab)}

# Step 3: Compute Document Frequencies (df)
N = len(tokenized_corpus)
df = Counter()
for doc in tokenized_corpus:
    unique_terms = set(doc)
    for term in unique_terms:
        df[term] += 1

# Step 4: Compute IDF
idf = {term: math.log(N / (1 + df[term])) for term in vocab}

# Step 5: Compute TF-IDF for each document
tfidf_matrix = []
for doc in tokenized_corpus:
    tf = Counter(doc)
    doc_len = len(doc)
    doc_vector = [0.0] * len(vocab)
    
    for term in doc:
        tf_val = tf[term] / doc_len
        doc_vector[vocab_index[term]] = tf_val * idf[term]
    
    tfidf_matrix.append(doc_vector)

# Show result
import pandas as pd
df_tfidf = pd.DataFrame(tfidf_matrix, columns=vocab)
df_tfidf.round(3)


Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.0,0.081,-0.058,0.0,0.0,-0.058,0.0,-0.058
1,0.0,0.0,0.0,-0.048,0.0,0.068,-0.048,0.0,-0.048
2,0.068,0.0,0.0,-0.048,0.068,0.0,-0.048,0.068,-0.048
