In [None]:

import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd


In [None]:

# Toy corpus used throughout the notebook: each string is one document.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]


In [None]:

def calculate_tf(sentence):
    """Return the term frequency of every token in ``sentence``.

    The sentence is tokenised with ``str.split()`` (whitespace-separated,
    case-sensitive, punctuation kept).

    Args:
        sentence: The document text as a string.

    Returns:
        A dict mapping each distinct token to ``occurrences / total tokens``,
        with keys in order of first appearance. A sentence with no tokens
        yields an empty dict.
    """
    tokens = sentence.split()
    n_tokens = len(tokens)

    # Raw occurrence count per token.
    occurrences = {}
    for token in tokens:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1

    # Normalise by sentence length; never evaluated when there are no tokens.
    return {token: hits / n_tokens for token, hits in occurrences.items()}


In [None]:

def calculate_idf(all_sentences):
    """Compute a smoothed inverse document frequency for every token.

    Each sentence counts as one document and is tokenised with
    ``str.split()``. A token contributes at most once per document, no
    matter how often it repeats inside that document.

    Args:
        all_sentences: A sized collection of strings (``len()`` is taken and
            the collection is iterated once).

    Returns:
        A dict mapping each token to ``log((N + 1) / (df + 1)) + 1``, where
        ``N`` is the number of sentences and ``df`` is the number of
        sentences containing the token. Keys are in order of first
        appearance in the corpus. An empty corpus yields an empty dict.
    """
    N = len(all_sentences)

    # Split every sentence exactly once and tally document frequencies in a
    # single pass, instead of re-splitting each sentence for each vocabulary
    # word. dict.fromkeys de-duplicates tokens within a sentence while keeping
    # their order, so the resulting key order is deterministic.
    doc_freq = {}
    for sentence in all_sentences:
        for word in dict.fromkeys(sentence.split()):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    return {
        word: math.log((N + 1) / (count + 1)) + 1  # smoothed IDF
        for word, count in doc_freq.items()
    }


In [None]:

def compute_manual_tfidf(corpus):
    """Compute TF-IDF scores for every sentence in ``corpus``.

    IDF weights are derived once from the whole corpus via
    ``calculate_idf``; each sentence's term frequencies from
    ``calculate_tf`` are then multiplied by those weights.

    Args:
        corpus: A sized collection of sentence strings.

    Returns:
        A list with one dict per sentence, in corpus order, mapping each
        token of that sentence to ``tf * idf``.
    """
    idf_weights = calculate_idf(corpus)
    return [
        {
            token: frequency * idf_weights[token]
            for token, frequency in calculate_tf(sentence).items()
        }
        for sentence in corpus
    ]


In [None]:

# Run the hand-written TF-IDF over the corpus: one {word: score} dict per sentence.
my_tfidf = compute_manual_tfidf(corpus)
# Tabulate the dicts (one row per sentence, one column per word) and replace
# the missing entries for words a sentence does not contain with 0.
my_tfidf_df = pd.DataFrame(my_tfidf).fillna(0)
my_tfidf_df


In [None]:

# Baseline for comparison: scikit-learn's CountVectorizer with default settings.
# NOTE(review): the default tokeniser is expected to differ from str.split()
# (e.g. skipping one-character tokens such as "a") — confirm when comparing
# these columns with my_tfidf_df.
count_vec = CountVectorizer()
count_matrix = count_vec.fit_transform(corpus)
# Convert the fitted matrix to a dense array and label columns with the learned feature names.
count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vec.get_feature_names_out())
count_df


In [None]:

# scikit-learn's TF-IDF with default settings, for comparison with my_tfidf_df.
# NOTE(review): the defaults presumably apply raw-count TF, smoothed IDF and
# L2 row normalisation, so absolute values need not match the manual
# implementation — confirm against the TfidfVectorizer documentation.
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(corpus)
# Convert the fitted matrix to a dense array and label columns with the learned feature names.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vec.get_feature_names_out())
tfidf_df



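- **Manual TF-IDF** shows similar (but not identical) results to **TfidfVectorizer**: both use the same smoothed IDF, but the manual version divides term counts by sentence length and keeps every whitespace-separated token, whereas `TfidfVectorizer` (with default settings) uses raw term counts, L2-normalizes each row, and drops one-character tokens such as `"a"`.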
- Common words like `"the"` have **lower scores** in TF-IDF because they appear in **all documents**.
- **TF-IDF** is better than **raw counts** at capturing the importance of distinctive words (those that appear in only a few documents).
