# Assignment 2.1: Text Vectorization Implementation
Manual TF-IDF computation (tokenizing with NLTK's `word_tokenize`), compared against the output of scikit-learn's `CountVectorizer` and `TfidfVectorizer` on the same corpus.

In [None]:
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
# Fetch the Punkt tokenizer data that word_tokenize() needs.
# Newer NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so request both to stay compatible across versions.
# NOTE(review): on an older NLTK that does not know 'punkt_tab', download()
# is expected to report the miss and return False rather than raise — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

# Toy corpus: each string is one document.
corpus = [
    'the sun is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

## Step 1: Tokenization using NLTK

In [None]:
def tokenize(doc):
    """Lower-case *doc* and split it into a list of tokens with NLTK."""
    lowered = doc.lower()
    tokens = word_tokenize(lowered)
    return tokens

# Tokenize every document; the bare name on the last line shows the result
# as the notebook cell's output.
tokenized_corpus = list(map(tokenize, corpus))
tokenized_corpus

## Step 2: Compute Term Frequency (TF)

In [None]:
def compute_tf(doc_tokens):
    """Return the relative frequency of each token in one document.

    Args:
        doc_tokens: Sequence of tokens making up the document.

    Returns:
        dict mapping each distinct token to ``count / len(doc_tokens)``,
        keyed in order of first appearance. An empty document gives ``{}``.
    """
    n_tokens = len(doc_tokens)

    # First pass: raw occurrence counts.
    counts = {}
    for token in doc_tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    # Second pass: normalise the counts by the document length.
    return {token: count / n_tokens for token, count in counts.items()}

# Term frequencies for each tokenized document, displayed as the cell output.
tf_corpus = list(map(compute_tf, tokenized_corpus))
tf_corpus

## Step 3: Compute Inverse Document Frequency (IDF)

In [None]:
def compute_idf(tokenized_corpus):
    """Compute a smoothed inverse document frequency for every corpus term.

    Each distinct term gets the weight ``log(N / (1 + df)) + 1``, where ``N``
    is the number of documents and ``df`` is the number of documents that
    contain the term at least once.

    NOTE(review): this is not the same smoothing as scikit-learn's default
    (believed to be ``log((1 + N) / (1 + df)) + 1``), so the manual weights
    are not expected to match TfidfVectorizer exactly — confirm if parity
    is intended.

    Args:
        tokenized_corpus: Sequence of documents, each a sequence of
            hashable tokens.

    Returns:
        dict mapping each term to its IDF weight; ``{}`` for an empty corpus.
    """
    from collections import Counter

    N = len(tokenized_corpus)

    # Deduplicate tokens per document so a term counts once per document.
    # One pass over the corpus replaces rescanning every document list for
    # every vocabulary term.
    doc_counts = Counter(
        word for doc in tokenized_corpus for word in set(doc)
    )

    # The 1 added to the document count is the smoothing term.
    return {
        word: math.log(N / (1 + doc_count)) + 1
        for word, doc_count in doc_counts.items()
    }

# IDF weight for every term in the tokenized corpus; the bare name on the
# last line shows the mapping as the notebook cell's output.
idf_dict = compute_idf(tokenized_corpus)
idf_dict

## Step 4: Compute Manual TF-IDF

In [None]:
def compute_tfidf(tf, idf):
    """Weight one document's term frequencies by their IDF values.

    Args:
        tf: dict mapping term -> term frequency for a single document.
        idf: dict mapping term -> IDF weight; must contain every term in
            *tf* (a missing term raises ``KeyError``).

    Returns:
        dict mapping each term of *tf* to ``tf[term] * idf[term]``.
    """
    return {term: freq * idf[term] for term, freq in tf.items()}

# One TF-IDF mapping per document, using the corpus-wide IDF weights;
# displayed as the cell output.
manual_tfidf = [
    compute_tfidf(doc_tf, idf_dict)
    for doc_tf in tf_corpus
]
manual_tfidf

## Step 5: scikit-learn Vectorizers

In [None]:
# Raw term counts: fit CountVectorizer on the untokenized corpus strings and
# densify the resulting sparse matrix for display.
# NOTE(review): the vectorizers are created with default settings, so they
# use scikit-learn's own tokenization rather than tokenize() above; the
# vocabularies may therefore differ from the manual pipeline — confirm.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(corpus)
count_feature_names = count_vectorizer.get_feature_names_out()
count_result = count_matrix.toarray()

# TF-IDF weights from scikit-learn, again fitted on the raw strings.
# NOTE(review): default TfidfVectorizer settings (IDF smoothing, row
# normalization) presumably differ from the manual formulas, so values are
# not expected to match the manual table one-to-one — confirm.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_result = tfidf_matrix.toarray()

## Step 6: Display Results in DataFrames

In [None]:
# Manual TF-IDF: one row per document, one column per vocabulary term
# (sorted alphabetically); a term absent from a document gets weight 0.0.
all_words = sorted({word for tfidf_doc in manual_tfidf for word in tfidf_doc})
manual_df = pd.DataFrame(
    [{**dict.fromkeys(all_words, 0.0), **doc} for doc in manual_tfidf]
)

# CountVectorizer: dense count matrix labelled with the fitted vocabulary.
count_df = pd.DataFrame(data=count_result, columns=count_feature_names)

# TfidfVectorizer: dense TF-IDF matrix labelled with the fitted vocabulary.
tfidf_df = pd.DataFrame(data=tfidf_result, columns=tfidf_feature_names)

# Print the three tables; floating-point tables are rounded to 3 decimals.
print("Manual TF-IDF:\n", manual_df.round(3), "\n")
print("CountVectorizer:\n", count_df, "\n")
print("TfidfVectorizer:\n", tfidf_df.round(3), "\n")