In [1]:
import nltk
import math
from collections import Counter

# Sample corpus (replace with your corpus)
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

def calculate_tf(document):
    # Tokenize the document
    tokens = nltk.word_tokenize(document.lower())
    # Calculate term frequency (TF)
    tf = Counter(tokens)
    # Normalize TF by dividing by the total number of terms
    total_terms = len(tokens)
    tf_normalized = {term: freq/total_terms for term, freq in tf.items()}
    return tf_normalized

def calculate_idf(corpus):
    # Get unique terms in the corpus
    all_tokens = [token for document in corpus for token in nltk.word_tokenize(document.lower())]
    unique_tokens = set(all_tokens)
    # Calculate inverse document frequency (IDF)
    idf = {}
    num_documents = len(corpus)
    for token in unique_tokens:
        num_documents_containing_token = sum([1 for document in corpus if token in document])
        idf[token] = math.log(num_documents / (1 + num_documents_containing_token))
    return idf

def calculate_tfidf(corpus):
    tfidf_scores = []
    idf = calculate_idf(corpus)
    for document in corpus:
        tf = calculate_tf(document)
        tfidf = {term: tf[term] * idf[term] for term in tf.keys()}
        tfidf_scores.append(tfidf)
    return tfidf_scores

# Calculate TF-IDF scores for the corpus
tfidf_scores = calculate_tfidf(corpus)

# Print TF-IDF scores for each document
for i, tfidf in enumerate(tfidf_scores):
    print(f"Document {i+1}:")
    for term, score in tfidf.items():
        print(f"\t{term}: {score}")


Document 1:
	this: 0.04794701207529681
	is: -0.03719059188570162
	the: -0.03719059188570162
	first: 0.04794701207529681
	document: 0.0
	.: 0.0
Document 2:
	this: 0.04109743892168297
	document: 0.0
	is: -0.031877650187744244
	the: -0.031877650187744244
	second: 0.09902102579427789
	.: 0.0
Document 3:
	and: 0.19804205158855578
	this: 0.04109743892168297
	is: -0.031877650187744244
	the: -0.031877650187744244
	third: 0.09902102579427789
	one: 0.09902102579427789
	.: 0.0
Document 4:
	is: -0.03719059188570162
	this: 0.04794701207529681
	the: -0.03719059188570162
	first: 0.04794701207529681
	document: 0.0
	?: 0.11552453009332421
