Manual TF-IDF Implementation:

In [24]:
import math

In [25]:
# Toy corpus: one short sentence per document.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

In [26]:
# Tokenise each document: lowercase it, then split on whitespace.
words = [sentence.lower().split() for sentence in corpus]
words

[['the', 'sun', 'is', 'a', 'star'],
 ['the', 'moon', 'is', 'a', 'satellite'],
 ['the', 'sun', 'and', 'moon', 'are', 'celestial', 'bodies']]

In [27]:
def compute_tf(doc):
    """Return the relative term frequency of every token in one document.

    Args:
        doc: A sequence of tokens (e.g. a list of strings).

    Returns:
        A dict mapping each distinct token to ``occurrences / len(doc)``.
        An empty document yields an empty dict.
    """
    n_tokens = len(doc)

    # First pass: raw occurrence counts per token.
    counts = {}
    for token in doc:
        counts[token] = counts.get(token, 0) + 1

    # Second pass: normalise the counts by the document length.
    return {token: occurrences / n_tokens for token, occurrences in counts.items()}

# Term-frequency dict for every tokenised document, in corpus order.
tf_list = list(map(compute_tf, words))


In [28]:
def compute_idf(corpus, smooth=False):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        corpus: A sequence of tokenised documents (each a sequence of words).
        smooth: When False (the default) use the plain formula
            ``log(N / df)``, under which a word present in every document
            scores 0. When True use the smoothed formula
            ``log((1 + N) / (1 + df)) + 1``, which is the variant
            scikit-learn applies when ``smooth_idf=True``.

    Returns:
        A dict mapping each distinct word to its IDF value, where ``N`` is
        the number of documents and ``df`` is the number of documents that
        contain the word. An empty corpus yields an empty dict.
    """
    n_docs = len(corpus)

    # Count document frequency in a single pass. Converting each document to
    # a set makes repeated words count once per document and avoids rescanning
    # every document for every vocabulary word.
    doc_freq = {}
    for doc in corpus:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    if smooth:
        return {
            word: math.log((1 + n_docs) / (1 + count)) + 1
            for word, count in doc_freq.items()
        }
    # Every counted word occurs in at least one document, so count >= 1 and
    # the division is always defined.
    return {word: math.log(n_docs / count) for word, count in doc_freq.items()}

# IDF is computed once over the whole tokenised corpus and shared by all documents.
idf = compute_idf(words)


In [29]:
def compute_tfidf(tf, idf):
    """Combine one document's term frequencies with corpus-wide IDF values.

    Args:
        tf: Dict mapping each word of a document to its term frequency.
        idf: Dict mapping words to their inverse document frequency.

    Returns:
        A dict mapping every word in ``tf`` to ``tf[word] * idf[word]``.

    Raises:
        KeyError: If a word in ``tf`` has no entry in ``idf``.
    """
    return {term: weight * idf[term] for term, weight in tf.items()}

# One TF-IDF dict per document, each weighted by the shared corpus-level IDF.
tfidf_list = [compute_tfidf(doc_tf, idf) for doc_tf in tf_list]

In [30]:
# Display the per-document TF-IDF scores. A word that occurs in every
# document (here 'the') scores 0.0 because its IDF is log(N / N) = 0.
tfidf_list

[{'the': 0.0,
  'sun': 0.08109302162163289,
  'is': 0.08109302162163289,
  'a': 0.08109302162163289,
  'star': 0.21972245773362198},
 {'the': 0.0,
  'moon': 0.08109302162163289,
  'is': 0.08109302162163289,
  'a': 0.08109302162163289,
  'satellite': 0.21972245773362198},
 {'the': 0.0,
  'sun': 0.05792358687259491,
  'and': 0.15694461266687282,
  'moon': 0.05792358687259491,
  'are': 0.15694461266687282,
  'celestial': 0.15694461266687282,
  'bodies': 0.15694461266687282}]

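TF-IDF Implementation using scikit-learn's CountVectorizer and TfidfVectorizer

Note: the scikit-learn scores below are not expected to match the manual results above. With its default settings, TfidfVectorizer uses raw term counts for TF, applies a smoothed IDF, ln((1 + N) / (1 + df)) + 1, and L2-normalises each document's row. Its default tokenizer also drops single-character tokens, which is why "a" has no column.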

In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [32]:
# Bag-of-words counts with scikit-learn, built from the raw (untokenised) corpus strings.
count_vectorizer = CountVectorizer()

# Fit on the corpus and build the document-term matrix in one step.
X_count = count_vectorizer.fit_transform(corpus)

# Convert to a dense array and label the columns with the fitted feature names.
# The printed table has no column for 'a': per the scikit-learn docs, the
# default token_pattern only keeps tokens of two or more characters.
df_count = pd.DataFrame(X_count.toarray(), columns=count_vectorizer.get_feature_names_out())

print(df_count)

   and  are  bodies  celestial  is  moon  satellite  star  sun  the
0    0    0       0          0   1     0          0     1    1    1
1    0    0       0          0   1     1          1     0    0    1
2    1    1       1          1   0     1          0     0    1    1


In [33]:
# TF-IDF with scikit-learn's default settings. Per the scikit-learn docs the
# defaults are smooth_idf=True and norm='l2', so these values are on a
# different scale from the manual log(N / df) scores computed earlier.
tfidf_vectorizer = TfidfVectorizer()

# Fit on the raw corpus strings and build the TF-IDF document-term matrix.
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Convert to a dense array and label the columns with the fitted feature names.
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print(df_tfidf)


        and       are    bodies  celestial        is      moon  satellite  \
0  0.000000  0.000000  0.000000   0.000000  0.480458  0.000000   0.000000   
1  0.000000  0.000000  0.000000   0.000000  0.480458  0.480458   0.631745   
2  0.426184  0.426184  0.426184   0.426184  0.000000  0.324124   0.000000   

       star       sun       the  
0  0.631745  0.480458  0.373119  
1  0.000000  0.000000  0.373119  
2  0.000000  0.324124  0.251711  
