# Manual Implementation

In [85]:
import math

In [86]:
def TF(document, word):
  return document.split().count(word) / len(document.split())

In [87]:
def IDF(corpus, word):
  return math.log(len(corpus) / sum(list(map(lambda document: word in document, corpus))))

In [88]:
def TF_IDF(corpus):
  word_set = set()
  for document in corpus:
    for word in document.split():
      if word not in word_set:
        word_set.add(word)
  print("PRINTING ALL IDF VALUES\n\n")
  for word in word_set:
    print(word, ":", IDF(corpus, word))
  print("\n\nPRINTING THE TF-IDF VALUES NOW\n\n")
  for document in corpus:
    for word in word_set:
      print(document, ":", word, ":", TF(document, word)*IDF(corpus, word))

In [89]:
corpus = [
    'the sun is is is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

In [90]:
TF_IDF(corpus)

PRINTING ALL IDF VALUES


star : 1.0986122886681098
sun : 0.4054651081081644
celestial : 1.0986122886681098
and : 1.0986122886681098
bodies : 1.0986122886681098
moon : 0.4054651081081644
is : 0.4054651081081644
satellite : 1.0986122886681098
a : 0.0
are : 1.0986122886681098
the : 0.0


PRINTING THE TF-IDF VALUES NOW


the sun is is is a star : star : 0.15694461266687282
the sun is is is a star : sun : 0.05792358687259491
the sun is is is a star : celestial : 0.0
the sun is is is a star : and : 0.0
the sun is is is a star : bodies : 0.0
the sun is is is a star : moon : 0.0
the sun is is is a star : is : 0.17377076061778474
the sun is is is a star : satellite : 0.0
the sun is is is a star : a : 0.0
the sun is is is a star : are : 0.0
the sun is is is a star : the : 0.0
the moon is a satellite : star : 0.0
the moon is a satellite : sun : 0.0
the moon is a satellite : celestial : 0.0
the moon is a satellite : and : 0.0
the moon is a satellite : bodies : 0.0
the moon is a satellite : moon :

# CountVectorizer

In [91]:
from sklearn.feature_extraction.text import CountVectorizer

In [92]:
vectorizer = CountVectorizer()

In [93]:
result_vectorizer = vectorizer.fit_transform(corpus)

In [94]:
vectorizer.get_feature_names_out()

array(['and', 'are', 'bodies', 'celestial', 'is', 'moon', 'satellite',
       'star', 'sun', 'the'], dtype=object)

In [95]:
result_vectorizer.toarray()

array([[0, 0, 0, 0, 3, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 0, 0, 1],
       [1, 1, 1, 1, 0, 1, 0, 0, 1, 1]])

# TF-IDF Vectorizer

In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
tfidf = TfidfVectorizer()

In [98]:
result = tfidf.fit_transform(corpus)

In [99]:
# get idf values
print('\nidf values:')
for ele1, ele2 in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    print(ele1, ':', ele2)


idf values:
and : 1.6931471805599454
are : 1.6931471805599454
bodies : 1.6931471805599454
celestial : 1.6931471805599454
is : 1.2876820724517808
moon : 1.2876820724517808
satellite : 1.6931471805599454
star : 1.6931471805599454
sun : 1.2876820724517808
the : 1.0


In [100]:
# get indexing
print('\nWord indexes:')
print(tfidf.vocabulary_)

# display tf-idf values
print('\ntf-idf value:')
print(result)

# in matrix form
print('\ntf-idf values in matrix form:')
print(result.toarray())


Word indexes:
{'the': 9, 'sun': 8, 'is': 4, 'star': 7, 'moon': 5, 'satellite': 6, 'and': 0, 'are': 1, 'celestial': 3, 'bodies': 2}

tf-idf value:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15 stored elements and shape (3, 10)>
  Coords	Values
  (0, 9)	0.22114371290186152
  (0, 8)	0.28476279453915065
  (0, 4)	0.854288383617452
  (0, 7)	0.37442885399834486
  (1, 9)	0.3731188059313277
  (1, 4)	0.4804583972923858
  (1, 5)	0.4804583972923858
  (1, 6)	0.6317450542765208
  (2, 9)	0.2517108425440014
  (2, 8)	0.3241235393856436
  (2, 5)	0.3241235393856436
  (2, 0)	0.42618350336974425
  (2, 1)	0.42618350336974425
  (2, 3)	0.42618350336974425
  (2, 2)	0.42618350336974425

tf-idf values in matrix form:
[[0.         0.         0.         0.         0.85428838 0.
  0.         0.37442885 0.28476279 0.22114371]
 [0.         0.         0.         0.         0.4804584  0.4804584
  0.63174505 0.         0.         0.37311881]
 [0.4261835  0.4261835  0.4261835  0.4261835  0.         0.