<a href="https://colab.research.google.com/github/srilamaiti/srilamaiti.github.io/blob/main/ml_algo_from_scratch/tfidf_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
import math
import os
import numpy as np
import pandas as pd
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [89]:
class TF_IDF:
    def __init__(self, documents):
        self.documents = documents
        self.tf_idf_model = []

    def preprocess(self, document): # Add document as a parameter
        tokens = word_tokenize(document) # Tokenize individual document
        tokens = [token.lower() for token in tokens if token.isalpha()]

        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        '''
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        '''
        return tokens # Return preprocessed tokens

    def get_tf_idf(self, word, document):
        return document.count(word) / len(document)

    def get_idf(self, word):
        doc_freq = sum(1 for document in self.preprocessed_documents if word in document)
        return math.log((len(self.preprocessed_documents) + 1) / ( 1 + doc_freq)) + 1

    def get_tf_idf_list(self):
        self.preprocessed_documents = [self.preprocess(document) for document in self.documents] # Preprocess each document
        self.unique_tokens = list(set(list(itertools.chain(*self.preprocessed_documents))))
        for token in self.unique_tokens:
            for idx, document in enumerate(self.preprocessed_documents):
                tf = self.get_tf_idf(token, document)
                idf = self.get_idf(token)
                #print(token, idx, tf, idf)
                #print("***************************")
                tf_idf = tf * idf
                self.tf_idf_model.append((token, idx, tf_idf))

        return self.tf_idf_model # Return the model outside the loop to get the complete model

In [87]:
documents = [
    "this is a sample",
    "this is another example",
    "this example is different"
]

In [90]:
tfidf = TF_IDF(documents=documents)
tfidf.get_tf_idf_list()

[('different', 0, 0.0),
 ('different', 1, 0.0),
 ('different', 2, 0.8465735902799727),
 ('sample', 0, 1.6931471805599454),
 ('sample', 1, 0.0),
 ('sample', 2, 0.0),
 ('another', 0, 0.0),
 ('another', 1, 0.8465735902799727),
 ('another', 2, 0.0),
 ('example', 0, 0.0),
 ('example', 1, 0.6438410362258904),
 ('example', 2, 0.6438410362258904)]

In [91]:
vectorizer = TfidfVectorizer()
# Fit the vectorizer to the documents and transform them into a TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Print the resulting TF-IDF matrix
print(tfidf_matrix)

  (0, 5)	0.4532946552278861
  (0, 3)	0.4532946552278861
  (0, 4)	0.7674945674619879
  (1, 5)	0.39148397136265967
  (1, 3)	0.39148397136265967
  (1, 0)	0.6628399823470976
  (1, 2)	0.5041068915759233
  (2, 5)	0.39148397136265967
  (2, 3)	0.39148397136265967
  (2, 2)	0.5041068915759233
  (2, 1)	0.6628399823470976
