<a href="https://colab.research.google.com/github/prakher2pratyush/tf_idf/blob/main/custom_implementation_of_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### What does TF-IDF mean?
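TF-IDF stands for *term frequency–inverse document frequency*. It converts a corpus of documents into a sparse matrix in which each entry combines how often a word occurs in a document (TF) with the logarithm of the inverse of the fraction of documents in the corpus that contain that word (IDF). This notebook uses the smoothed form $\mathrm{idf}(t) = 1 + \ln\frac{1 + N}{1 + \mathrm{df}(t)}$, where $N$ is the number of documents and $\mathrm{df}(t)$ is the number of documents containing the term $t$.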

In [None]:
# Toy corpus: four short documents used to compare the scikit-learn output
# with the custom TF-IDF implementation.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [None]:
# Mount Google Drive (Colab only). The pickled corpus loaded further down in
# this notebook is read from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### SkLearn Implementation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reference implementation: learn the vocabulary and IDF weights from the
# corpus (fit returns the fitted vectorizer), then vectorise the same corpus.
vectorizer = TfidfVectorizer().fit(corpus)
skl_output = vectorizer.transform(corpus)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# tolist() keeps the printed output a plain Python list of strings.
print(vectorizer.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
# IDF weight learned for each vocabulary term.
print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
# (number of documents, vocabulary size)
skl_output.shape

(4, 9)

In [None]:
# print() on a sparse row lists only its non-zero entries as "(row, column) value".
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [None]:
# Dense view of the same row, zeros included.
print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Custom Implementation

In [None]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

# fit function

def fit(dataset):
  """Learn the vocabulary and smoothed IDF weights of a corpus.

  Each document is split on whitespace and tokens shorter than two
  characters are discarded. The IDF of a term ``t`` is
  ``1 + ln((1 + N) / (1 + df(t)))``, where ``N`` is the number of documents
  and ``df(t)`` is the number of documents that contain ``t`` as a whole
  token.

  Args:
    dataset: list of strings, one string per document.

  Returns:
    ``(vocabulary, idf)``: the alphabetically sorted list of unique terms
    and the list of IDF values aligned with it. If ``dataset`` is not a
    list, a message is printed and ``None`` is returned.
  """
  if not isinstance(dataset, list):
    # Kept as a printed notice (implicit None result) so existing callers
    # see the same behaviour as before.
    print("Kindly pass list of strings")
    return None

  # One token set per document. split() (any whitespace) matches the
  # tokenisation used by transform(), and working on token sets makes the
  # document-frequency test an exact match: the previous substring test on
  # the raw string counted "is" as present in a document containing "this".
  doc_tokens = [
      {word for word in sentence.split() if len(word) >= 2}
      for sentence in dataset
  ]

  # Unique terms across the corpus, in sorted order.
  vocabulary = sorted(set().union(*doc_tokens))

  # Number of documents each term occurs in (each set contributes at most 1).
  doc_freq = Counter(word for tokens in doc_tokens for word in tokens)

  total_documents = len(dataset)
  idf = [
      1 + math.log((1 + total_documents) / (1 + doc_freq[word]))
      for word in vocabulary
  ]

  return vocabulary, idf
        
# Fit the custom implementation on the current corpus.
feature_names, idf_ = fit(corpus)

In [None]:
# Vocabulary learned by the custom fit(), in sorted order.
print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
# IDF weights from the custom fit(), aligned with feature_names.
print(idf_)

[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [None]:
# transform function

def transform(dataset, vocab, idf):
    """Convert documents into a row-normalised sparse TF-IDF matrix.

    Each document is split on whitespace. Tokens shorter than two characters
    and tokens that are not in ``vocab`` are ignored. For the remaining
    tokens the entry is ``(count / tokens in document) * idf``.

    Args:
        dataset: list of strings, one string per document.
        vocab: list of vocabulary terms; a term's position is its column.
        idf: list of IDF weights aligned with ``vocab``.

    Returns:
        The result of ``sklearn.preprocessing.normalize`` (default settings:
        L2 norm per row) applied to a ``csr_matrix`` of shape
        ``(len(dataset), len(vocab))``. If ``dataset`` is not a list, a
        message is printed and ``None`` is returned.
    """
    if not isinstance(dataset, list):
        print("Kindly pass list of strings")
        return None

    # Term -> column lookup built once. setdefault keeps the first position
    # of a repeated term, which is what list.index() returned before.
    column_of = {}
    for col, term in enumerate(vocab):
        column_of.setdefault(term, col)

    rows = []
    columns = []
    values = []
    for idx, row in enumerate(tqdm(dataset)):
        tokens = row.split()
        # Term frequency is relative to the number of tokens. The previous
        # denominator, len(row), was the character count of the string; the
        # row normalisation below cancelled it out, but it was not a TF.
        n_tokens = len(tokens)
        for word, freq in Counter(tokens).items():
            col_index = column_of.get(word)
            # Words unseen during fit are skipped. Previously vocab.index()
            # raised ValueError for them and the "-1" guard was unreachable.
            if len(word) < 2 or col_index is None:
                continue

            rows.append(idx)
            columns.append(col_index)
            values.append((freq / n_tokens) * idf[col_index])

    return normalize(csr_matrix((values, (rows, columns)), shape=(len(dataset), len(vocab))))

# Vectorise the corpus with the vocabulary and IDF weights learned by fit().
output_transform = transform(corpus, feature_names, idf_)

100%|██████████| 4/4 [00:00<00:00, 2362.99it/s]


In [None]:
# (number of documents, vocabulary size)
output_transform.shape

(4, 9)

In [None]:
# Non-zero entries of the first document's row from the custom implementation.
print(output_transform[0])

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


In [None]:
# Dense view of the same row, zeros included.
print(output_transform[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## Modified TF-IDF

### Implement max features functionality

Modify the TF-IDF implementation so that the vocabulary contains only the 50 terms with the highest IDF scores.

In [None]:
import pickle

# The pickle module serialises and de-serialises Python object structures.
# Pickling converts a Python object (list, dict, etc.) into a byte stream
# so that it can be saved on disk and restored later.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# NOTE: this rebinds `corpus`, replacing the corpus defined earlier in the notebook.

with open('/content/drive/MyDrive/Colab Notebooks/Data/cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)

# Report how many documents were loaded.
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [None]:
def updated_fit(dataset, max_features=50):
  """Learn a vocabulary restricted to the terms with the highest IDF.

  Each document is split on whitespace and tokens shorter than two
  characters are discarded. The IDF of a term ``t`` is
  ``1 + ln((1 + N) / (1 + df(t)))``, where ``N`` is the number of documents
  and ``df(t)`` is the number of documents that contain ``t`` as a whole
  token.

  Args:
    dataset: list of strings, one string per document.
    max_features: maximum number of terms to keep (default 50). ``None``
      keeps every term. If the corpus has fewer terms, all are returned.

  Returns:
    ``(vocabulary, idf)``: the kept terms ordered by decreasing IDF (ties
    in alphabetical order) and the list of IDF values aligned with them.
    If ``dataset`` is not a list, a message is printed and ``None`` is
    returned.
  """
  if not isinstance(dataset, list):
    print("Kindly pass list of strings")
    return None

  # One token set per document, so document frequency is an exact token
  # match. The previous substring test on the raw string counted "is" as
  # present in a document containing "this".
  doc_tokens = [
      {word for word in sentence.split() if len(word) >= 2}
      for sentence in dataset
  ]
  vocabulary = sorted(set().union(*doc_tokens))
  doc_freq = Counter(word for tokens in doc_tokens for word in tokens)

  total_documents = len(dataset)
  idf = [
      1 + math.log((1 + total_documents) / (1 + doc_freq[word]))
      for word in vocabulary
  ]

  # Clamp the limit to the vocabulary size. The previous fixed 50 iterations
  # called max() on an empty list when fewer than 50 terms existed.
  if max_features is None:
    limit = len(vocabulary)
  else:
    limit = max(0, min(max_features, len(vocabulary)))

  # A stable sort by decreasing IDF keeps equal-IDF terms in their sorted
  # (alphabetical) order, the same order produced by repeatedly taking the
  # first occurrence of the maximum.
  ranked = sorted(range(len(vocabulary)), key=lambda pos: -idf[pos])[:limit]

  updated_vocab = [vocabulary[pos] for pos in ranked]
  updated_idf = [idf[pos] for pos in ranked]

  return updated_vocab, updated_idf
        
# Fit the reduced-vocabulary variant on the current corpus.
updated_feature_names, updated_idf_ = updated_fit(corpus)

def updated_transform(dataset, vocab, idf):
    """Convert documents into a row-normalised sparse TF-IDF matrix.

    Each document is split on whitespace. Tokens shorter than two characters
    and tokens that are not in ``vocab`` are ignored, so a document with no
    vocabulary terms yields a row with no stored entries.

    Args:
        dataset: list of strings, one string per document.
        vocab: list of vocabulary terms; a term's position is its column.
        idf: list of IDF weights aligned with ``vocab``.

    Returns:
        The result of ``sklearn.preprocessing.normalize`` (default settings:
        L2 norm per row) applied to a ``csr_matrix`` of shape
        ``(len(dataset), len(vocab))``. If ``dataset`` is not a list, a
        message is printed and ``None`` is returned.
    """
    if not isinstance(dataset, list):
        print("Kindly pass list of strings")
        return None

    # Term -> column lookup built once, replacing a list membership test plus
    # list.index() per word. setdefault keeps the first position of a
    # repeated term, matching list.index().
    column_of = {}
    for col, term in enumerate(vocab):
        column_of.setdefault(term, col)

    rows = []
    columns = []
    values = []
    for idx, row in enumerate(tqdm(dataset)):
        tokens = row.split()
        # Term frequency is relative to the number of tokens. The previous
        # denominator, len(row), was the character count of the string; the
        # row normalisation below cancelled it out, but it was not a TF.
        n_tokens = len(tokens)
        for word, freq in Counter(tokens).items():
            col_index = column_of.get(word)
            # Skip short words and words outside the reduced vocabulary.
            if len(word) < 2 or col_index is None:
                continue

            rows.append(idx)
            columns.append(col_index)
            values.append((freq / n_tokens) * idf[col_index])

    return normalize(csr_matrix((values, (rows, columns)), shape=(len(dataset), len(vocab))))

# Vectorise the corpus against the reduced vocabulary.
updated_output_transform = updated_transform(corpus, updated_feature_names, updated_idf_)

100%|██████████| 746/746 [00:00<00:00, 33699.35it/s]


In [None]:
# Summary of the reduced vocabulary and its IDF weights.
print("Size of Vocabulary : " + str(len(updated_feature_names)))
print("Size of IDF : " + str(len(updated_idf_)))
print("\n")
print("IDF\t\t\t:\tWord")
# Walk both lists in lockstep rather than indexing 0..49, which raised
# IndexError whenever fewer than 50 terms had been kept.
for idf_value, term in zip(updated_idf_, updated_feature_names):
  print(str(idf_value) + "\t:\t" + term)

Size of Vocabulary : 50
Size of IDF : 50


IDF			:	Word
6.922918004572872	:	aailiyah
6.922918004572872	:	abandoned
6.922918004572872	:	abroad
6.922918004572872	:	abstruse
6.922918004572872	:	academy
6.922918004572872	:	accents
6.922918004572872	:	accessible
6.922918004572872	:	acclaimed
6.922918004572872	:	accolades
6.922918004572872	:	accurately
6.922918004572872	:	achille
6.922918004572872	:	ackerman
6.922918004572872	:	adams
6.922918004572872	:	added
6.922918004572872	:	admins
6.922918004572872	:	admiration
6.922918004572872	:	admitted
6.922918004572872	:	adrift
6.922918004572872	:	adventure
6.922918004572872	:	aesthetically
6.922918004572872	:	affected
6.922918004572872	:	affleck
6.922918004572872	:	afternoon
6.922918004572872	:	agreed
6.922918004572872	:	aimless
6.922918004572872	:	aired
6.922918004572872	:	akasha
6.922918004572872	:	alert
6.922918004572872	:	alike
6.922918004572872	:	allison
6.922918004572872	:	allowing
6.922918004572872	:	alongside
6.922918004572872	:	amateurish

In [None]:
# (number of documents, number of retained vocabulary terms)
updated_output_transform.shape

(746, 50)

In [None]:
# Non-zero entries of the first document's row.
print(updated_output_transform[0])

  (0, 24)	1.0


In [None]:
# Dense view of the same row, zeros included.
print(updated_output_transform[0].toarray())

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]
