### Corpus

In [1]:
# Collection of string documents

# Toy corpus: four short, lower-case, space-separated documents.
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

### SkLearn Implementation

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# fit() learns the vocabulary and idf weights and returns the estimator itself,
# so construction and fitting can be chained.
vectorizer = TfidfVectorizer().fit(corpus)
# Encode the same documents with the fitted vectorizer.
skl_output = vectorizer.transform(corpus)

In [3]:
# sklearn feature names, they are sorted in alphabetic order by default.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps the printed form a plain Python list of strings.

if hasattr(vectorizer, "get_feature_names_out"):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [4]:
# Print the idf values learned by the sklearn tfidf vectorizer during fit.
# After fitting on the corpus the vocabulary has 9 words, and each word has
# its own idf value (one entry per feature name, in the same order).

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [5]:
# Shape of the sklearn tfidf vectorizer output after applying the transform
# method: (number of documents, number of vocabulary words).

skl_output.shape

(4, 9)

In [6]:
# sklearn tfidf values for the first document of the above corpus.
# The output is a sparse matrix, so only the non-zero entries are printed,
# each as "(row, column)  value".

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [7]:
# sklearn tfidf values for every document of the above corpus (one row per document).
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Custom implementation with Python


In [8]:
from tqdm import tqdm 
import pandas as pd
import math
from collections import Counter
import numpy as np
from numpy import array
from numpy.linalg import norm
from scipy import sparse
#https://tqdm.github.io/
# https://www.freecodecamp.org/news/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3/
# https://github.com/mayank408/TFIDF
# https://stackoverflow.com/questions/20510768/count-frequency-of-words-in-a-list-and-sort-by-frequency

# Toy corpus used to check the custom implementation below:
# four short, lower-case, space-separated documents.
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

def fit(docset):
    """Learn a smoothed IDF weight for every distinct word in ``docset``.

    Documents are tokenised on runs of whitespace, so repeated, leading or
    trailing spaces never produce an empty-string "word".

    The weight of a word is ``1 + ln((1 + N) / (1 + df))`` where ``N`` is the
    number of documents and ``df`` is the number of documents that contain
    the word at least once.

    Parameters
    ----------
    docset : sequence of str
        The documents to learn from. Must support ``len()``.

    Returns
    -------
    dict
        Maps each word to its IDF value; keys are in sorted (alphabetical)
        order, which fixes the column order used by ``transform``.
    """
    n_docs = len(docset)

    # Document frequency: each document contributes at most once per word,
    # hence the set() around its tokens.
    doc_freq = Counter()
    for doc in docset:
        doc_freq.update(set(doc.split()))

    # Smoothed IDF per word, emitted in sorted word order.
    return {
        word: 1 + math.log((1 + n_docs) / (1 + doc_freq[word]))
        for word in sorted(doc_freq)
    }

def transform(docset, vocab):
    """Encode documents as L2-normalised TF-IDF rows.

    Parameters
    ----------
    docset : iterable of str
        Documents to encode; each is tokenised on runs of whitespace.
    vocab : dict
        Maps word -> IDF weight (the dictionary returned by ``fit``). Its
        key order defines the column order of the result.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per document and one column per word in ``vocab``. Each
        entry is ``count(word in document) * idf(word)`` with the row scaled
        to unit Euclidean length. A document containing no vocabulary word
        yields an all-zero row, and an empty ``docset`` yields a matrix
        with zero rows.
    """
    words = list(vocab)
    rows = []
    for doc in docset:
        counts = Counter(doc.split())
        # Counter returns 0 for absent words, so every column gets a value.
        row = np.array([vocab[word] * counts[word] for word in words], dtype=float)
        length = norm(row)
        # Dividing a zero vector by its zero norm would give NaNs; leave such
        # rows as zeros instead.
        if length > 0:
            row = row / length
        rows.append(row)

    # The explicit reshape keeps the column count correct even with no rows.
    dense = np.array(rows, dtype=float).reshape(len(rows), len(words))
    return sparse.csr_matrix(dense)

# Despite its name, `vocab` holds the word -> idf mapping returned by fit();
# transform() uses both its keys (columns) and its values (weights).
vocab = fit(corpus)
# Printing the sparse result lists only the non-zero entries as "(row, column)  value".
print(transform(corpus, vocab))

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 3)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 1)	0.6876235979836938
  (1, 3)	0.281088674033753
  (1, 5)	0.5386476208856763
  (1, 6)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 3)	0.38408524091481483
  (3, 6)	0.38408524091481483
  (3, 8)	0.38408524091481483


## Task-2

<font face='georgia'>
    <h5><strong>2. Implement max features functionality:</strong></h5>
</font>



In [9]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.

import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [21]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Pair every feature with its idf and keep the 50 largest idf values.
# sorted() is stable, so words with equal idf stay in alphabetical order.
l = sorted(zip(feature_names, vectorizer.idf_), key=lambda x: x[1], reverse=True)[:50]
# Put the selected words back in alphabetical order and expose them as word -> idf.
l = sorted(l, key=lambda x: x[0])
l = dict(l)
print(l)

{'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.922918004572872, 'abstruse': 6.922918004572872, 'academy': 6.922918004572872, 'accents': 6.922918004572872, 'accessible': 6.922918004572872, 'acclaimed': 6.922918004572872, 'accolades': 6.922918004572872, 'accurate': 6.922918004572872, 'accurately': 6.922918004572872, 'achille': 6.922918004572872, 'ackerman': 6.922918004572872, 'actions': 6.922918004572872, 'adams': 6.922918004572872, 'add': 6.922918004572872, 'added': 6.922918004572872, 'admins': 6.922918004572872, 'admiration': 6.922918004572872, 'admitted': 6.922918004572872, 'adrift': 6.922918004572872, 'adventure': 6.922918004572872, 'aesthetically': 6.922918004572872, 'affected': 6.922918004572872, 'affleck': 6.922918004572872, 'afternoon': 6.922918004572872, 'aged': 6.922918004572872, 'ages': 6.922918004572872, 'agree': 6.922918004572872, 'agreed': 6.922918004572872, 'aimless': 6.922918004572872, 'aired': 6.922918004572872, 'akasha': 6.922918004572872, '

In [25]:
# sklearn implementation

import pickle
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Pair every feature with its idf and keep the 50 largest idf values.
# sorted() is stable, so words with equal idf stay in alphabetical order.
l = sorted(zip(feature_names, vectorizer.idf_), key=lambda pair: pair[1], reverse=True)[:50]
# Put the selected words back in alphabetical order; `x` is the word -> idf mapping.
l = sorted(l, key=lambda pair: pair[0])
x = dict(l)
print(x)

{'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.922918004572872, 'abstruse': 6.922918004572872, 'academy': 6.922918004572872, 'accents': 6.922918004572872, 'accessible': 6.922918004572872, 'acclaimed': 6.922918004572872, 'accolades': 6.922918004572872, 'accurate': 6.922918004572872, 'accurately': 6.922918004572872, 'achille': 6.922918004572872, 'ackerman': 6.922918004572872, 'actions': 6.922918004572872, 'adams': 6.922918004572872, 'add': 6.922918004572872, 'added': 6.922918004572872, 'admins': 6.922918004572872, 'admiration': 6.922918004572872, 'admitted': 6.922918004572872, 'adrift': 6.922918004572872, 'adventure': 6.922918004572872, 'aesthetically': 6.922918004572872, 'affected': 6.922918004572872, 'affleck': 6.922918004572872, 'afternoon': 6.922918004572872, 'aged': 6.922918004572872, 'ages': 6.922918004572872, 'agree': 6.922918004572872, 'agreed': 6.922918004572872, 'aimless': 6.922918004572872, 'aired': 6.922918004572872, 'akasha': 6.922918004572872, '

False