## Implementing TF-IDF without Scikit-Learn

In [1]:
from collections import Counter
import math
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize 

In [2]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
    
]
unique_words = set() # at first we will initialize an empty set
if isinstance(corpus, (list,)):
    for row in corpus:
        for word in row.split(" "):
            if len(word) < 2:
                continue
            unique_words.add(word)    
    unique_words = sorted(list(unique_words))
    vocab = {j:i for i,j in enumerate(unique_words)}        # for each review in the dataset
    print(vocab)        # for each word in the review. #split method converts a string into list of words
else:
    print("you need to pass list of sentance") 

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [3]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
    
]

rows = []
columns = []
values = []
counts = Counter()
a=[]
b=[]
v=[]
def a_features(document):
    terms = tuple(document.split())
    return terms
Df = Counter()
for d in corpus:
    features = a_features(d)
    for t in Counter(features):
        Df[t]=Df[t]+1
        
p=dict(Df)
for word,IDFreq in p.items():
    j=(1+len(corpus))/(1+IDFreq)
    log_result=math.log(j)
    k=1+log_result
    print(word,k)
    v.append(k)
print(v)    
    
    #print(word,k)  ###Printing IDF
    

tfidf_sentences = []  


for idx, row in enumerate(corpus):
    word_freq = dict(Counter(row.split()))
    for word,freq in word_freq.items():
        
        tf=freq/len(row.split())
        b.append(tf)
        for tf_sentence in b:
            tf_idf_score =(tf_sentence*k)
            col_index = vocab.get(word, -1)
            if col_index !=-1:
                rows.append(idx)
                columns.append(col_index)
                tfidf_sentences.append(tf_idf_score)

y=csr_matrix((tfidf_sentences, (rows,columns)), shape=(len(corpus),len(vocab)))


m=normalize(y, norm='l2', axis=1, copy=True, return_norm=False)
print(m)



      
       
    


this 1.0
is 1.0
the 1.0
first 1.5108256237659907
document 1.2231435513142097
second 1.916290731874155
and 1.916290731874155
third 1.916290731874155
one 1.916290731874155
[1.0, 1.0, 1.0, 1.5108256237659907, 1.2231435513142097, 1.916290731874155, 1.916290731874155, 1.916290731874155, 1.916290731874155]
  (0, 1)	0.674199862463242
  (0, 2)	0.5393598899705936
  (0, 3)	0.2696799449852968
  (0, 6)	0.40451991747794525
  (0, 8)	0.1348399724926484
  (1, 1)	0.40451991747794525
  (1, 3)	0.4494665749754947
  (1, 5)	0.5393598899705937
  (1, 6)	0.49441323247304425
  (1, 8)	0.3146266024828463
  (2, 0)	0.34034212517093304
  (2, 3)	0.39270245212030735
  (2, 4)	0.4712429425443689
  (2, 6)	0.4188826155949945
  (2, 7)	0.44506277906968167
  (2, 8)	0.3665222886456202
  (3, 1)	0.4953774046180699
  (3, 2)	0.4706085343871664
  (3, 3)	0.39630192369445594
  (3, 6)	0.4458396641562629
  (3, 8)	0.4210707939253594
