In [None]:
# Two approaches to computing TF-IDF:
# 1) using plain Python
# 2) using Map-Reduce / Spark

In [None]:
# https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

# https://towardsdatascience.com/tf-idf-calculation-using-map-reduce-algorithm-in-pyspark-e89b5758e64c
# https://dzone.com/articles/calculating-tf-idf-with-apache-spark

In [27]:
from collections import defaultdict
# Shared state filled in by the compute_* functions:
#   tf_dict      keyed by (term, doc_id)
#   idf_dict     keyed by term
#   tf_idf_dict  keyed by (term, doc_id)
# Three independent dictionaries whose missing keys default to 0.
tf_dict, idf_dict, tf_idf_dict = (defaultdict(int) for _ in range(3))

In [28]:
# TF depends on term for a particular doc
def compute_TF(corpus):
    """Count raw term frequencies for every document in ``corpus``.

    Each document is split on whitespace with ``str.split()`` and the
    module-level ``tf_dict`` is filled with ``(term, doc_id) -> count``,
    where ``doc_id`` is the document's position in ``corpus``. Terms are
    compared exactly as written (no case folding or punctuation stripping).

    ``tf_dict`` is emptied first, so calling this function again (for
    example by re-running the notebook cell) rebuilds the counts instead of
    adding to the counts left by an earlier call.

    Args:
        corpus: iterable of document strings.

    Returns:
        None. The result is stored in the global ``tf_dict``.
    """
    # Start from a clean slate: doc_id restarts at 0 on every call, so keeping
    # old entries would merge counts from unrelated runs.
    tf_dict.clear()
    for doc_id, doc in enumerate(corpus):
        for term in doc.split():
            key = (term, doc_id)
            tf_dict[key] = tf_dict.get(key, 0) + 1
            

In [29]:
# IDF is independent of particular doc
import math
def compute_IDF(corpus_size):
    """Compute the inverse document frequency of every term in ``tf_dict``.

    For each term, the number of distinct ``(term, doc_id)`` keys in the
    module-level ``tf_dict`` is taken as its document frequency, and
    ``idf_dict[term]`` is set to ``log10(corpus_size / document_frequency)``.

    Document frequencies are counted in a local dictionary and ``idf_dict``
    is rebuilt from scratch, so calling this function more than once gives
    the same result each time. (Counting directly in ``idf_dict`` would add
    the new counts on top of the IDF values stored by a previous call.)

    Args:
        corpus_size: total number of documents in the corpus.

    Returns:
        None. The result is stored in the global ``idf_dict``.
    """
    # Each (term, doc_id) key appears once in tf_dict, so counting keys per
    # term gives the number of documents containing that term.
    doc_freq = {}
    for term, _doc_id in tf_dict:
        doc_freq[term] = doc_freq.get(term, 0) + 1

    # Drop anything left over from an earlier run before storing new values.
    idf_dict.clear()
    for term, freq in doc_freq.items():
        idf_dict[term] = math.log10(corpus_size / freq)
      

In [30]:
# TF-IDF is on term for a particular doc
def compute_TF_IDF():
    """Combine term counts and IDF values into TF-IDF scores.

    For every ``(term, doc_id)`` key in the module-level ``tf_dict``, stores
    ``count * idf_dict[term]`` under the same key in the module-level
    ``tf_idf_dict``. Takes no arguments and returns None.
    """
    for (word, doc_index), count in tf_dict.items():
        tf_idf_dict[(word, doc_index)] = count * idf_dict[word]
        

In [31]:
# Toy corpus: one string per document; a document's index in the list is its doc_id.
corpus = [
    "Hi I am good",
    "Hi how are you",
    "good now honey",
]

# Each stage reads the dictionary filled by the previous one, so order matters:
# term counts first, then IDF, then their product.
compute_TF(corpus)
print(tf_dict)

compute_IDF(len(corpus))
print(idf_dict)

compute_TF_IDF()
print(tf_idf_dict)


defaultdict(<class 'int'>, {('Hi', 0): 1, ('I', 0): 1, ('am', 0): 1, ('good', 0): 1, ('Hi', 1): 1, ('how', 1): 1, ('are', 1): 1, ('you', 1): 1, ('good', 2): 1, ('now', 2): 1, ('honey', 2): 1})
defaultdict(<class 'int'>, {'Hi': 0.17609125905568124, 'I': 0.47712125471966244, 'am': 0.47712125471966244, 'good': 0.17609125905568124, 'how': 0.47712125471966244, 'are': 0.47712125471966244, 'you': 0.47712125471966244, 'now': 0.47712125471966244, 'honey': 0.47712125471966244})
defaultdict(<class 'int'>, {('Hi', 0): 0.17609125905568124, ('I', 0): 0.47712125471966244, ('am', 0): 0.47712125471966244, ('good', 0): 0.17609125905568124, ('Hi', 1): 0.17609125905568124, ('how', 1): 0.47712125471966244, ('are', 1): 0.47712125471966244, ('you', 1): 0.47712125471966244, ('good', 2): 0.17609125905568124, ('now', 2): 0.47712125471966244, ('honey', 2): 0.47712125471966244})


In [None]:
# Map-Reduce way 
# The output of the previous step becomes the input of the next step

In [None]:
# TF: defined for a term in a particular doc
# i) Map -> Input: doc, Output: ( (term,doc), 1) for every occurrence of term
# ii) Reduce -> Reduce by prev map's key, Output: ( (term,doc), TF)
# iii) Map -> re-key by term to help the join in step i) of TF-IDF, Output: (term, (doc, TF))  [= tf]

In [None]:
# IDF: defined independent of particular doc
# i) Map -> Input: ( (term,doc), TF), Output: (term, 1) once per (term,doc) key; each pair is unique, so this counts the docs containing the term
# ii) Reduce -> sums the counts per term (= number of docs containing the term) and computes log( (num of docs) / (count of term) ), Output: (term, IDF) [= idf]

In [None]:
# TF-IDF: defined for a term in a particular doc
# i) Join tf with idf on the key "term", Output: ( term, ( (doc,TF), IDF) )
# ii) Map -> Output: ( (term, doc), TF-IDF)