In [1]:
import math

import pandas as pd

In [2]:
# Two toy documents; together they form the whole corpus for this walkthrough.
doc_a = "The cat sat on my face"
doc_b = "The dog sat on my bed"

In [3]:
# Tokenize each document on single spaces to obtain its bag of words.
bow_a, bow_b = (doc.split(" ") for doc in (doc_a, doc_b))

In [4]:
# Inspect the tokens of the second document.
bow_b

['The', 'dog', 'sat', 'on', 'my', 'bed']

### Merge the words of both sentences into one vocabulary

In [5]:
# Vocabulary: every distinct token that appears in either document.
word_set = set(bow_a) | set(bow_b)

In [6]:
# Show the merged vocabulary (sets are unordered, so display order may vary).
word_set

{'The', 'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat'}

### Convert to dictionary and set initial values to 0

In [7]:
# One zero-initialised counter per document, keyed by the shared vocabulary.
word_dict_a = {word: 0 for word in word_set}
word_dict_b = {word: 0 for word in word_set}

In [8]:
# Inspect the zero-initialised counter for document A.
word_dict_a

{'sat': 0, 'bed': 0, 'The': 0, 'on': 0, 'face': 0, 'cat': 0, 'dog': 0, 'my': 0}

### Count the words in each sentence

In [9]:
# Reset the counters first so that re-running this cell does not
# double-count: the vocabulary keys are kept, only the values go back to 0.
word_dict_a = dict.fromkeys(word_dict_a, 0)
word_dict_b = dict.fromkeys(word_dict_b, 0)

# Tally how often each token occurs in its own document.
for word in bow_a:
    word_dict_a[word] += 1

for word in bow_b:
    word_dict_b[word] += 1

In [10]:
# Inspect the per-word counts for document B.
word_dict_b

{'sat': 1, 'bed': 1, 'The': 1, 'on': 1, 'face': 0, 'cat': 0, 'dog': 1, 'my': 1}

### Convert to a DataFrame

In [11]:
# One row per document, one column per vocabulary word (raw counts).
# pandas is imported once, in the notebook's top import cell.
pd.DataFrame([word_dict_a, word_dict_b])

Unnamed: 0,sat,bed,The,on,face,cat,dog,my
0,1,0,1,1,1,1,0,1
1,1,1,1,1,0,0,1,1


### Compute Term Frequency

![alt text](tf.png "TF Formula")

In [12]:
def computeTF(word_dict, bow):
    """Compute the term frequency of every vocabulary word in one document.

    Args:
        word_dict: Mapping of word -> raw count of that word in the document.
        bow: The document's tokens (its bag of words). Only its length is
            used, as the normalising total.

    Returns:
        A new dict mapping each key of ``word_dict`` to ``count / len(bow)``
        as a float. For an empty ``bow`` every frequency is ``0.0`` instead
        of raising ``ZeroDivisionError``.
    """
    bow_count = float(len(bow))
    if bow_count == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in word_dict}

    return {word: count / bow_count for word, count in word_dict.items()}

In [13]:
# Term frequencies of document A over the shared vocabulary.
tf_bow_a = computeTF(word_dict_a, bow_a)
tf_bow_a

{'sat': 0.16666666666666666,
 'bed': 0.0,
 'The': 0.16666666666666666,
 'on': 0.16666666666666666,
 'face': 0.16666666666666666,
 'cat': 0.16666666666666666,
 'dog': 0.0,
 'my': 0.16666666666666666}

In [14]:
# Term frequencies of document B over the shared vocabulary.
tf_bow_b = computeTF(word_dict_b, bow_b)
tf_bow_b

{'sat': 0.16666666666666666,
 'bed': 0.16666666666666666,
 'The': 0.16666666666666666,
 'on': 0.16666666666666666,
 'face': 0.0,
 'cat': 0.0,
 'dog': 0.16666666666666666,
 'my': 0.16666666666666666}

### Compute IDF

![alt text](idf.png "IDF Formula")

In [15]:
def computeIDF(doc_list):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        doc_list: List of per-document dicts mapping word -> count. A word
            counts as present in a document when its count is > 0.

    Returns:
        Dict mapping every word that is a key in any document to
        ``log10(N / df)``, where ``N`` is the number of documents and ``df``
        is the number of documents containing the word. A word contained in
        no document gets ``0.0`` (the ratio is undefined), and an empty
        ``doc_list`` yields an empty dict.
    """
    N = len(doc_list)

    # Document frequency per word. Keys are gathered from every document,
    # not only the first, so documents with differing vocabularies work.
    doc_freq = {}
    for doc in doc_list:
        for word, val in doc.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if val > 0 else 0)

    idf_dict = {}
    for word, df in doc_freq.items():
        # df == 0 would divide by zero; such a word carries no weight.
        idf_dict[word] = math.log10(N / float(df)) if df else 0.0

    return idf_dict

In [16]:
# IDF of every vocabulary word over the two-document corpus.
idfs = computeIDF([word_dict_a,word_dict_b])

In [17]:
# Inspect the IDF values.
idfs

{'sat': 0.0,
 'bed': 0.3010299956639812,
 'The': 0.0,
 'on': 0.0,
 'face': 0.3010299956639812,
 'cat': 0.3010299956639812,
 'dog': 0.3010299956639812,
 'my': 0.0}

In [24]:
math.log10(2/1) # 2 = number of documents, 1 = number of times the term occurs

0.3010299956639812

### Compute TFIDF

![alt text](tfidf.png "TFIDF Formula")

In [18]:
def computeTFIDF(tf_bow, idfs):
    """Weight each term frequency by its inverse document frequency.

    Args:
        tf_bow: Dict mapping word -> term frequency for one document.
        idfs: Dict mapping word -> IDF; it is indexed with every key of
            ``tf_bow``, so a missing word raises ``KeyError``.

    Returns:
        A new dict mapping each word of ``tf_bow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tf_bow.items()}

In [26]:
# TF-IDF weights for each document, using the corpus-wide IDF values.
tfidf_bow_a = computeTFIDF(tf_bow_a,idfs)
tfidf_bow_b = computeTFIDF(tf_bow_b,idfs)

In [27]:
# One row per document, one column per vocabulary word (TF-IDF weights).
# pandas is already imported earlier in the notebook; no re-import needed.
pd.DataFrame([tfidf_bow_a, tfidf_bow_b])

Unnamed: 0,sat,bed,The,on,face,cat,dog,my
0,0.0,0.0,0.0,0.0,0.050172,0.050172,0.0,0.0
1,0.0,0.050172,0.0,0.0,0.0,0.0,0.050172,0.0


In [28]:
# Sanity check by hand: IDF of a word found in 1 of 2 documents, times a TF of ~1/6.
math.log10(2/1)*0.166666666667

0.05017166594409721