# Term Frequency-Inverse Document Frequency

In [1]:
# Two toy documents that serve as the corpus in the cells below.
d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog"

In [2]:
# Split each document on whitespace to obtain its bag of words (token list).
bowA = d1.split()
bowB = d2.split()

In [3]:
# Vocabulary: every distinct token that occurs in either document.
wordSet = set(bowA) | set(bowB)

In [4]:
# One zero-initialised counter per document, keyed by the shared vocabulary.
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}

In [6]:
# Tally how often each token occurs in its own document.
# The counters are updated in place, so executing this cell a second time
# without re-creating wordDictA/wordDictB doubles every count.
for word in bowA:
    wordDictA[word]+=1
for word in bowB:
    wordDictB[word]+=1

In [7]:
import pandas as pd
# Display the raw counts as a table: one row per document, one column per word.
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,I,The,a,bed,cat,dog,face,hate,love,my,on,sat
0,1,1,1,0,2,0,1,1,0,1,1,1
1,1,1,1,1,0,2,0,0,1,1,1,1


In [8]:
def computeTF(wordDict, bow):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: Mapping of word -> raw occurrence count in the document.
        bow: The document's token list. Only its length is used, as the
            total by which each count is divided.

    Returns:
        A new dict mapping every key of ``wordDict`` to
        ``count / len(bow)``. If ``bow`` is empty, every frequency is 0.0.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document would otherwise raise ZeroDivisionError.
        return {word: 0.0 for word in wordDict}
    return {word: count / bowCount for word, count in wordDict.items()}

In [9]:
# Term frequencies of each document, normalised by that document's token count.
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

# IDF 계산

In [13]:
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: List of dicts, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        A dict mapping every word seen in any document to
        ``log10(N / df)``, where ``N`` is the number of documents and ``df``
        is the number of documents in which the word's count is positive.
        A word whose count is never positive gets 0.0. An empty
        ``docList`` yields an empty dict.
    """
    import math

    N = len(docList)

    # Document frequency: in how many documents each word occurs at least once.
    # Words are collected from every document, not only the first one, so
    # documents with differing key sets no longer raise KeyError.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    idfDict = {}
    for word, df in docFreq.items():
        # df == 0 would divide by zero; such a word also has TF 0 everywhere,
        # so 0.0 keeps its TF-IDF product at 0.
        idfDict[word] = math.log10(N / df) if df else 0.0
    return idfDict

In [14]:
# IDF of every vocabulary word across the two-document corpus.
idfs = computeIDF([wordDictA, wordDictB])

In [15]:
# Display the word -> IDF mapping.
idfs

{'face': 0.3010299956639812,
 'bed': 0.3010299956639812,
 'on': 0.0,
 'hate': 0.3010299956639812,
 'my': 0.0,
 'dog': 0.3010299956639812,
 'love': 0.3010299956639812,
 'sat': 0.0,
 'cat': 0.3010299956639812,
 'The': 0.0,
 'I': 0.0,
 'a': 0.0}

# TF-IDF 계산

In [18]:
def computeTFIDF(tfBow, idfs):
    """Weight a document's term frequencies by inverse document frequency.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency. It must contain
            every word of ``tfBow``; a missing word raises KeyError.

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {word: freq * idfs[word] for word, freq in tfBow.items()}

In [19]:
# TF-IDF weights of each document: its term frequencies scaled by the corpus IDFs.
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

In [20]:
import pandas as pd
# Display the TF-IDF weights as a table: one row per document, one column per word.
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,I,The,a,bed,cat,dog,face,hate,love,my,on,sat
0,0.0,0.0,0.0,0.0,0.060206,0.0,0.030103,0.030103,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.030103,0.0,0.060206,0.0,0.0,0.030103,0.0,0.0,0.0


In [23]:
from math import log10

def f(t, d):
    """Return the raw count of term ``t`` in document ``d``.

    ``d`` is any object with a ``count`` method; the rest of this cell passes
    a list of tokens.
    """
    occurrences = d.count(t)
    return occurrences

def tf(t, d):
    """Return the augmented term frequency of term ``t`` in document ``d``.

    The value is ``0.5 + 0.5 * f(t, d) / m``, where ``m`` is the largest raw
    count of any token in ``d``. The result therefore lies between 0.5 (term
    absent) and 1.0 (term is the most frequent one).

    Args:
        t: The term to score.
        d: The document as a sequence of hashable tokens.

    Returns:
        The augmented frequency as a float; 0.5 for an empty document.
    """
    if not d:
        # max() over an empty document would raise ValueError; an absent
        # term's augmented frequency is 0.5.
        return 0.5
    # Count each distinct token once instead of once per occurrence.
    maxCount = max(f(w, d) for w in set(d))
    return 0.5 + 0.5 * f(t, d) / maxCount

def idf(t, D):
    """Return the smoothed inverse document frequency of term ``t``.

    Computed as ``log10(len(D) / (1 + n))``, where ``n`` is the number of
    documents in ``D`` that contain ``t``. Because of the ``+ 1`` in the
    denominator, a term found in every document gets a negative value and
    an unseen term never divides by zero.

    Args:
        t: The term to score.
        D: The corpus, a collection of documents supporting ``t in d``.

    Returns:
        The IDF as a float.
    """
    docsWithTerm = sum(1 for d in D if t in d)
    return log10(len(D) / (1 + docsWithTerm))

def tfidf(t, d, D):
    """Return the TF-IDF score of term ``t`` in document ``d`` of corpus ``D``.

    As a tracing aid, prints the term, its TF, its IDF, their product and a
    ``===`` separator, in that order, before returning.

    Args:
        t: The term to score.
        d: The tokenized document containing the term.
        D: The tokenized corpus (a list of documents like ``d``).

    Returns:
        ``tf(t, d) * idf(t, D)``.
    """
    # Compute each factor once and reuse it for both the trace and the result
    # (the factors were previously recomputed for every print).
    print(t)
    tfValue = tf(t, d)
    print(tfValue)
    idfValue = idf(t, D)
    print(idfValue)
    score = tfValue * idfValue
    print(score)
    print('===')
    return score

def tokenizer(d):
    """Split the raw document string ``d`` into whitespace-separated tokens."""
    tokens = d.split()
    return tokens

def tfidfScorer(D):
    """Score every token of every document in the corpus ``D``.

    Args:
        D: List of raw document strings.

    Returns:
        One list per document, in corpus order, holding a ``(token, score)``
        tuple for each token occurrence in that document.
    """
    tokenized_D = list(map(tokenizer, D))
    return [
        [(t, tfidf(t, d, tokenized_D)) for t in d]
        for d in tokenized_D
    ]

# Score the two example documents; tfidfScorer prints a trace for every token
# while it runs, after which each document's (token, score) list is printed.
corpus = [d1, d2]

for i, doc in enumerate(tfidfScorer(corpus)):
    print('====== document[%d] ====='%i)
    print(doc)

The
0.75
-0.17609125905568127
-0.13206844429176096
===
cat
1.0
0.0
0.0
===
sat
0.75
-0.17609125905568127
-0.13206844429176096
===
on
0.75
-0.17609125905568127
-0.13206844429176096
===
my
0.75
-0.17609125905568127
-0.13206844429176096
===
face
0.75
0.0
0.0
===
I
0.75
-0.17609125905568127
-0.13206844429176096
===
hate
0.75
0.0
0.0
===
a
0.75
-0.17609125905568127
-0.13206844429176096
===
cat
1.0
0.0
0.0
===
The
0.75
-0.17609125905568127
-0.13206844429176096
===
dog
1.0
0.0
0.0
===
sat
0.75
-0.17609125905568127
-0.13206844429176096
===
on
0.75
-0.17609125905568127
-0.13206844429176096
===
my
0.75
-0.17609125905568127
-0.13206844429176096
===
bed
0.75
0.0
0.0
===
I
0.75
-0.17609125905568127
-0.13206844429176096
===
love
0.75
0.0
0.0
===
a
0.75
-0.17609125905568127
-0.13206844429176096
===
dog
1.0
0.0
0.0
===
[('The', -0.13206844429176096), ('cat', 0.0), ('sat', -0.13206844429176096), ('on', -0.13206844429176096), ('my', -0.13206844429176096), ('face', 0.0), ('I', -0.13206844429176096), ('ha

# sklearn 사용하여 계산

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

In [28]:
# Fit scikit-learn's TF-IDF model on three documents (d2 appears twice).
document_ls = [d1, d2, d2]
vetorizer = TfidfVectorizer()
# NOTE: this rebinds the name `tfidf`, which an earlier cell defined as a
# function used by tfidfScorer; re-run that cell before calling tfidfScorer again.
tfidf = vetorizer.fit_transform(document_ls)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
feature_names = (
    vetorizer.get_feature_names_out()
    if hasattr(vetorizer, "get_feature_names_out")
    else vetorizer.get_feature_names()
)

# Map each vocabulary word to its column index in the TF-IDF matrix.
# NOTE: unknown words default to 0, which is also the index of the first feature.
word2id = defaultdict(lambda : 0)
for idx, feature in enumerate(feature_names):
    word2id[feature] = idx

defaultdict(<function <lambda> at 0x000000000C3BD510>, {})


In [27]:
import pandas as pd
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
feature_names = (
    vetorizer.get_feature_names_out()
    if hasattr(vetorizer, "get_feature_names_out")
    else vetorizer.get_feature_names()
)
# Dense view of the sparse TF-IDF matrix: one row per document, one column per word.
count_vect_df = pd.DataFrame(tfidf.todense(), columns=feature_names)
count_vect_df

Unnamed: 0,bed,cat,dog,face,hate,love,my,on,sat,the
0,0.0,0.735448,0.0,0.367724,0.367724,0.0,0.217184,0.217184,0.217184,0.217184
1,0.344779,0.0,0.689558,0.0,0.0,0.344779,0.267752,0.267752,0.267752,0.267752
2,0.344779,0.0,0.689558,0.0,0.0,0.344779,0.267752,0.267752,0.267752,0.267752


* TDM(Term-Document Matrix)은 단순 빈도만 보기 때문에 중요한 단어를 잡아내기 어려움  
-> 이를 보완한 것이 TF-IDF

TF가 높다:  
    해당 문서 안에서 그 단어가 자주 등장한다  
IDF가 높다:  
    그 단어가 다른 문서에는 드물게 등장한다 (전체 문서 중 일부에만 나타나 유니크하다)   