# 建立TFIDF模型

參考[https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76](https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76)

$\text{tf-idf} = tf \times idf$

In [1]:
import nltk
# Two short example documents that make up the toy corpus used below.
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

# 做tokenize，取出所有文件的單字

In [3]:
# Split each document into its list of word tokens.
tokenize_A = nltk.word_tokenize(documentA)
tokenize_B = nltk.word_tokenize(documentB)

# Vocabulary of the corpus: every distinct token found in either document.
uniqueWords = set(tokenize_A) | set(tokenize_B)

# 計算每個文件，所有uniqueWords出現的次數

In [4]:
# Count the occurrences of every vocabulary word in each document.
# Each counter starts at 0 for the whole vocabulary, so a word that a
# document does not contain keeps a count of 0.
numofwordsA = {vocab_word: 0 for vocab_word in uniqueWords}
for word in tokenize_A:
    numofwordsA[word] += 1

numofwordsB = {vocab_word: 0 for vocab_word in uniqueWords}
for word in tokenize_B:
    numofwordsB[word] += 1

In [5]:
numofwordsA

{'the': 1,
 'fire': 0,
 'sat': 0,
 'children': 0,
 'walk': 1,
 'for': 1,
 'around': 0,
 'went': 1,
 'a': 1,
 'man': 1,
 'out': 1}

In [6]:
numofwordsB

{'the': 2,
 'fire': 1,
 'sat': 1,
 'children': 1,
 'walk': 0,
 'for': 0,
 'around': 1,
 'went': 0,
 'a': 0,
 'man': 0,
 'out': 0}

# 計算TF

$tf_{i,j} = \frac{n_{i,j}}{\sum_{k} n_{k,j}}$

TF: The number of times a word appears in a document divided by the total number of words in the document.

In [7]:
def computeTF(wordDict, tokenize_item):
    '''
    Compute the term frequency (TF) of every word for a single document.

    wordDict : dict mapping each word to the number of times it occurs in
        the document.
    tokenize_item : the document's tokens (the output of tokenization); its
        length is used as the document's total word count.

    Returns a dict mapping every word of wordDict to
    count / total word count. For an empty document (no tokens) every TF is
    0.0 instead of raising ZeroDivisionError.
    '''
    bagofwordscount = len(tokenize_item)  # total number of tokens in the document
    if bagofwordscount == 0:
        # An empty document has no denominator; no word occurs in it, so
        # every term frequency is zero.
        return dict.fromkeys(wordDict, 0.0)
    # occurrences of the word in this document / total tokens in this document
    return {word: count / bagofwordscount for word, count in wordDict.items()}

# 計算IDF

$idf_t = \log(\frac{N}{df_t})$

IDF: Inverse document frequency determines the weight of rare words across all documents in the corpus.

In [8]:
def computeIDF(documentsDict):
    '''
    Compute the inverse document frequency (IDF) of every word in a corpus.

    documentsDict: a list containing the wordDict (word -> occurrence count)
        of every document.

    Returns a dict mapping each word to log(N / df), where N is the number
    of documents and df is the number of documents in which the word's count
    is greater than 0. The result covers the union of the words of all
    documents, in first-seen order. A word that occurs in no document
    (df == 0) gets an IDF of 0.0 instead of raising ZeroDivisionError, and
    an empty corpus yields an empty dict.
    '''
    import math

    N = len(documentsDict)

    # Document frequency: in how many documents does each word occur?
    # Words are registered from every document, not only the first one, so a
    # word missing from the first wordDict cannot raise KeyError.
    dfDict = {}
    for document in documentsDict:
        for word, val in document.items():
            dfDict.setdefault(word, 0)
            if val > 0:
                dfDict[word] += 1

    # IDF = log(total number of documents / number of documents containing
    # the word). A word with df == 0 has a term frequency of 0 everywhere,
    # so a weight of 0.0 keeps its TF-IDF at zero without dividing by zero.
    return {
        word: math.log(N / df) if df > 0 else 0.0
        for word, df in dfDict.items()
    }

# 計算TFIDF

$w_{i,j} = tf_{i,j} \times \log(\dfrac{N}{df_i})$

In [10]:
def computeTFIDF(tf_item, idfs):
    '''
    Combine one document's term frequencies with the corpus IDF weights.

    tf_item : dict mapping each word to its term frequency in the document.
    idfs : dict mapping each word to its inverse document frequency.

    Returns a dict mapping every word of tf_item to tf * idf. Raises
    KeyError if a word of tf_item has no entry in idfs.
    '''
    return {term: frequency * idfs[term] for term, frequency in tf_item.items()}

In [12]:
# Corpus-wide IDF weights, computed once and shared by both documents.
idfs = computeIDF([numofwordsA, numofwordsB])

# Per-document term frequencies.
tfA = computeTF(numofwordsA, tokenize_A)
tfB = computeTF(numofwordsB, tokenize_B)

# Final TF-IDF weights: each document's TF scaled by the shared IDF.
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

In [13]:
tfidfA

{'the': 0.0,
 'fire': 0.0,
 'sat': 0.0,
 'children': 0.0,
 'walk': 0.09902102579427789,
 'for': 0.09902102579427789,
 'around': 0.0,
 'went': 0.09902102579427789,
 'a': 0.09902102579427789,
 'man': 0.09902102579427789,
 'out': 0.09902102579427789}

In [14]:
tfidfB

{'the': 0.0,
 'fire': 0.11552453009332421,
 'sat': 0.11552453009332421,
 'children': 0.11552453009332421,
 'walk': 0.0,
 'for': 0.0,
 'around': 0.11552453009332421,
 'went': 0.0,
 'a': 0.0,
 'man': 0.0,
 'out': 0.0}