## 作業目標：搭建一個TFIDF 模型

---

#### Reference: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

In [1]:
import nltk
# Two short example documents used as the whole corpus in this notebook.
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

## 首先我們做tokenize，並取出所有文件中的單詞

In [None]:

# Split each document into a list of word tokens.
tokenize_A = nltk.word_tokenize(documentA)
tokenize_B = nltk.word_tokenize(documentB)

# Vocabulary: every distinct word that appears in either document.
uniqueWords = set(tokenize_A) | set(tokenize_B)
uniqueWords

## 計算每個文件中，所有uniqueWords出現的次數

In [None]:
# For each document, map every vocabulary word to the number of times it
# occurs in that document's token list (0 when the word is absent).
numOfWordsA = {
    word: tokenize_A.count(word)
    for word in uniqueWords
}
numOfWordsB = {
    word: tokenize_B.count(word)
    for word in uniqueWords
}

In [12]:
# Display the per-word counts for document A.
numOfWordsA

{'man': 1,
 'fire': 0,
 'around': 0,
 'went': 1,
 'out': 1,
 'children': 0,
 'walk': 1,
 'sat': 0,
 'a': 1,
 'the': 1,
 'for': 1}

In [11]:
# Display the per-word counts for document B.
numOfWordsB

{'man': 0,
 'fire': 1,
 'around': 1,
 'went': 0,
 'out': 0,
 'children': 1,
 'walk': 0,
 'sat': 1,
 'a': 0,
 'the': 2,
 'for': 0}

## 定義function:計算TF

In [69]:
def computeTF(wordDict, tokenize_item):
    """
    Compute the term frequency (TF) of every word in ``wordDict``.

    wordDict : dict mapping each word to its occurrence count in the document
    tokenize_item : the tokenized document; its length is the denominator

    Returns a dict with the same keys as ``wordDict``, each mapped to
    ``count / len(tokenize_item)``. For an empty document every term
    frequency is 0.0 (instead of raising ZeroDivisionError).
    """
    total_tokens = len(tokenize_item)  # number of tokens in the document
    if total_tokens == 0:
        # Nothing was tokenized, so no word can have a non-zero frequency.
        return {word: 0.0 for word in wordDict}
    return {word: count / total_tokens for word, count in wordDict.items()}

## 定義function:計算IDF

In [52]:
def computeIDF(documentsDict):
    """
    Compute the inverse document frequency (IDF) of every word in the corpus.

    documentsDict : a list containing the wordDict (word -> count) of every
        document

    Returns a dict mapping each word found in any wordDict to
    ``log(N / df)``, where N is the number of documents and df is the number
    of documents in which the word's count is greater than zero. A word whose
    count is zero in every document gets an IDF of 0.0 (instead of raising
    ZeroDivisionError). Keys appear in first-seen order.
    """
    import math

    N = len(documentsDict)

    # Collect the vocabulary in first-seen order so the result's key order
    # does not depend on set iteration order.
    vocabulary = dict.fromkeys(
        word for wordDict in documentsDict for word in wordDict
    )

    idfDict = {}
    for word in vocabulary:
        # Document frequency: how many documents actually contain the word.
        docFreq = sum(1 for wordDict in documentsDict if wordDict.get(word, 0) > 0)
        # A word present in no document would divide by zero; use 0.0 so it
        # contributes nothing to TF-IDF.
        idfDict[word] = math.log(N / docFreq) if docFreq else 0.0
    return idfDict

## 定義function:計算TFIDF

In [54]:

def computeTFIDF(tf_item, idfs):
    """
    Combine term frequencies with inverse document frequencies.

    tf_item : dict mapping each word to its TF in one document
    idfs : dict mapping each word to its IDF; it must contain every word in
        ``tf_item`` (a missing word raises KeyError)

    Returns a dict mapping each word in ``tf_item`` to ``TF * IDF``.
    """
    return {term: freq * idfs[term] for term, freq in tf_item.items()}



In [70]:
# Term frequencies of each document.
tfA = computeTF(numOfWordsA, tokenize_A)
tfB = computeTF(numOfWordsB, tokenize_B)

# Inverse document frequencies over the two-document corpus.
idfs = computeIDF([numOfWordsA, numOfWordsB])


# TF-IDF score of every vocabulary word, per document.
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

In [45]:
# Display the TF-IDF scores for document A.
tfidfA

{'a': 0.09902102579427789,
 'out': 0.09902102579427789,
 'for': 0.09902102579427789,
 'man': 0.09902102579427789,
 'fire': 0.0,
 'sat': 0.0,
 'went': 0.09902102579427789,
 'the': 0.0,
 'children': 0.0,
 'around': 0.0,
 'walk': 0.09902102579427789}

In [46]:
# Display the TF-IDF scores for document B.
tfidfB

{'a': 0.0,
 'out': 0.0,
 'for': 0.0,
 'man': 0.0,
 'fire': 0.11552453009332421,
 'sat': 0.11552453009332421,
 'went': 0.0,
 'the': 0.0,
 'children': 0.11552453009332421,
 'around': 0.11552453009332421,
 'walk': 0.0}