## 作業目標：搭建一個TFIDF 模型

---

#### Reference: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

In [1]:
import nltk
# Toy corpus: four short English sentences, one per document (A-D).
documentA, documentB, documentC, documentD = (
    'the man went out for a walk',
    'the children sat around the fire',
    'children like ice cream',
    'parents need to take care of their children',
)

## 首先我們做tokenize，並取出所有文件中的單詞

In [2]:

# Tokenize each document into a list of word tokens.
tokenize_A, tokenize_B, tokenize_C, tokenize_D = (
    nltk.word_tokenize(text)
    for text in (documentA, documentB, documentC, documentD)
)

# Vocabulary: every distinct token that appears in any of the documents.
uniqueWords = set(tokenize_A) | set(tokenize_B) | set(tokenize_C) | set(tokenize_D)

for label, tokens in (
    ('tokenize_A', tokenize_A),
    ('tokenize_B', tokenize_B),
    ('tokenize_C', tokenize_C),
    ('tokenize_D', tokenize_D),
):
    print(label, tokens)
print('uniqueWords', uniqueWords)

tokenize_A ['the', 'man', 'went', 'out', 'for', 'a', 'walk']
tokenize_B ['the', 'children', 'sat', 'around', 'the', 'fire']
tokenize_C ['children', 'like', 'ice', 'cream']
tokenize_D ['parents', 'need', 'to', 'take', 'care', 'of', 'their', 'children']
uniqueWords {'around', 'went', 'out', 'sat', 'care', 'the', 'their', 'ice', 'cream', 'like', 'a', 'fire', 'walk', 'of', 'need', 'to', 'parents', 'take', 'children', 'for', 'man'}


## 計算每個文件中，所有uniqueWords出現的次數

In [3]:

def _tally(label, tokens):
    """Count how often each vocabulary word occurs in one tokenized document.

    Starts from a zero count for every word in uniqueWords, prints that
    zero-initialized dict under `label`, then adds one per token occurrence.
    """
    counts = dict.fromkeys(uniqueWords, 0)
    print(label, counts)
    for token in tokens:
        counts[token] += 1
    return counts


# One count dict per document; each one is keyed by the full vocabulary.
numOfWordsA = _tally('numOfWordsA', tokenize_A)
numOfWordsB = _tally('numOfWordsB', tokenize_B)
numOfWordsC = _tally('numOfWordsC', tokenize_C)
numOfWordsD = _tally('numOfWordsD', tokenize_D)

numOfWordsA {'around': 0, 'went': 0, 'out': 0, 'sat': 0, 'care': 0, 'the': 0, 'their': 0, 'ice': 0, 'cream': 0, 'like': 0, 'a': 0, 'fire': 0, 'walk': 0, 'of': 0, 'need': 0, 'to': 0, 'parents': 0, 'take': 0, 'children': 0, 'for': 0, 'man': 0}
numOfWordsB {'around': 0, 'went': 0, 'out': 0, 'sat': 0, 'care': 0, 'the': 0, 'their': 0, 'ice': 0, 'cream': 0, 'like': 0, 'a': 0, 'fire': 0, 'walk': 0, 'of': 0, 'need': 0, 'to': 0, 'parents': 0, 'take': 0, 'children': 0, 'for': 0, 'man': 0}
numOfWordsC {'around': 0, 'went': 0, 'out': 0, 'sat': 0, 'care': 0, 'the': 0, 'their': 0, 'ice': 0, 'cream': 0, 'like': 0, 'a': 0, 'fire': 0, 'walk': 0, 'of': 0, 'need': 0, 'to': 0, 'parents': 0, 'take': 0, 'children': 0, 'for': 0, 'man': 0}
numOfWordsD {'around': 0, 'went': 0, 'out': 0, 'sat': 0, 'care': 0, 'the': 0, 'their': 0, 'ice': 0, 'cream': 0, 'like': 0, 'a': 0, 'fire': 0, 'walk': 0, 'of': 0, 'need': 0, 'to': 0, 'parents': 0, 'take': 0, 'children': 0, 'for': 0, 'man': 0}


## 定義function:計算TF

In [4]:
def computeTF(wordDict, tokenize_item):
    """
    Compute the term frequency (TF) of every word in wordDict for one document.

    wordDict : dict mapping each word to its number of occurrences in the document
    tokenize_item : list of tokens produced by tokenizing the document

    Returns a dict mapping each word in wordDict to
    count / len(tokenize_item). For an empty token list every TF is 0.0
    rather than raising ZeroDivisionError.
    """
    print('\n wordDict', wordDict)
    print('\n tokenize_item', tokenize_item)

    bagOfWordsCount = len(tokenize_item)  # total number of tokens in the document
    if bagOfWordsCount == 0:
        # An empty document contains no words, so every frequency is zero.
        return {word: 0.0 for word in wordDict}

    # occurrences of the word in this document / total tokens in this document
    return {word: count / bagOfWordsCount for word, count in wordDict.items()}

## 定義function:計算IDF

In [5]:
def computeIDF(documentsDict):
    """
    Compute the inverse document frequency (IDF) of every word.

    documentsDict : list containing the word-count dict (wordDict) of every document

    Returns a dict mapping each word that appears as a key in any document to
    log(N / df), where N is the number of documents and df is the number of
    documents in which the word's count is greater than zero. Words that occur
    in no document keep the value 0. An empty list returns an empty dict.
    """
    import math  # provides log

    # Number of documents (numerator of the IDF ratio).
    N = len(documentsDict)
    if N == 0:
        # No documents: there is no vocabulary to score.
        return {}

    # Initialize the document-frequency counter over the keys of ALL documents,
    # so a word missing from the first document's dict does not raise KeyError.
    idfDict = {}
    for document in documentsDict:
        for word in document:
            idfDict.setdefault(word, 0)
    print('initialize idfDict', idfDict)

    for document in documentsDict:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1  # count in how many documents the word appears

    for word, val in idfDict.items():
        if val == 0:
            # Word never occurs; leave it at 0 to avoid dividing by zero.
            continue
        idfDict[word] = math.log(N / val)  # log(number of documents / documents containing the word)
    return idfDict

## 定義function:計算TFIDF

In [6]:
def computeTFIDF(tf_item, idfs):
    """
    Combine one document's TF values with the corpus IDF values.

    tf_item : dict mapping each word to its term frequency in the document
    idfs : dict mapping each word to its inverse document frequency

    Returns a dict mapping each word in tf_item to TF * IDF.
    """
    return {term: freq * idfs[term] for term, freq in tf_item.items()}



In [7]:

# Compute TF for each document.
tfA = computeTF(numOfWordsA, tokenize_A)
print('tfA {}\n'.format(tfA))
tfB = computeTF(numOfWordsB, tokenize_B)
print('tfB {}\n'.format(tfB))
tfC = computeTF(numOfWordsC, tokenize_C)
print('tfC {}\n'.format(tfC))  # fixed: previously printed tfB under the tfC label
tfD = computeTF(numOfWordsD, tokenize_D)
print('tfD {}\n'.format(tfD))

# Compute IDF once for the whole corpus; the same IDF values apply to every document.
idfs = computeIDF([numOfWordsA, numOfWordsB, numOfWordsC, numOfWordsD])
print('idfs {}\n'.format(idfs))

# Explain how to read TF-IDF values.
print("""
說明：當一個單詞的 TF * IDF 值越大時，代表這個單詞對整段文章的重要性也越大，我們可以歸納出：

不同單詞在同一個文章中獲得的 TFIDF 值可能不相同，值的高低代表了單詞對整段文章的重要性。
同一個單詞在不同文章所得到的 TFIDF 值也可能不同。
""")
# Compute TF-IDF for document A
tfidfA = computeTFIDF(tfA, idfs)
print('document A: {}'.format(documentA))
print('tfidfA {}\n'.format(tfidfA))

# Compute TF-IDF for document B
tfidfB = computeTFIDF(tfB, idfs)
print('document B: {}'.format(documentB))
print('tfidfB {}\n'.format(tfidfB))

# Compute TF-IDF for document C
tfidfC = computeTFIDF(tfC, idfs)
print('document C: {}'.format(documentC))
print('tfidfC {}\n'.format(tfidfC))

# Compute TF-IDF for document D
tfidfD = computeTFIDF(tfD, idfs)
print('document D: {}'.format(documentD))
print('tfidfD {}\n'.format(tfidfD))


 wordDict {'around': 0, 'went': 1, 'out': 1, 'sat': 0, 'care': 0, 'the': 1, 'their': 0, 'ice': 0, 'cream': 0, 'like': 0, 'a': 1, 'fire': 0, 'walk': 1, 'of': 0, 'need': 0, 'to': 0, 'parents': 0, 'take': 0, 'children': 0, 'for': 1, 'man': 1}

 tokenize_item ['the', 'man', 'went', 'out', 'for', 'a', 'walk']
tfA {'around': 0.0, 'went': 0.14285714285714285, 'out': 0.14285714285714285, 'sat': 0.0, 'care': 0.0, 'the': 0.14285714285714285, 'their': 0.0, 'ice': 0.0, 'cream': 0.0, 'like': 0.0, 'a': 0.14285714285714285, 'fire': 0.0, 'walk': 0.14285714285714285, 'of': 0.0, 'need': 0.0, 'to': 0.0, 'parents': 0.0, 'take': 0.0, 'children': 0.0, 'for': 0.14285714285714285, 'man': 0.14285714285714285}


 wordDict {'around': 1, 'went': 0, 'out': 0, 'sat': 1, 'care': 0, 'the': 2, 'their': 0, 'ice': 0, 'cream': 0, 'like': 0, 'a': 0, 'fire': 1, 'walk': 0, 'of': 0, 'need': 0, 'to': 0, 'parents': 0, 'take': 0, 'children': 1, 'for': 0, 'man': 0}

 tokenize_item ['the', 'children', 'sat', 'around', 'the', 'fi

In [8]:
import pandas as pd

In [9]:
# Collect the per-document TF-IDF dicts in document order (A, B, C, D).
tfidf_all = [tfidfA, tfidfB, tfidfC, tfidfD]

# Build a table from the list of dicts: one row per document, one column per word key.
df = pd.DataFrame(tfidf_all)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,around,went,out,sat,care,the,their,ice,cream,like,...,fire,walk,of,need,to,parents,take,children,for,man
0,0.0,0.198042,0.198042,0.0,0.0,0.099021,0.0,0.0,0.0,0.0,...,0.0,0.198042,0.0,0.0,0.0,0.0,0.0,0.0,0.198042,0.198042
1,0.231049,0.0,0.0,0.231049,0.0,0.231049,0.0,0.0,0.0,0.0,...,0.231049,0.0,0.0,0.0,0.0,0.0,0.0,0.047947,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346574,0.346574,0.346574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071921,0.0,0.0
3,0.0,0.0,0.0,0.0,0.173287,0.0,0.173287,0.0,0.0,0.0,...,0.0,0.0,0.173287,0.173287,0.173287,0.173287,0.173287,0.03596,0.0,0.0
