* Reference: https://radimrehurek.com/gensim/models/fasttext.html

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three training sentences and one held-out test sentence.
X_train = ['This is the first document, we can put another document here.', 'That is the second document.','No comment']
X_test = ['This is the third document.']

# max_df=0.5 -> ignore terms that appear in more than 50% of the documents.
# min_df=5   -> ignore terms that appear in fewer than 5 documents
#               (an integer is a document count, not a raw occurrence count).
# min_df=0.1 -> ignore terms that appear in fewer than 10% of the documents.
# stop_words -> a list of words to drop, for example:
# vectorizer = TfidfVectorizer(stop_words=['this'], min_df=0.4)
vectorizer = TfidfVectorizer()

# Learn the vocabulary and the document-frequency statistics from X_train.
vectorizer.fit(X_train)

# The feature names (the terms being analysed) only exist after fitting.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
# .tolist() turns the returned ndarray into a plain list of str so the
# printed output matches what get_feature_names() used to produce.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()
print('words \n', words)

# Attribute vocabulary_ => a mapping of terms to feature indices.
print('\n 單詞對應index: \n{}'.format(vectorizer.vocabulary_))

# .transform => turn documents into a document-term matrix of tf-idf weights.
# The test sentence is transformed with the vocabulary fitted on X_train,
# so terms never seen during fit (e.g. 'third') get no column.
tfidf_train = vectorizer.transform(X_train)
tfidf_test = vectorizer.transform(X_test)

# The result is a sparse matrix; printing it lists (row, column) -> weight.
print('\n type(tfidf_train)', type(tfidf_train))
print('\n tfidf_train', tfidf_train)

# Dense view: one tf-idf vector per training sentence.
print('\n tfidf_train.toarray()', tfidf_train.toarray())

# Dense view: one tf-idf vector per test sentence.
print('\n tfidf_test.toarray()', tfidf_test.toarray())





words 
 ['another', 'can', 'comment', 'document', 'first', 'here', 'is', 'no', 'put', 'second', 'that', 'the', 'this', 'we']

 單詞對應index: 
{'this': 12, 'is': 6, 'the': 11, 'first': 4, 'document': 3, 'we': 13, 'can': 1, 'put': 8, 'another': 0, 'here': 5, 'that': 10, 'second': 9, 'no': 7, 'comment': 2}

 type(tfidf_train) <class 'scipy.sparse.csr.csr_matrix'>

 tfidf_train   (0, 13)	0.3090426585183487
  (0, 12)	0.3090426585183487
  (0, 11)	0.23503490751779094
  (0, 8)	0.3090426585183487
  (0, 6)	0.23503490751779094
  (0, 5)	0.3090426585183487
  (0, 4)	0.3090426585183487
  (0, 3)	0.4700698150355819
  (0, 1)	0.3090426585183487
  (0, 0)	0.3090426585183487
  (1, 11)	0.39351120409397233
  (1, 10)	0.5174199439321682
  (1, 9)	0.5174199439321682
  (1, 6)	0.39351120409397233
  (1, 3)	0.39351120409397233
  (2, 7)	0.7071067811865476
  (2, 2)	0.7071067811865476

 tfidf_train.toarray() [[0.30904266 0.30904266 0.         0.47006982 0.30904266 0.30904266
  0.23503491 0.         0.30904266 0.         0.

In [24]:
# Echo the vectorizer object so the notebook displays its repr.
vectorizer

TfidfVectorizer()

---

#### 我們也可以將TfidfVectorizer拆解成兩個步驟：CountVectorizer, TfidfTransformer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [28]:
# Corpus: four short example sentences used by the cells that reference `corpus`.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]


#### CountVectorizer是通過fit_transform函數將文本中的詞語轉換為詞頻矩陣
* get_feature_names_out()（scikit-learn 1.0 之前的版本為 get_feature_names()，該方法已於 1.2 移除）可看到所有文本的關鍵字
* vocabulary_可看到所有文本的關鍵字和其位置
* toarray()可看到詞頻矩陣的結果

In [30]:
# CountVectorizer tokenizes each document and counts how often each term occurs.
vectorizer = CountVectorizer()
# fit_transform learns the vocabulary from `corpus` and returns the term-count matrix.
count = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
# .tolist() keeps the printed value a plain list of str, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print('\n get_feature_names', feature_names)
# vocabulary_ maps each term to its column index in the count matrix.
print('\n vocabulary_', vectorizer.vocabulary_)
# Dense view of the counts: one row per document, one column per term.
print('\n toarray', count.toarray())




 get_feature_names ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

 vocabulary_ {'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}

 toarray [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


#### TfidfTransformer是統計CountVectorizer中每個詞語的tf-idf權值

In [32]:
# Fit the tf-idf weighting on the raw term counts produced by CountVectorizer;
# fit() returns the transformer itself, so it can be bound in one step.
transformer = TfidfTransformer().fit(count)
# Re-weight the same count matrix into tf-idf scores.
tfidf_matrix = transformer.transform(count)

# Sparse listing first: (row, column) -> weight.
print('tfidf_matrix', tfidf_matrix)
# Then the dense array: one row per document, one column per term.
print(tfidf_matrix.toarray())

tfidf_matrix   (0, 8)	0.4387767428592343
  (0, 6)	0.35872873824808993
  (0, 3)	0.4387767428592343
  (0, 2)	0.5419765697264572
  (0, 1)	0.4387767428592343
  (1, 8)	0.2723014675233404
  (1, 6)	0.22262429232510395
  (1, 5)	0.8532257361452786
  (1, 3)	0.2723014675233404
  (1, 1)	0.2723014675233404
  (2, 7)	0.5528053199908667
  (2, 6)	0.2884767487500274
  (2, 4)	0.5528053199908667
  (2, 0)	0.5528053199908667
  (3, 8)	0.4387767428592343
  (3, 6)	0.35872873824808993
  (3, 3)	0.4387767428592343
  (3, 2)	0.5419765697264572
  (3, 1)	0.4387767428592343
[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]
