# 参考
https://www.runoob.com/python/python-reg-expressions.html

https://www.liaoxuefeng.com/wiki/1016959663602400/1017639890281664

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import pandas as pd
import numpy as np

语料为含有n个句子的list，每个句子以空格分词。

In [4]:
# Toy corpus: a list of four short English sentences, one string per sentence.
corpus = [
    "This is the first document.",
    "This is the second second document.",
    "And the third one.",
    "Is this the first document?",
]


使用CountVectorizer把文本中的词语转化为词频矩阵：先根据语料建立词表，再统计每个句子中各个词出现的次数（这是词袋计数表示，而不是one-hot向量，例如第二句中second的计数为2）。可以使用get_feature_names_out()（scikit-learn 1.0之前的版本为get_feature_names()）获取特征名字（即词表）。

In [5]:
# Turn the sentences in the corpus into a term-count matrix.
vectorizer = CountVectorizer()
# Learn the vocabulary and count how often each term occurs in each sentence;
# X has one row per sentence and one column per vocabulary term.
X = vectorizer.fit_transform(corpus)
# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
# It returns a NumPy array, so convert to a list to keep the printed form.
word = vectorizer.get_feature_names_out().tolist()
print(word)


['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [7]:
# Show the term counts as a dense array (rows: sentences, columns: vocabulary terms).
term_counts = X.toarray()
print(term_counts)

[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


使用TF-IDF进行计算，将矩阵X输入到transformer中。

In [10]:
# Fit the TF-IDF weighting on the term-count matrix X, then apply it to X.
transformer = TfidfTransformer()
transformer.fit(X)
tfidf_matrix = transformer.transform(X)

tfidf_matrix中每一行表示一个句子的向量：第i个位置的值为0，表示词表中第i个词没有在该句子中出现；非零值则是该词在该句子中的TF-IDF权重（TfidfTransformer默认会对每一行做L2归一化）。对每一行的权重从大到小排序后，取排在最前面的位置所对应的词，即为该句子的关键词。

In [12]:
# Display the TF-IDF matrix as a dense array.
tfidf_matrix.toarray()

array([[0.        , 0.43877674, 0.54197657, 0.43877674, 0.        ,
        0.        , 0.35872874, 0.        , 0.43877674],
       [0.        , 0.27230147, 0.        , 0.27230147, 0.        ,
        0.85322574, 0.22262429, 0.        , 0.27230147],
       [0.55280532, 0.        , 0.        , 0.        , 0.55280532,
        0.        , 0.28847675, 0.55280532, 0.        ],
       [0.        , 0.43877674, 0.54197657, 0.43877674, 0.        ,
        0.        , 0.35872874, 0.        , 0.43877674]])

将tfidf的矩阵放入pandas的DataFrame中：

In [35]:
# Convert the TF-IDF matrix to a dense array and wrap it in a DataFrame;
# entry [i][j] is the TF-IDF weight of vocabulary term j in sentence i.
tf_idf_dataframe = pd.DataFrame(data=tfidf_matrix.toarray())

In [36]:
# Display the frame; at this point the columns still carry default integer labels.
tf_idf_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.438777,0.541977,0.438777,0.0,0.0,0.358729,0.0,0.438777
1,0.0,0.272301,0.0,0.272301,0.0,0.853226,0.222624,0.0,0.272301
2,0.552805,0.0,0.0,0.0,0.552805,0.0,0.288477,0.552805,0.0
3,0.0,0.438777,0.541977,0.438777,0.0,0.0,0.358729,0.0,0.438777


把tf_idf_dataframe的columns设置为CountVectorizer得到的词表，即可直观地看到每个句子中各个词的TF-IDF权重：

In [37]:
# Label the DataFrame columns with the vocabulary terms (as a list, in column order).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
tf_idf_dataframe.columns = vectorizer.get_feature_names_out().tolist()

In [38]:
# Display the frame again, now with vocabulary terms as column labels.
tf_idf_dataframe

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.438777,0.541977,0.438777,0.0,0.0,0.358729,0.0,0.438777
1,0.0,0.272301,0.0,0.272301,0.0,0.853226,0.222624,0.0,0.272301
2,0.552805,0.0,0.0,0.0,0.552805,0.0,0.288477,0.552805,0.0
3,0.0,0.438777,0.541977,0.438777,0.0,0.0,0.358729,0.0,0.438777


In [39]:
# Rank the column indices of every row by ascending TF-IDF weight (axis=1),
# then keep the last two positions: the indices of the two highest-weighted
# terms of each sentence, with the highest-weighted term last.
ascending_rank = np.argsort(tfidf_matrix.toarray(), axis=1)
tf_idf_sorted = ascending_rank[:, -2:]


In [40]:
# Map the selected column indices back to their terms. The labels are indexed
# as a NumPy array because pandas 2.0 no longer allows indexing an Index with
# a 2-D array; NumPy fancy indexing returns an object array shaped like
# tf_idf_sorted (one row per sentence, two terms per row).
tf_idf_dataframe.columns.to_numpy()[tf_idf_sorted]

array([['this', 'first'],
       ['this', 'second'],
       ['one', 'third'],
       ['this', 'first']], dtype=object)