# 参考
https://www.runoob.com/python/python-reg-expressions.html

https://www.liaoxuefeng.com/wiki/1016959663602400/1017639890281664

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import pandas as pd
import numpy as np

语料为含有n个句子的list，每个句子以空格分词。

In [3]:
# 语料
corpus = [
  "帮我 查下 明天 北京 天气 怎么样",
  "帮我 查下 今天 北京 天气 好不好",
  "帮我 查询 去 北京 的 火车",
  "帮我 查看 到 上海 的 火车",
  "帮我 查看 特朗普 的 新闻",
  "帮我 看看 有没有 北京 的 新闻",
  "帮我 搜索 上海 有 什么 好玩的",
  "帮我 找找 上海 东方明珠 在哪"
]


使用CountVectorizer把文本中的词语转化为词频矩阵，即先统计词数，然后使用one-hot向量。可以使用get_feature_names()获取特征名字（即词表）。

In [4]:
# 将文本中的词语转换为词频矩阵
vectorizer = CountVectorizer()
# 计算个词语出现的次数
X = vectorizer.fit_transform(corpus)
# 获取词袋中所有文本关键词
word = vectorizer.get_feature_names()
print(word)


['上海', '东方明珠', '什么', '今天', '北京', '在哪', '天气', '好不好', '好玩的', '帮我', '怎么样', '找找', '搜索', '新闻', '明天', '有没有', '查下', '查看', '查询', '火车', '特朗普', '看看']


In [5]:
# 查看词频结果
print(X.toarray())

[[0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0]
 [0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0]
 [1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0]
 [0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1]
 [1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0]]


使用TF-IDF进行计算，将矩阵X输入到transformer中。

In [6]:
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(X)

tfidf_matrix中每一行表示一个句子的向量，第i个位置没有数值，则表示该词没有出现过，有值表示该词在该文章中的TF*IDF的值，进行从大到小排序后，提取出该位置的词即为所需要的关键词。

In [7]:
tfidf_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.3183848 ,
        0.        , 0.42081614, 0.        , 0.        , 0.20052115,
        0.50212047, 0.        , 0.        , 0.        , 0.50212047,
        0.        , 0.42081614, 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.50212047, 0.3183848 ,
        0.        , 0.42081614, 0.50212047, 0.        , 0.20052115,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.42081614, 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.42141948,
        0.        , 0.        , 0.        , 0.        , 0.26541316,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.6646151 , 0.55699932,
        0.        , 0.        ],
       [0.50057382, 0.        , 0.        , 0.        , 0.        ,
        0.       

将tfidf的矩阵放入pandas的DataFrame中：

In [8]:
#.toarray()方法将tf-idf矩阵抽取出来并生成一个数据框，元素a[i][j]表示j词在i类文本中的tf-idf权重
tf_idf_dataframe = pd.DataFrame(tfidf_matrix.toarray())

In [9]:
tf_idf_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.0,0.0,0.0,0.0,0.318385,0.0,0.420816,0.0,0.0,0.200521,...,0.0,0.0,0.50212,0.0,0.420816,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.50212,0.318385,0.0,0.420816,0.50212,0.0,0.200521,...,0.0,0.0,0.0,0.0,0.420816,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.421419,0.0,0.0,0.0,0.0,0.265413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.664615,0.556999,0.0,0.0
3,0.500574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276418,...,0.0,0.0,0.0,0.0,0.0,0.580094,0.0,0.580094,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.249387,...,0.0,0.523367,0.0,0.0,0.0,0.523367,0.0,0.0,0.624484,0.0
5,0.0,0.0,0.0,0.0,0.350974,0.0,0.0,0.0,0.0,0.221046,...,0.0,0.46389,0.0,0.553517,0.0,0.0,0.0,0.0,0.0,0.553517
6,0.376863,0.0,0.52111,0.0,0.0,0.0,0.0,0.0,0.52111,0.208105,...,0.52111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.376863,0.52111,0.0,0.0,0.0,0.52111,0.0,0.0,0.0,0.208105,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


把tf_idf_dataframe的columns设置为one-hot的词表，即可以展示出每个句子的情况：

In [10]:
#获取词袋模型中的所有词语（格式为list) ,作为数据框的columns
tf_idf_dataframe.columns = vectorizer.get_feature_names()

In [11]:
tf_idf_dataframe

Unnamed: 0,上海,东方明珠,什么,今天,北京,在哪,天气,好不好,好玩的,帮我,...,搜索,新闻,明天,有没有,查下,查看,查询,火车,特朗普,看看
0,0.0,0.0,0.0,0.0,0.318385,0.0,0.420816,0.0,0.0,0.200521,...,0.0,0.0,0.50212,0.0,0.420816,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.50212,0.318385,0.0,0.420816,0.50212,0.0,0.200521,...,0.0,0.0,0.0,0.0,0.420816,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.421419,0.0,0.0,0.0,0.0,0.265413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.664615,0.556999,0.0,0.0
3,0.500574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276418,...,0.0,0.0,0.0,0.0,0.0,0.580094,0.0,0.580094,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.249387,...,0.0,0.523367,0.0,0.0,0.0,0.523367,0.0,0.0,0.624484,0.0
5,0.0,0.0,0.0,0.0,0.350974,0.0,0.0,0.0,0.0,0.221046,...,0.0,0.46389,0.0,0.553517,0.0,0.0,0.0,0.0,0.0,0.553517
6,0.376863,0.0,0.52111,0.0,0.0,0.0,0.0,0.0,0.52111,0.208105,...,0.52111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.376863,0.52111,0.0,0.0,0.0,0.52111,0.0,0.0,0.0,0.208105,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# numpy.argsort()方法，设置axis=1对每行进行排序，然后取前2位
tf_idf_sorted = np.argsort(tfidf_matrix.toarray(), axis=1)[:, -2:]


In [13]:
#提取前两位对应的索引的分词
tf_idf_dataframe.columns[tf_idf_sorted].values

array([['明天', '怎么样'],
       ['好不好', '今天'],
       ['火车', '查询'],
       ['查看', '火车'],
       ['查看', '特朗普'],
       ['有没有', '看看'],
       ['搜索', '什么'],
       ['在哪', '东方明珠']], dtype=object)