In [1]:
# Toy corpus: four short, unsegmented Chinese sentences about who likes to eat
# which fruit. Word segmentation happens in a separate step.
text_list = [
    "我喜欢吃苹果和桃子尤其是桃子",
    "小甲喜欢吃苹果",
    "小乙喜爱吃西瓜",
    "小丁喜欢吃苹果西瓜",
]


In [3]:
# scikit-learn tokenizes raw strings on its own (by default it effectively
# splits on whitespace and drops single-character tokens and punctuation),
# whereas gensim only accepts documents that are already tokenized.
# Both inputs below are derived from the same jieba segmentation.
import jieba

ws = list(map(jieba.lcut, text_list))  # gensim input: one token list per sentence
ws_sk = list(map(' '.join, ws))  # scikit-learn input: tokens re-joined with spaces
print(ws)
print(ws_sk)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/data/fuwen/anaconda3/envs/bj1/lib/python3.7/site-packages/jieba/__init__.py", line 154, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmp9g8n85fv' -> '/tmp/jieba.cache'
Loading model cost 0.923 seconds.
Prefix dict has been built successfully.


[['我', '喜欢', '吃', '苹果', '和', '桃子', '尤其', '是', '桃子'], ['小甲', '喜欢', '吃', '苹果'], ['小乙', '喜爱', '吃', '西瓜'], ['小丁', '喜欢', '吃', '苹果', '西瓜']]
['我 喜欢 吃 苹果 和 桃子 尤其 是 桃子', '小甲 喜欢 吃 苹果', '小乙 喜爱 吃 西瓜', '小丁 喜欢 吃 苹果 西瓜']


In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# token_pattern controls how scikit-learn splits each document into tokens.
# The default, r"(?u)\b\w\w+\b", drops single-character tokens and punctuation.
# To keep exactly the tokens produced by jieba (including one-character words)
# the pattern is relaxed to r"(?u)\b\w+\b":
#   \w    a word character: letters, digits, underscore and, with Unicode
#         matching, CJK characters too. In ASCII-only mode it is equivalent to
#         [A-Za-z0-9_]; the negated class [^A-Za-z0-9_] is \W, not \w.
#   \w+   one or more word characters ([\w]+ means exactly the same)
#   \b    a word boundary (start or end of a word)
#   (?u)  inline spelling of the re.U (Unicode) flag
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
count = vectorizer.fit_transform(ws_sk)  # e.g. "我 喜欢 吃 苹果 和 桃子 尤其 是 桃子"

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; list()
# keeps the printed form identical for both code paths.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

print(feature_names)           # every token in the vocabulary, in column order
print(vectorizer.vocabulary_)  # token -> column index
print(count.toarray())         # dense document-term count matrix

# TfidfTransformer turns the raw counts from CountVectorizer into TF-IDF weights.
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(count)
tfidf_matrix.toarray()


['吃', '和', '喜欢', '喜爱', '小丁', '小乙', '小甲', '尤其', '我', '是', '桃子', '苹果', '西瓜']
{'我': 8, '喜欢': 2, '吃': 0, '苹果': 11, '和': 1, '桃子': 10, '尤其': 7, '是': 9, '小甲': 6, '小乙': 5, '喜爱': 3, '西瓜': 12, '小丁': 4}
[[1 1 1 0 0 0 0 1 1 1 2 1 0]
 [1 0 1 0 0 0 1 0 0 0 0 1 0]
 [1 0 0 1 0 1 0 0 0 0 0 0 1]
 [1 0 1 0 1 0 0 0 0 0 0 1 1]]


array([[0.17311114, 0.33173127, 0.21173977, 0.        , 0.        ,
        0.        , 0.        , 0.33173127, 0.33173127, 0.33173127,
        0.66346254, 0.21173977, 0.        ],
       [0.3612126 , 0.        , 0.44181486, 0.        , 0.        ,
        0.        , 0.69218835, 0.        , 0.        , 0.        ,
        0.        , 0.44181486, 0.        ],
       [0.30675807, 0.        , 0.        , 0.58783765, 0.        ,
        0.58783765, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.46345796],
       [0.31707032, 0.        , 0.38782252, 0.        , 0.60759891,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.38782252, 0.47903796]])

In [9]:

# scikit-learn, approach 2: let TfidfVectorizer go from the space-joined text
# straight to TF-IDF weights, using the same relaxed token pattern so that
# single-character tokens are kept.
tfidf_vec = TfidfVectorizer(
    token_pattern=r"(?u)\b\w+\b",
)
tfidf_matrix = tfidf_vec.fit_transform(ws_sk)

# tfidf_vec.vocabulary_ holds the token -> column-index mapping for this matrix.
tfidf_matrix.toarray()


array([[0.17311114, 0.33173127, 0.21173977, 0.        , 0.        ,
        0.        , 0.        , 0.33173127, 0.33173127, 0.33173127,
        0.66346254, 0.21173977, 0.        ],
       [0.3612126 , 0.        , 0.44181486, 0.        , 0.        ,
        0.        , 0.69218835, 0.        , 0.        , 0.        ,
        0.        , 0.44181486, 0.        ],
       [0.30675807, 0.        , 0.        , 0.58783765, 0.        ,
        0.58783765, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.46345796],
       [0.31707032, 0.        , 0.38782252, 0.        , 0.60759891,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.38782252, 0.47903796]])

In [17]:
from gensim import corpora, models, similarities

# Build the token <-> integer-id mapping from the jieba-segmented corpus.
dictionary = corpora.Dictionary(ws)
print(dictionary.token2id)

# Bag-of-words form: each document becomes a list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(doc) for doc in ws]
print(corpus)

# Fit TF-IDF on the bag-of-words corpus, then apply it to that same corpus.
tfidf_model = models.TfidfModel(corpus)
tfidf_matrix_gensim = tfidf_model[corpus]
print(tfidf_matrix_gensim[0])

# A new, unseen sentence, segmented the same way as the training corpus.
text_new = '小明喜欢吃苹果'
ws_new = jieba.lcut(text_new)
print(ws_new)

# doc2bow only keeps tokens that exist in the dictionary, so a token that was
# never seen during fitting ('小明' here) does not appear in vec_new.
vec_new = dictionary.doc2bow(ws_new)
print(vec_new)

# Similarity of the new sentence against every corpus document.
# Reuse the already transformed corpus instead of recomputing tfidf_model[corpus],
# and size the index with len(dictionary) rather than len(dictionary.keys()).
index = similarities.SparseMatrixSimilarity(tfidf_matrix_gensim, num_features=len(dictionary))
# Highest score is the second document, the last document comes next — a sensible ranking.
print(index[tfidf_model[vec_new]])

{'吃': 0, '和': 1, '喜欢': 2, '尤其': 3, '我': 4, '是': 5, '桃子': 6, '苹果': 7, '小甲': 8, '喜爱': 9, '小乙': 10, '西瓜': 11, '小丁': 12}
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1)], [(0, 1), (2, 1), (7, 1), (8, 1)], [(0, 1), (9, 1), (10, 1), (11, 1)], [(0, 1), (2, 1), (7, 1), (11, 1), (12, 1)]]
[(1, 0.35166544195065214), (2, 0.07297717280499404), (3, 0.35166544195065214), (4, 0.35166544195065214), (5, 0.35166544195065214), (6, 0.7033308839013043), (7, 0.07297717280499404)]
['小明', '喜欢', '吃', '苹果']
[(0, 1), (2, 1), (7, 1)]
[0.1032053  0.28159946 0.         0.2538916 ]


In [34]:
# Display scikit-learn's token -> column-index mapping. Its ordering differs
# from gensim's dictionary.token2id, so indices are not interchangeable.
tfidf_vec.vocabulary_

{'我': 8,
 '喜欢': 2,
 '吃': 0,
 '苹果': 11,
 '和': 1,
 '桃子': 10,
 '尤其': 7,
 '是': 9,
 '小甲': 6,
 '小乙': 5,
 '喜爱': 3,
 '西瓜': 12,
 '小丁': 4}

In [21]:
# Display gensim's token -> id mapping. Its ordering differs from
# scikit-learn's tfidf_vec.vocabulary_, so indices are not interchangeable.
dictionary.token2id

{'吃': 0,
 '和': 1,
 '喜欢': 2,
 '尤其': 3,
 '我': 4,
 '是': 5,
 '桃子': 6,
 '苹果': 7,
 '小甲': 8,
 '喜爱': 9,
 '小乙': 10,
 '西瓜': 11,
 '小丁': 12}

In [25]:
# Document 0 under both libraries. Positions are not directly comparable:
# scikit-learn prints a dense row indexed by tfidf_vec.vocabulary_, while gensim
# prints (token_id, weight) pairs indexed by dictionary.token2id, and the two
# mappings order the tokens differently.
sklearn_row0 = tfidf_matrix.toarray()[0]
print(list(sklearn_row0))
print(tfidf_matrix_gensim[0])

[0.1731111372459707, 0.3317312678886485, 0.21173977118307816, 0.0, 0.0, 0.0, 0.0, 0.3317312678886485, 0.3317312678886485, 0.3317312678886485, 0.663462535777297, 0.21173977118307816, 0.0]
[(1, 0.35166544195065214), (2, 0.07297717280499404), (3, 0.35166544195065214), (4, 0.35166544195065214), (5, 0.35166544195065214), (6, 0.7033308839013043), (7, 0.07297717280499404)]


In [26]:
# Document 1 under both libraries. As with any row, match values by token via
# tfidf_vec.vocabulary_ (scikit-learn) and dictionary.token2id (gensim) rather
# than by position, because the two index mappings differ.
sklearn_row1 = tfidf_matrix.toarray()[1]
print(list(sklearn_row1))
print(tfidf_matrix_gensim[1])

[0.36121259819515966, 0.0, 0.4418148601358603, 0.0, 0.0, 0.0, 0.6921883541575676, 0.0, 0.0, 0.0, 0.0, 0.4418148601358603, 0.0]
[(2, 0.19912088989639576), (7, 0.19912088989639576), (8, 0.9595320434533362)]
