/
cluster_demo.py
35 lines (31 loc) · 1.32 KB
/
cluster_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
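@description: Demo script that trains and applies pytextclassifier's TextCluster, first on a toy English corpus and then on the thucnews_train_1w.txt data file.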
"""
import sys
sys.path.append('..')
from pytextclassifier.textcluster import TextCluster
def _run_toy_demo():
    """Fit a two-cluster TextCluster on five English headlines and print predictions.

    Prints the model object itself, trains on the in-memory headlines,
    reloads the model from ``models/cluster-toy`` and prints whatever
    ``predict`` returns for two query sentences.
    """
    toy_model = TextCluster(n_clusters=2, output_dir='models/cluster-toy')
    print(toy_model)

    headlines = [
        'Student debt to cost Britain billions within decades',
        'Chinese education for TV experiment',
        'Abbott government spends $8 million on higher education',
        'Middle East and Asia boost investment in top level sports',
        'Summit Series look launches HBO Canada sports doc series: Mudhar',
    ]
    toy_model.train(headlines)
    # NOTE(review): reloading right after training presumably exercises the
    # save/load round trip -- confirm against TextCluster.load_model.
    toy_model.load_model()

    queries = [
        'Abbott government spends $8 million on higher education media blitz',
        'Middle East and Asia boost investment in top level sports',
    ]
    print(toy_model.predict(queries))


def _run_thucnews_demo():
    """Cluster Chinese training data loaded from the "1w" THUCNews file.

    Builds a ten-cluster TextCluster with a unigram+bigram TF-IDF feature
    extractor, trains on the first 5000 loaded rows, passes the training
    result to ``show_clusters`` together with a PNG path, and prints the
    predictions for the first 30 rows.
    """
    # Imported lazily so scikit-learn is only pulled in once the toy demo is done.
    from sklearn.feature_extraction.text import TfidfVectorizer

    bigram_tfidf = TfidfVectorizer(ngram_range=(1, 2))
    news_model = TextCluster(
        output_dir='models/cluster',
        feature=bigram_tfidf,
        n_clusters=10,
    )
    # Column 1 of the tab-separated file is used as the text
    # (presumably column 0 holds the label -- TODO confirm).
    corpus = news_model.load_file_data('thucnews_train_1w.txt', sep='\t', use_col=1)

    train_feature, train_labels = news_model.train(corpus[:5000])
    news_model.show_clusters(
        train_feature,
        train_labels,
        'models/cluster/cluster_train_seg_samples.png',
    )

    print(news_model.predict(corpus[:30]))


if __name__ == '__main__':
    _run_toy_demo()
    _run_thucnews_demo()